gallivm: fix somewhat broken NaN behavior for exp2
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34  * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86  * No checks for the special case values of a or b being 1 or 0 are done.
87  * NaNs are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if(intrinsic) {
198       /* We need to handle NaNs for floating point numbers. If one of the
199        * inputs is NaN the other should be returned (required by both D3D10+
200        * and OpenCL).
201        * The sse intrinsics return the second operand in case of NaN by
202        * default, so we need special code to handle those.
203 */
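      /*
       * Note: minps/maxps compute "a < b ? a : b" / "a > b ? a : b", so if
       * either operand is a NaN the compare is false and the second operand
       * is returned: min(NaN, x) = x but min(x, NaN) = NaN. The
       * nan_behavior-specific selects below compensate for this.
       */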
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, max;
209 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, max);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, max);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
264
265 /**
266 * Generate max(a, b)
267  * No checks for the special case values of a or b being 1 or 0 are done.
268  * NaNs are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323       if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if(intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, min;
385 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, min);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, min);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
476
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if(intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 /* TODO: handle signed case */
526 if(type.norm && !type.floating && !type.fixed && !type.sign)
527 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
528
529 if(LLVMIsConstant(a) && LLVMIsConstant(b))
530 if (type.floating)
531 res = LLVMConstFAdd(a, b);
532 else
533 res = LLVMConstAdd(a, b);
534 else
535 if (type.floating)
536 res = LLVMBuildFAdd(builder, a, b, "");
537 else
538 res = LLVMBuildAdd(builder, a, b, "");
539
540 /* clamp to ceiling of 1.0 */
541 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
542 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
543
544 /* XXX clamp to floor of -1 or 0??? */
545
546 return res;
547 }
548
549
550 /** Return the scalar sum of the elements of a.
551 * Should avoid this operation whenever possible.
552 */
553 LLVMValueRef
554 lp_build_horizontal_add(struct lp_build_context *bld,
555 LLVMValueRef a)
556 {
557 LLVMBuilderRef builder = bld->gallivm->builder;
558 const struct lp_type type = bld->type;
559 LLVMValueRef index, res;
560 unsigned i, length;
561 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
562 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
563 LLVMValueRef vecres, elem2;
564
565 assert(lp_check_value(type, a));
566
567 if (type.length == 1) {
568 return a;
569 }
570
571 assert(!bld->type.norm);
572
573 /*
574     * for byte vectors we could do much better with psadbw.
575 * Using repeated shuffle/adds here. Note with multiple vectors
576 * this can be done more efficiently as outlined in the intel
577 * optimization manual.
578 * Note: could cause data rearrangement if used with smaller element
579 * sizes.
580 */
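   /*
    * Sketch of the reduction below for a length-4 vector {a0,a1,a2,a3}:
    *   vec1 = {a0,a1}, vec2 = {a2,a3}  ->  vecres = {a0+a2, a1+a3}
    * after which the two remaining elements are extracted and added.
    */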
581
582 vecres = a;
583 length = type.length / 2;
584 while (length > 1) {
585 LLVMValueRef vec1, vec2;
586 for (i = 0; i < length; i++) {
587 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
588 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
589 }
590 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
591 LLVMConstVector(shuffles1, length), "");
592 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
593 LLVMConstVector(shuffles2, length), "");
594 if (type.floating) {
595 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
596 }
597 else {
598 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
599 }
600 length = length >> 1;
601 }
602
603 /* always have vector of size 2 here */
604 assert(length == 1);
605
606 index = lp_build_const_int32(bld->gallivm, 0);
607 res = LLVMBuildExtractElement(builder, vecres, index, "");
608 index = lp_build_const_int32(bld->gallivm, 1);
609 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
610
611 if (type.floating)
612 res = LLVMBuildFAdd(builder, res, elem2, "");
613 else
614 res = LLVMBuildAdd(builder, res, elem2, "");
615
616 return res;
617 }
618
619 /**
620 * Return the horizontal sums of 4 float vectors as a float4 vector.
621 * This uses the technique as outlined in Intel Optimization Manual.
622 */
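/*
 * Data flow of the shuffles/adds below, for inputs a, b, c, d (each a float4):
 *   tmp[0] = {a0,a1,b0,b1}   tmp[1] = {a2,a3,b2,b3}
 *   tmp[2] = {c0,c1,d0,d1}   tmp[3] = {c2,c3,d2,d3}
 *   sumtmp[0] = {a0+a2, a1+a3, b0+b2, b1+b3}
 *   sumtmp[1] = {c0+c2, c1+c3, d0+d2, d1+d3}
 *   result    = {sum(a), sum(b), sum(c), sum(d)}
 */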
623 static LLVMValueRef
624 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
625 LLVMValueRef src[4])
626 {
627 struct gallivm_state *gallivm = bld->gallivm;
628 LLVMBuilderRef builder = gallivm->builder;
629 LLVMValueRef shuffles[4];
630 LLVMValueRef tmp[4];
631 LLVMValueRef sumtmp[2], shuftmp[2];
632
633 /* lower half of regs */
634 shuffles[0] = lp_build_const_int32(gallivm, 0);
635 shuffles[1] = lp_build_const_int32(gallivm, 1);
636 shuffles[2] = lp_build_const_int32(gallivm, 4);
637 shuffles[3] = lp_build_const_int32(gallivm, 5);
638 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
639 LLVMConstVector(shuffles, 4), "");
640 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
641 LLVMConstVector(shuffles, 4), "");
642
643 /* upper half of regs */
644 shuffles[0] = lp_build_const_int32(gallivm, 2);
645 shuffles[1] = lp_build_const_int32(gallivm, 3);
646 shuffles[2] = lp_build_const_int32(gallivm, 6);
647 shuffles[3] = lp_build_const_int32(gallivm, 7);
648 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
649 LLVMConstVector(shuffles, 4), "");
650 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
651 LLVMConstVector(shuffles, 4), "");
652
653 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
654 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
655
656 shuffles[0] = lp_build_const_int32(gallivm, 0);
657 shuffles[1] = lp_build_const_int32(gallivm, 2);
658 shuffles[2] = lp_build_const_int32(gallivm, 4);
659 shuffles[3] = lp_build_const_int32(gallivm, 6);
660 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
661 LLVMConstVector(shuffles, 4), "");
662
663 shuffles[0] = lp_build_const_int32(gallivm, 1);
664 shuffles[1] = lp_build_const_int32(gallivm, 3);
665 shuffles[2] = lp_build_const_int32(gallivm, 5);
666 shuffles[3] = lp_build_const_int32(gallivm, 7);
667 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
668 LLVMConstVector(shuffles, 4), "");
669
670 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
671 }
672
673
674 /*
675 * partially horizontally add 2-4 float vectors with length nx4,
676 * i.e. only four adjacent values in each vector will be added,
677 * assuming values are really grouped in 4 which also determines
678 * output order.
679 *
680 * Return a vector of the same length as the initial vectors,
681 * with the excess elements (if any) being undefined.
682 * The element order is independent of number of input vectors.
683 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
684 * the output order thus will be
685  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
686 */
687 LLVMValueRef
688 lp_build_hadd_partial4(struct lp_build_context *bld,
689 LLVMValueRef vectors[],
690 unsigned num_vecs)
691 {
692 struct gallivm_state *gallivm = bld->gallivm;
693 LLVMBuilderRef builder = gallivm->builder;
694 LLVMValueRef ret_vec;
695 LLVMValueRef tmp[4];
696 const char *intrinsic = NULL;
697
698 assert(num_vecs >= 2 && num_vecs <= 4);
699 assert(bld->type.floating);
700
701 /* only use this with at least 2 vectors, as it is sort of expensive
702 * (depending on cpu) and we always need two horizontal adds anyway,
703 * so a shuffle/add approach might be better.
704 */
705
706 tmp[0] = vectors[0];
707 tmp[1] = vectors[1];
708
709 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
710 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
711
712 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
713 bld->type.length == 4) {
714 intrinsic = "llvm.x86.sse3.hadd.ps";
715 }
716 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
717 bld->type.length == 8) {
718 intrinsic = "llvm.x86.avx.hadd.ps.256";
719 }
720 if (intrinsic) {
721 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
722 lp_build_vec_type(gallivm, bld->type),
723 tmp[0], tmp[1]);
724 if (num_vecs > 2) {
725 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
726 lp_build_vec_type(gallivm, bld->type),
727 tmp[2], tmp[3]);
728 }
729 else {
730 tmp[1] = tmp[0];
731 }
732 return lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 }
736
737 if (bld->type.length == 4) {
738 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
739 }
740 else {
741 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
742 unsigned j;
743 unsigned num_iter = bld->type.length / 4;
744 struct lp_type parttype = bld->type;
745 parttype.length = 4;
746 for (j = 0; j < num_iter; j++) {
747 LLVMValueRef partsrc[4];
748 unsigned i;
749 for (i = 0; i < 4; i++) {
750 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
751 }
752 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
753 }
754 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
755 }
756 return ret_vec;
757 }
758
759 /**
760 * Generate a - b
761 */
762 LLVMValueRef
763 lp_build_sub(struct lp_build_context *bld,
764 LLVMValueRef a,
765 LLVMValueRef b)
766 {
767 LLVMBuilderRef builder = bld->gallivm->builder;
768 const struct lp_type type = bld->type;
769 LLVMValueRef res;
770
771 assert(lp_check_value(type, a));
772 assert(lp_check_value(type, b));
773
774 if(b == bld->zero)
775 return a;
776 if(a == bld->undef || b == bld->undef)
777 return bld->undef;
778 if(a == b)
779 return bld->zero;
780
781 if(bld->type.norm) {
782 const char *intrinsic = NULL;
783
784 if(b == bld->one)
785 return bld->zero;
786
787 if (type.width * type.length == 128 &&
788 !type.floating && !type.fixed) {
789 if (util_cpu_caps.has_sse2) {
790 if(type.width == 8)
791 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
792 if(type.width == 16)
793 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
794 } else if (util_cpu_caps.has_altivec) {
795 if(type.width == 8)
796 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
797 if(type.width == 16)
798 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
799 }
800 }
801
802 if(intrinsic)
803 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
804 }
805
806 /* TODO: handle signed case */
807 if(type.norm && !type.floating && !type.fixed && !type.sign)
808 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
809
810 if(LLVMIsConstant(a) && LLVMIsConstant(b))
811 if (type.floating)
812 res = LLVMConstFSub(a, b);
813 else
814 res = LLVMConstSub(a, b);
815 else
816 if (type.floating)
817 res = LLVMBuildFSub(builder, a, b, "");
818 else
819 res = LLVMBuildSub(builder, a, b, "");
820
821 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
822 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
823
824 return res;
825 }
826
827
828
829 /**
830 * Normalized multiplication.
831 *
832 * There are several approaches for (using 8-bit normalized multiplication as
833 * an example):
834 *
835 * - alpha plus one
836 *
837 * makes the following approximation to the division (Sree)
838 *
839  *     a*b/255 ~= (a*(b + 1)) >> 8
840 *
841 * which is the fastest method that satisfies the following OpenGL criteria of
842 *
843 * 0*0 = 0 and 255*255 = 255
844 *
845 * - geometric series
846 *
847 * takes the geometric series approximation to the division
848 *
849 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
850 *
851 * in this case just the first two terms to fit in 16bit arithmetic
852 *
853 * t/255 ~= (t + (t >> 8)) >> 8
854 *
855  * note that just by itself it doesn't satisfy the OpenGL criteria, as
856  * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
857  * must be used.
858 *
859 * - geometric series plus rounding
860 *
861 * when using a geometric series division instead of truncating the result
862 * use roundoff in the approximation (Jim Blinn)
863 *
864 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
865 *
866  * achieving exact results.
867 *
868 *
869 *
870 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
871 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
872 * @sa Michael Herf, The "double blend trick", May 2000,
873 * http://www.stereopsis.com/doubleblend.html
874 */
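/*
 * Purely illustrative scalar equivalent of the 8-bit case above (a sketch of
 * the "geometric series plus rounding" formula, not used by the code below,
 * hence kept under #if 0):
 */
#if 0
static inline unsigned
mul_norm_u8_sketch(unsigned a, unsigned b)
{
   unsigned t = a * b;                  /* t in [0, 255*255] */
   return (t + (t >> 8) + 0x80) >> 8;   /* e.g. 255*255: (65025+254+128)>>8 = 255 */
}
#endif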
875 static LLVMValueRef
876 lp_build_mul_norm(struct gallivm_state *gallivm,
877 struct lp_type wide_type,
878 LLVMValueRef a, LLVMValueRef b)
879 {
880 LLVMBuilderRef builder = gallivm->builder;
881 struct lp_build_context bld;
882 unsigned n;
883 LLVMValueRef half;
884 LLVMValueRef ab;
885
886 assert(!wide_type.floating);
887 assert(lp_check_value(wide_type, a));
888 assert(lp_check_value(wide_type, b));
889
890 lp_build_context_init(&bld, gallivm, wide_type);
891
892 n = wide_type.width / 2;
893 if (wide_type.sign) {
894 --n;
895 }
896
897 /*
898 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
899 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
900 */
901
902 /*
903 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
904 */
905
906 ab = LLVMBuildMul(builder, a, b, "");
907 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
908
909 /*
910 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
911 */
912
913 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
914 if (wide_type.sign) {
915 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
916 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
917 half = lp_build_select(&bld, sign, minus_half, half);
918 }
919 ab = LLVMBuildAdd(builder, ab, half, "");
920
921 /* Final division */
922 ab = lp_build_shr_imm(&bld, ab, n);
923
924 return ab;
925 }
926
927 /**
928 * Generate a * b
929 */
930 LLVMValueRef
931 lp_build_mul(struct lp_build_context *bld,
932 LLVMValueRef a,
933 LLVMValueRef b)
934 {
935 LLVMBuilderRef builder = bld->gallivm->builder;
936 const struct lp_type type = bld->type;
937 LLVMValueRef shift;
938 LLVMValueRef res;
939
940 assert(lp_check_value(type, a));
941 assert(lp_check_value(type, b));
942
943 if(a == bld->zero)
944 return bld->zero;
945 if(a == bld->one)
946 return b;
947 if(b == bld->zero)
948 return bld->zero;
949 if(b == bld->one)
950 return a;
951 if(a == bld->undef || b == bld->undef)
952 return bld->undef;
953
954 if (!type.floating && !type.fixed && type.norm) {
955 struct lp_type wide_type = lp_wider_type(type);
956 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
957
958 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
959 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
960
961 /* PMULLW, PSRLW, PADDW */
962 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
963 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
964
965 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
966
967 return ab;
968 }
969
970 if(type.fixed)
971 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
972 else
973 shift = NULL;
974
975 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
976 if (type.floating)
977 res = LLVMConstFMul(a, b);
978 else
979 res = LLVMConstMul(a, b);
980 if(shift) {
981 if(type.sign)
982 res = LLVMConstAShr(res, shift);
983 else
984 res = LLVMConstLShr(res, shift);
985 }
986 }
987 else {
988 if (type.floating)
989 res = LLVMBuildFMul(builder, a, b, "");
990 else
991 res = LLVMBuildMul(builder, a, b, "");
992 if(shift) {
993 if(type.sign)
994 res = LLVMBuildAShr(builder, res, shift, "");
995 else
996 res = LLVMBuildLShr(builder, res, shift, "");
997 }
998 }
999
1000 return res;
1001 }
1002
1003
1004 /**
1005 * Small vector x scale multiplication optimization.
1006 */
1007 LLVMValueRef
1008 lp_build_mul_imm(struct lp_build_context *bld,
1009 LLVMValueRef a,
1010 int b)
1011 {
1012 LLVMBuilderRef builder = bld->gallivm->builder;
1013 LLVMValueRef factor;
1014
1015 assert(lp_check_value(bld->type, a));
1016
1017 if(b == 0)
1018 return bld->zero;
1019
1020 if(b == 1)
1021 return a;
1022
1023 if(b == -1)
1024 return lp_build_negate(bld, a);
1025
1026 if(b == 2 && bld->type.floating)
1027 return lp_build_add(bld, a, a);
1028
1029 if(util_is_power_of_two(b)) {
1030 unsigned shift = ffs(b) - 1;
1031
1032 if(bld->type.floating) {
1033 #if 0
1034 /*
1035 * Power of two multiplication by directly manipulating the exponent.
1036 *
1037 * XXX: This might not be always faster, it will introduce a small error
1038 * for multiplication by zero, and it will produce wrong results
1039 * for Inf and NaN.
1040 */
1041 unsigned mantissa = lp_mantissa(bld->type);
1042 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1043 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1044 a = LLVMBuildAdd(builder, a, factor, "");
1045 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1046 return a;
1047 #endif
1048 }
1049 else {
1050 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1051 return LLVMBuildShl(builder, a, factor, "");
1052 }
1053 }
1054
1055 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1056 return lp_build_mul(bld, a, factor);
1057 }
1058
1059
1060 /**
1061 * Generate a / b
1062 */
1063 LLVMValueRef
1064 lp_build_div(struct lp_build_context *bld,
1065 LLVMValueRef a,
1066 LLVMValueRef b)
1067 {
1068 LLVMBuilderRef builder = bld->gallivm->builder;
1069 const struct lp_type type = bld->type;
1070
1071 assert(lp_check_value(type, a));
1072 assert(lp_check_value(type, b));
1073
1074 if(a == bld->zero)
1075 return bld->zero;
1076 if(a == bld->one)
1077 return lp_build_rcp(bld, b);
1078 if(b == bld->zero)
1079 return bld->undef;
1080 if(b == bld->one)
1081 return a;
1082 if(a == bld->undef || b == bld->undef)
1083 return bld->undef;
1084
1085 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1086 if (type.floating)
1087 return LLVMConstFDiv(a, b);
1088 else if (type.sign)
1089 return LLVMConstSDiv(a, b);
1090 else
1091 return LLVMConstUDiv(a, b);
1092 }
1093
1094 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1095 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1096 type.floating)
1097 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1098
1099 if (type.floating)
1100 return LLVMBuildFDiv(builder, a, b, "");
1101 else if (type.sign)
1102 return LLVMBuildSDiv(builder, a, b, "");
1103 else
1104 return LLVMBuildUDiv(builder, a, b, "");
1105 }
1106
1107
1108 /**
1109 * Linear interpolation helper.
1110 *
1111  * @param flags  LP_BLD_LERP_x flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1112  *        interpolating normalized values encoded in integers twice as wide.
1113 *
1114 * @sa http://www.stereopsis.com/doubleblend.html
1115 */
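/*
 * In essence this computes res = v0 + x*(v1 - v0); with
 * LP_BLD_LERP_WIDE_NORMALIZED the weighted delta is additionally divided by
 * 2**(width/2) after the rescaling of x described inside the function.
 */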
1116 static INLINE LLVMValueRef
1117 lp_build_lerp_simple(struct lp_build_context *bld,
1118 LLVMValueRef x,
1119 LLVMValueRef v0,
1120 LLVMValueRef v1,
1121 unsigned flags)
1122 {
1123 unsigned half_width = bld->type.width/2;
1124 LLVMBuilderRef builder = bld->gallivm->builder;
1125 LLVMValueRef delta;
1126 LLVMValueRef res;
1127
1128 assert(lp_check_value(bld->type, x));
1129 assert(lp_check_value(bld->type, v0));
1130 assert(lp_check_value(bld->type, v1));
1131
1132 delta = lp_build_sub(bld, v1, v0);
1133
1134 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1135 if (!bld->type.sign) {
1136 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1137 /*
1138 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1139             * most significant bit to the least significant bit, so that
1140 * later we can just divide by 2**n instead of 2**n - 1.
1141 */
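            /*
             * E.g. with n = 8: x = 255 becomes 255 + (255 >> 7) = 256, so a
             * full weight yields (256 * delta) >> 8 = delta exactly, while
             * x = 0 stays 0.
             */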
1142
1143 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1144 }
1145
1146 /* (x * delta) >> n */
1147 res = lp_build_mul(bld, x, delta);
1148 res = lp_build_shr_imm(bld, res, half_width);
1149 } else {
1150 /*
1151 * The rescaling trick above doesn't work for signed numbers, so
1152             * use the 2**n - 1 division approximation in lp_build_mul_norm
1153 * instead.
1154 */
1155 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1156 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1157 }
1158 } else {
1159 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1160 res = lp_build_mul(bld, x, delta);
1161 }
1162
1163 res = lp_build_add(bld, v0, res);
1164
1165 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1166 bld->type.fixed) {
1167       /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1168       /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1169 * but it will be wrong for true fixed point use cases. Basically we need
1170 * a more powerful lp_type, capable of further distinguishing the values
1171 * interpretation from the value storage. */
1172 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1173 }
1174
1175 return res;
1176 }
1177
1178
1179 /**
1180 * Linear interpolation.
1181 */
1182 LLVMValueRef
1183 lp_build_lerp(struct lp_build_context *bld,
1184 LLVMValueRef x,
1185 LLVMValueRef v0,
1186 LLVMValueRef v1,
1187 unsigned flags)
1188 {
1189 const struct lp_type type = bld->type;
1190 LLVMValueRef res;
1191
1192 assert(lp_check_value(type, x));
1193 assert(lp_check_value(type, v0));
1194 assert(lp_check_value(type, v1));
1195
1196 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1197
1198 if (type.norm) {
1199 struct lp_type wide_type;
1200 struct lp_build_context wide_bld;
1201 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1202
1203 assert(type.length >= 2);
1204
1205 /*
1206 * Create a wider integer type, enough to hold the
1207 * intermediate result of the multiplication.
1208 */
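      /* E.g. 8-bit x 16 lanes becomes 16-bit x 8 lanes for each half. */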
1209 memset(&wide_type, 0, sizeof wide_type);
1210 wide_type.sign = type.sign;
1211 wide_type.width = type.width*2;
1212 wide_type.length = type.length/2;
1213
1214 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1215
1216 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1217 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1218 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1219
1220 /*
1221 * Lerp both halves.
1222 */
1223
1224 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1225
1226 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1227 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1228
1229 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1230 } else {
1231 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1232 }
1233
1234 return res;
1235 }
1236
1237
1238 /**
1239 * Bilinear interpolation.
1240 *
1241  * Value indices are in v_{yx}.
1242 */
1243 LLVMValueRef
1244 lp_build_lerp_2d(struct lp_build_context *bld,
1245 LLVMValueRef x,
1246 LLVMValueRef y,
1247 LLVMValueRef v00,
1248 LLVMValueRef v01,
1249 LLVMValueRef v10,
1250 LLVMValueRef v11,
1251 unsigned flags)
1252 {
1253 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1254 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1255 return lp_build_lerp(bld, y, v0, v1, flags);
1256 }
1257
1258
1259 LLVMValueRef
1260 lp_build_lerp_3d(struct lp_build_context *bld,
1261 LLVMValueRef x,
1262 LLVMValueRef y,
1263 LLVMValueRef z,
1264 LLVMValueRef v000,
1265 LLVMValueRef v001,
1266 LLVMValueRef v010,
1267 LLVMValueRef v011,
1268 LLVMValueRef v100,
1269 LLVMValueRef v101,
1270 LLVMValueRef v110,
1271 LLVMValueRef v111,
1272 unsigned flags)
1273 {
1274 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1275 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1276 return lp_build_lerp(bld, z, v0, v1, flags);
1277 }
1278
1279
1280 /**
1281 * Generate min(a, b)
1282 * Do checks for special cases but not for nans.
1283 */
1284 LLVMValueRef
1285 lp_build_min(struct lp_build_context *bld,
1286 LLVMValueRef a,
1287 LLVMValueRef b)
1288 {
1289 assert(lp_check_value(bld->type, a));
1290 assert(lp_check_value(bld->type, b));
1291
1292 if(a == bld->undef || b == bld->undef)
1293 return bld->undef;
1294
1295 if(a == b)
1296 return a;
1297
1298 if (bld->type.norm) {
1299 if (!bld->type.sign) {
1300 if (a == bld->zero || b == bld->zero) {
1301 return bld->zero;
1302 }
1303 }
1304 if(a == bld->one)
1305 return b;
1306 if(b == bld->one)
1307 return a;
1308 }
1309
1310 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1311 }
1312
1313
1314 /**
1315 * Generate min(a, b)
1316 * NaN's are handled according to the behavior specified by the
1317 * nan_behavior argument.
1318 */
1319 LLVMValueRef
1320 lp_build_min_ext(struct lp_build_context *bld,
1321 LLVMValueRef a,
1322 LLVMValueRef b,
1323 enum gallivm_nan_behavior nan_behavior)
1324 {
1325 assert(lp_check_value(bld->type, a));
1326 assert(lp_check_value(bld->type, b));
1327
1328 if(a == bld->undef || b == bld->undef)
1329 return bld->undef;
1330
1331 if(a == b)
1332 return a;
1333
1334 if (bld->type.norm) {
1335 if (!bld->type.sign) {
1336 if (a == bld->zero || b == bld->zero) {
1337 return bld->zero;
1338 }
1339 }
1340 if(a == bld->one)
1341 return b;
1342 if(b == bld->one)
1343 return a;
1344 }
1345
1346 return lp_build_min_simple(bld, a, b, nan_behavior);
1347 }
1348
1349 /**
1350 * Generate max(a, b)
1351 * Do checks for special cases, but NaN behavior is undefined.
1352 */
1353 LLVMValueRef
1354 lp_build_max(struct lp_build_context *bld,
1355 LLVMValueRef a,
1356 LLVMValueRef b)
1357 {
1358 assert(lp_check_value(bld->type, a));
1359 assert(lp_check_value(bld->type, b));
1360
1361 if(a == bld->undef || b == bld->undef)
1362 return bld->undef;
1363
1364 if(a == b)
1365 return a;
1366
1367 if(bld->type.norm) {
1368 if(a == bld->one || b == bld->one)
1369 return bld->one;
1370 if (!bld->type.sign) {
1371 if (a == bld->zero) {
1372 return b;
1373 }
1374 if (b == bld->zero) {
1375 return a;
1376 }
1377 }
1378 }
1379
1380 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1381 }
1382
1383
1384 /**
1385 * Generate max(a, b)
1386 * Checks for special cases.
1387 * NaN's are handled according to the behavior specified by the
1388 * nan_behavior argument.
1389 */
1390 LLVMValueRef
1391 lp_build_max_ext(struct lp_build_context *bld,
1392 LLVMValueRef a,
1393 LLVMValueRef b,
1394 enum gallivm_nan_behavior nan_behavior)
1395 {
1396 assert(lp_check_value(bld->type, a));
1397 assert(lp_check_value(bld->type, b));
1398
1399 if(a == bld->undef || b == bld->undef)
1400 return bld->undef;
1401
1402 if(a == b)
1403 return a;
1404
1405 if(bld->type.norm) {
1406 if(a == bld->one || b == bld->one)
1407 return bld->one;
1408 if (!bld->type.sign) {
1409 if (a == bld->zero) {
1410 return b;
1411 }
1412 if (b == bld->zero) {
1413 return a;
1414 }
1415 }
1416 }
1417
1418 return lp_build_max_simple(bld, a, b, nan_behavior);
1419 }
1420
1421 /**
1422 * Generate clamp(a, min, max)
1423 * NaN behavior (for any of a, min, max) is undefined.
1424 * Do checks for special cases.
1425 */
1426 LLVMValueRef
1427 lp_build_clamp(struct lp_build_context *bld,
1428 LLVMValueRef a,
1429 LLVMValueRef min,
1430 LLVMValueRef max)
1431 {
1432 assert(lp_check_value(bld->type, a));
1433 assert(lp_check_value(bld->type, min));
1434 assert(lp_check_value(bld->type, max));
1435
1436 a = lp_build_min(bld, a, max);
1437 a = lp_build_max(bld, a, min);
1438 return a;
1439 }
1440
1441
1442 /**
1443 * Generate clamp(a, 0, 1)
1444 * A NaN will get converted to zero.
1445 */
1446 LLVMValueRef
1447 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1448 LLVMValueRef a)
1449 {
1450 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1451 a = lp_build_min(bld, a, bld->one);
1452 return a;
1453 }
1454
1455
1456 /**
1457 * Generate abs(a)
1458 */
1459 LLVMValueRef
1460 lp_build_abs(struct lp_build_context *bld,
1461 LLVMValueRef a)
1462 {
1463 LLVMBuilderRef builder = bld->gallivm->builder;
1464 const struct lp_type type = bld->type;
1465 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1466
1467 assert(lp_check_value(type, a));
1468
1469 if(!type.sign)
1470 return a;
1471
1472 if(type.floating) {
1473 /* Mask out the sign bit */
1474 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1475 unsigned long long absMask = ~(1ULL << (type.width - 1));
1476 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1477 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1478 a = LLVMBuildAnd(builder, a, mask, "");
1479 a = LLVMBuildBitCast(builder, a, vec_type, "");
1480 return a;
1481 }
1482
1483 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1484 switch(type.width) {
1485 case 8:
1486 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1487 case 16:
1488 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1489 case 32:
1490 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1491 }
1492 }
1493 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1494 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1495 (type.width == 8 || type.width == 16 || type.width == 32)) {
1496 debug_printf("%s: inefficient code, should split vectors manually\n",
1497 __FUNCTION__);
1498 }
1499
1500 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1501 }
1502
1503
1504 LLVMValueRef
1505 lp_build_negate(struct lp_build_context *bld,
1506 LLVMValueRef a)
1507 {
1508 LLVMBuilderRef builder = bld->gallivm->builder;
1509
1510 assert(lp_check_value(bld->type, a));
1511
1512 if (bld->type.floating)
1513 a = LLVMBuildFNeg(builder, a, "");
1514 else
1515 a = LLVMBuildNeg(builder, a, "");
1516
1517 return a;
1518 }
1519
1520
1521 /** Return -1, 0 or +1 depending on the sign of a */
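/*
 * For floats this ORs the sign bit of 'a' into the bit pattern of the
 * constant 1.0, e.g. a = -3.0f: 0x80000000 | 0x3f800000 = 0xbf800000 = -1.0f;
 * the zero case is then patched up with a separate select at the end.
 */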
1522 LLVMValueRef
1523 lp_build_sgn(struct lp_build_context *bld,
1524 LLVMValueRef a)
1525 {
1526 LLVMBuilderRef builder = bld->gallivm->builder;
1527 const struct lp_type type = bld->type;
1528 LLVMValueRef cond;
1529 LLVMValueRef res;
1530
1531 assert(lp_check_value(type, a));
1532
1533 /* Handle non-zero case */
1534 if(!type.sign) {
1535 /* if not zero then sign must be positive */
1536 res = bld->one;
1537 }
1538 else if(type.floating) {
1539 LLVMTypeRef vec_type;
1540 LLVMTypeRef int_type;
1541 LLVMValueRef mask;
1542 LLVMValueRef sign;
1543 LLVMValueRef one;
1544 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1545
1546 int_type = lp_build_int_vec_type(bld->gallivm, type);
1547 vec_type = lp_build_vec_type(bld->gallivm, type);
1548 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1549
1550 /* Take the sign bit and add it to 1 constant */
1551 sign = LLVMBuildBitCast(builder, a, int_type, "");
1552 sign = LLVMBuildAnd(builder, sign, mask, "");
1553 one = LLVMConstBitCast(bld->one, int_type);
1554 res = LLVMBuildOr(builder, sign, one, "");
1555 res = LLVMBuildBitCast(builder, res, vec_type, "");
1556 }
1557 else
1558 {
1559 /* signed int/norm/fixed point */
1560 /* could use psign with sse3 and appropriate vectors here */
1561 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1562 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1563 res = lp_build_select(bld, cond, bld->one, minus_one);
1564 }
1565
1566 /* Handle zero */
1567 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1568 res = lp_build_select(bld, cond, bld->zero, res);
1569
1570 return res;
1571 }
1572
1573
1574 /**
1575 * Set the sign of float vector 'a' according to 'sign'.
1576 * If sign==0, return abs(a).
1577 * If sign==1, return -abs(a);
1578 * Other values for sign produce undefined results.
1579 */
1580 LLVMValueRef
1581 lp_build_set_sign(struct lp_build_context *bld,
1582 LLVMValueRef a, LLVMValueRef sign)
1583 {
1584 LLVMBuilderRef builder = bld->gallivm->builder;
1585 const struct lp_type type = bld->type;
1586 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1587 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1588 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1589 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1590 ~((unsigned long long) 1 << (type.width - 1)));
1591 LLVMValueRef val, res;
1592
1593 assert(type.floating);
1594 assert(lp_check_value(type, a));
1595
1596 /* val = reinterpret_cast<int>(a) */
1597 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1598 /* val = val & mask */
1599 val = LLVMBuildAnd(builder, val, mask, "");
1600 /* sign = sign << shift */
1601 sign = LLVMBuildShl(builder, sign, shift, "");
1602 /* res = val | sign */
1603 res = LLVMBuildOr(builder, val, sign, "");
1604 /* res = reinterpret_cast<float>(res) */
1605 res = LLVMBuildBitCast(builder, res, vec_type, "");
1606
1607 return res;
1608 }
1609
1610
1611 /**
1612 * Convert vector of (or scalar) int to vector of (or scalar) float.
1613 */
1614 LLVMValueRef
1615 lp_build_int_to_float(struct lp_build_context *bld,
1616 LLVMValueRef a)
1617 {
1618 LLVMBuilderRef builder = bld->gallivm->builder;
1619 const struct lp_type type = bld->type;
1620 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1621
1622 assert(type.floating);
1623
1624 return LLVMBuildSIToFP(builder, a, vec_type, "");
1625 }
1626
1627 static boolean
1628 arch_rounding_available(const struct lp_type type)
1629 {
1630 if ((util_cpu_caps.has_sse4_1 &&
1631 (type.length == 1 || type.width*type.length == 128)) ||
1632 (util_cpu_caps.has_avx && type.width*type.length == 256))
1633 return TRUE;
1634 else if ((util_cpu_caps.has_altivec &&
1635 (type.width == 32 && type.length == 4)))
1636 return TRUE;
1637
1638 return FALSE;
1639 }
1640
1641 enum lp_build_round_mode
1642 {
1643 LP_BUILD_ROUND_NEAREST = 0,
1644 LP_BUILD_ROUND_FLOOR = 1,
1645 LP_BUILD_ROUND_CEIL = 2,
1646 LP_BUILD_ROUND_TRUNCATE = 3
1647 };
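/*
 * Note: these values match the rounding-mode bits of the SSE4.1
 * ROUNDPS/ROUNDPD immediate, so lp_build_round_sse41() can pass the mode
 * through unchanged.
 */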
1648
1649 /**
1650 * Helper for SSE4.1's ROUNDxx instructions.
1651 *
1652  * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1653 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1654 */
1655 static INLINE LLVMValueRef
1656 lp_build_round_sse41(struct lp_build_context *bld,
1657 LLVMValueRef a,
1658 enum lp_build_round_mode mode)
1659 {
1660 LLVMBuilderRef builder = bld->gallivm->builder;
1661 const struct lp_type type = bld->type;
1662 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1663 const char *intrinsic;
1664 LLVMValueRef res;
1665
1666 assert(type.floating);
1667
1668 assert(lp_check_value(type, a));
1669 assert(util_cpu_caps.has_sse4_1);
1670
1671 if (type.length == 1) {
1672 LLVMTypeRef vec_type;
1673 LLVMValueRef undef;
1674 LLVMValueRef args[3];
1675 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1676
1677 switch(type.width) {
1678 case 32:
1679 intrinsic = "llvm.x86.sse41.round.ss";
1680 break;
1681 case 64:
1682 intrinsic = "llvm.x86.sse41.round.sd";
1683 break;
1684 default:
1685 assert(0);
1686 return bld->undef;
1687 }
1688
1689 vec_type = LLVMVectorType(bld->elem_type, 4);
1690
1691 undef = LLVMGetUndef(vec_type);
1692
1693 args[0] = undef;
1694 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1695 args[2] = LLVMConstInt(i32t, mode, 0);
1696
1697 res = lp_build_intrinsic(builder, intrinsic,
1698 vec_type, args, Elements(args));
1699
1700 res = LLVMBuildExtractElement(builder, res, index0, "");
1701 }
1702 else {
1703 if (type.width * type.length == 128) {
1704 switch(type.width) {
1705 case 32:
1706 intrinsic = "llvm.x86.sse41.round.ps";
1707 break;
1708 case 64:
1709 intrinsic = "llvm.x86.sse41.round.pd";
1710 break;
1711 default:
1712 assert(0);
1713 return bld->undef;
1714 }
1715 }
1716 else {
1717 assert(type.width * type.length == 256);
1718 assert(util_cpu_caps.has_avx);
1719
1720 switch(type.width) {
1721 case 32:
1722 intrinsic = "llvm.x86.avx.round.ps.256";
1723 break;
1724 case 64:
1725 intrinsic = "llvm.x86.avx.round.pd.256";
1726 break;
1727 default:
1728 assert(0);
1729 return bld->undef;
1730 }
1731 }
1732
1733 res = lp_build_intrinsic_binary(builder, intrinsic,
1734 bld->vec_type, a,
1735 LLVMConstInt(i32t, mode, 0));
1736 }
1737
1738 return res;
1739 }
1740
1741
1742 static INLINE LLVMValueRef
1743 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1744 LLVMValueRef a)
1745 {
1746 LLVMBuilderRef builder = bld->gallivm->builder;
1747 const struct lp_type type = bld->type;
1748 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1749 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1750 const char *intrinsic;
1751 LLVMValueRef res;
1752
1753 assert(type.floating);
1754 /* using the double precision conversions is a bit more complicated */
1755 assert(type.width == 32);
1756
1757 assert(lp_check_value(type, a));
1758 assert(util_cpu_caps.has_sse2);
1759
1760 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1761 if (type.length == 1) {
1762 LLVMTypeRef vec_type;
1763 LLVMValueRef undef;
1764 LLVMValueRef arg;
1765 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1766
1767 vec_type = LLVMVectorType(bld->elem_type, 4);
1768
1769 intrinsic = "llvm.x86.sse.cvtss2si";
1770
1771 undef = LLVMGetUndef(vec_type);
1772
1773 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1774
1775 res = lp_build_intrinsic_unary(builder, intrinsic,
1776 ret_type, arg);
1777 }
1778 else {
1779 if (type.width* type.length == 128) {
1780 intrinsic = "llvm.x86.sse2.cvtps2dq";
1781 }
1782 else {
1783 assert(type.width*type.length == 256);
1784 assert(util_cpu_caps.has_avx);
1785
1786 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1787 }
1788 res = lp_build_intrinsic_unary(builder, intrinsic,
1789 ret_type, a);
1790 }
1791
1792 return res;
1793 }
1794
1795
1796 /*
1797 */
1798 static INLINE LLVMValueRef
1799 lp_build_round_altivec(struct lp_build_context *bld,
1800 LLVMValueRef a,
1801 enum lp_build_round_mode mode)
1802 {
1803 LLVMBuilderRef builder = bld->gallivm->builder;
1804 const struct lp_type type = bld->type;
1805 const char *intrinsic = NULL;
1806
1807 assert(type.floating);
1808
1809 assert(lp_check_value(type, a));
1810 assert(util_cpu_caps.has_altivec);
1811
1812 switch (mode) {
1813 case LP_BUILD_ROUND_NEAREST:
1814 intrinsic = "llvm.ppc.altivec.vrfin";
1815 break;
1816 case LP_BUILD_ROUND_FLOOR:
1817 intrinsic = "llvm.ppc.altivec.vrfim";
1818 break;
1819 case LP_BUILD_ROUND_CEIL:
1820 intrinsic = "llvm.ppc.altivec.vrfip";
1821 break;
1822 case LP_BUILD_ROUND_TRUNCATE:
1823 intrinsic = "llvm.ppc.altivec.vrfiz";
1824 break;
1825 }
1826
1827 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1828 }
1829
1830 static INLINE LLVMValueRef
1831 lp_build_round_arch(struct lp_build_context *bld,
1832 LLVMValueRef a,
1833 enum lp_build_round_mode mode)
1834 {
1835 if (util_cpu_caps.has_sse4_1)
1836 return lp_build_round_sse41(bld, a, mode);
1837 else /* (util_cpu_caps.has_altivec) */
1838 return lp_build_round_altivec(bld, a, mode);
1839 }
1840
1841 /**
1842 * Return the integer part of a float (vector) value (== round toward zero).
1843 * The returned value is a float (vector).
1844 * Ex: trunc(-1.5) = -1.0
1845 */
1846 LLVMValueRef
1847 lp_build_trunc(struct lp_build_context *bld,
1848 LLVMValueRef a)
1849 {
1850 LLVMBuilderRef builder = bld->gallivm->builder;
1851 const struct lp_type type = bld->type;
1852
1853 assert(type.floating);
1854 assert(lp_check_value(type, a));
1855
1856 if (arch_rounding_available(type)) {
1857 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1858 }
1859 else {
1860 const struct lp_type type = bld->type;
1861 struct lp_type inttype;
1862 struct lp_build_context intbld;
1863       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
1864 LLVMValueRef trunc, res, anosign, mask;
1865 LLVMTypeRef int_vec_type = bld->int_vec_type;
1866 LLVMTypeRef vec_type = bld->vec_type;
1867
1868 assert(type.width == 32); /* might want to handle doubles at some point */
1869
1870 inttype = type;
1871 inttype.floating = 0;
1872 lp_build_context_init(&intbld, bld->gallivm, inttype);
1873
1874 /* round by truncation */
1875 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1876 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1877
1878 /* mask out sign bit */
1879 anosign = lp_build_abs(bld, a);
1880 /*
1881 * mask out all values if anosign > 2^24
1882 * This should work both for large ints (all rounding is no-op for them
1883 * because such floats are always exact) as well as special cases like
1884 * NaNs, Infs (taking advantage of the fact they use max exponent).
1885 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1886 */
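      /*
       * (Single precision floats carry a 24-bit significand, so any value
       * with magnitude >= 2^24 = 16777216.0 is already an integer and
       * returning the input unchanged is correct there as well.)
       */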
1887 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1888 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1889 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1890 return lp_build_select(bld, mask, a, res);
1891 }
1892 }
1893
1894
1895 /**
1896 * Return float (vector) rounded to nearest integer (vector). The returned
1897 * value is a float (vector).
1898 * Ex: round(0.9) = 1.0
1899 * Ex: round(-1.5) = -2.0
1900 */
1901 LLVMValueRef
1902 lp_build_round(struct lp_build_context *bld,
1903 LLVMValueRef a)
1904 {
1905 LLVMBuilderRef builder = bld->gallivm->builder;
1906 const struct lp_type type = bld->type;
1907
1908 assert(type.floating);
1909 assert(lp_check_value(type, a));
1910
1911 if (arch_rounding_available(type)) {
1912 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1913 }
1914 else {
1915 const struct lp_type type = bld->type;
1916 struct lp_type inttype;
1917 struct lp_build_context intbld;
1918       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
1919 LLVMValueRef res, anosign, mask;
1920 LLVMTypeRef int_vec_type = bld->int_vec_type;
1921 LLVMTypeRef vec_type = bld->vec_type;
1922
1923 assert(type.width == 32); /* might want to handle doubles at some point */
1924
1925 inttype = type;
1926 inttype.floating = 0;
1927 lp_build_context_init(&intbld, bld->gallivm, inttype);
1928
1929 res = lp_build_iround(bld, a);
1930 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1931
1932 /* mask out sign bit */
1933 anosign = lp_build_abs(bld, a);
1934 /*
1935 * mask out all values if anosign > 2^24
1936 * This should work both for large ints (all rounding is no-op for them
1937 * because such floats are always exact) as well as special cases like
1938 * NaNs, Infs (taking advantage of the fact they use max exponent).
1939 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1940 */
1941 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1942 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1943 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1944 return lp_build_select(bld, mask, a, res);
1945 }
1946 }
1947
1948
1949 /**
1950 * Return floor of float (vector), result is a float (vector)
1951 * Ex: floor(1.1) = 1.0
1952 * Ex: floor(-1.1) = -2.0
1953 */
1954 LLVMValueRef
1955 lp_build_floor(struct lp_build_context *bld,
1956 LLVMValueRef a)
1957 {
1958 LLVMBuilderRef builder = bld->gallivm->builder;
1959 const struct lp_type type = bld->type;
1960
1961 assert(type.floating);
1962 assert(lp_check_value(type, a));
1963
1964 if (arch_rounding_available(type)) {
1965 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1966 }
1967 else {
1968 const struct lp_type type = bld->type;
1969 struct lp_type inttype;
1970 struct lp_build_context intbld;
1971 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1972 LLVMValueRef trunc, res, anosign, mask;
1973 LLVMTypeRef int_vec_type = bld->int_vec_type;
1974 LLVMTypeRef vec_type = bld->vec_type;
1975
1976 assert(type.width == 32); /* might want to handle doubles at some point */
1977
1978 inttype = type;
1979 inttype.floating = 0;
1980 lp_build_context_init(&intbld, bld->gallivm, inttype);
1981
1982 /* round by truncation */
1983 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1984 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1985
1986 if (type.sign) {
1987 LLVMValueRef tmp;
1988
1989 /*
1990 * fix values if rounding is wrong (for non-special cases)
1991 * - this is the case if trunc > a
1992 */
1993 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1994 /* tmp = trunc > a ? 1.0 : 0.0 */
1995 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1996 tmp = lp_build_and(&intbld, mask, tmp);
1997 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1998 res = lp_build_sub(bld, res, tmp);
1999 }
2000
2001 /* mask out sign bit */
2002 anosign = lp_build_abs(bld, a);
2003 /*
2004 * mask out all values if anosign > 2^24
2005 * This should work both for large ints (all rounding is no-op for them
2006 * because such floats are always exact) as well as special cases like
2007 * NaNs, Infs (taking advantage of the fact they use max exponent).
2008 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2009 */
2010 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2011 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2012 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2013 return lp_build_select(bld, mask, a, res);
2014 }
2015 }
2016
2017
2018 /**
2019 * Return ceiling of float (vector), returning float (vector).
2020 * Ex: ceil( 1.1) = 2.0
2021 * Ex: ceil(-1.1) = -1.0
2022 */
2023 LLVMValueRef
2024 lp_build_ceil(struct lp_build_context *bld,
2025 LLVMValueRef a)
2026 {
2027 LLVMBuilderRef builder = bld->gallivm->builder;
2028 const struct lp_type type = bld->type;
2029
2030 assert(type.floating);
2031 assert(lp_check_value(type, a));
2032
2033 if (arch_rounding_available(type)) {
2034 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2035 }
2036 else {
2037 const struct lp_type type = bld->type;
2038 struct lp_type inttype;
2039 struct lp_build_context intbld;
2040 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
2041 LLVMValueRef trunc, res, anosign, mask, tmp;
2042 LLVMTypeRef int_vec_type = bld->int_vec_type;
2043 LLVMTypeRef vec_type = bld->vec_type;
2044
2045 assert(type.width == 32); /* might want to handle doubles at some point */
2046
2047 inttype = type;
2048 inttype.floating = 0;
2049 lp_build_context_init(&intbld, bld->gallivm, inttype);
2050
2051 /* round by truncation */
2052 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2053 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2054
2055 /*
2056 * fix values if rounding is wrong (for non-special cases)
2057 * - this is the case if trunc < a
2058 */
2059 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2060 /* tmp = trunc < a ? 1.0 : 0.0 */
2061 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2062 tmp = lp_build_and(&intbld, mask, tmp);
2063 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2064 res = lp_build_add(bld, trunc, tmp);
2065
2066 /* mask out sign bit */
2067 anosign = lp_build_abs(bld, a);
2068 /*
2069 * mask out all values if anosign > 2^24
2070 * This should work both for large ints (all rounding is no-op for them
2071 * because such floats are always exact) as well as special cases like
2072 * NaNs, Infs (taking advantage of the fact they use max exponent).
2073 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2074 */
2075 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2076 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2077 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2078 return lp_build_select(bld, mask, a, res);
2079 }
2080 }
2081
2082
2083 /**
2084 * Return fractional part of 'a' computed as a - floor(a)
2085 * Typically used in texture coord arithmetic.
2086 */
2087 LLVMValueRef
2088 lp_build_fract(struct lp_build_context *bld,
2089 LLVMValueRef a)
2090 {
2091 assert(bld->type.floating);
2092 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2093 }
2094
2095
2096 /**
2097 * Prevent returning a fractional part of 1.0 for very small negative values of
2098 * 'a' by clamping against 0.99999(9).
2099 */
2100 static inline LLVMValueRef
2101 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2102 {
2103 LLVMValueRef max;
2104
2105 /* this is the largest number smaller than 1.0 representable as float */
2106 max = lp_build_const_vec(bld->gallivm, bld->type,
2107 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2108 return lp_build_min(bld, fract, max);
2109 }
2110
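/*
 * Worked example (editorial addition): for 32-bit floats the clamp value is
 * 1.0 - 1.0/2^24 = 0.99999994..., the largest float below 1.0. Without it,
 * something like fract(-1e-9) would compute -1e-9 - (-1.0) and round up to
 * exactly 1.0, which callers relying on fract() < 1.0 cannot handle.
 */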
2111
2112 /**
2113 * Same as lp_build_fract, but guarantees that the result is always smaller
2114 * than one.
2115 */
2116 LLVMValueRef
2117 lp_build_fract_safe(struct lp_build_context *bld,
2118 LLVMValueRef a)
2119 {
2120 return clamp_fract(bld, lp_build_fract(bld, a));
2121 }
2122
2123
2124 /**
2125 * Return the integer part of a float (vector) value (== round toward zero).
2126 * The returned value is an integer (vector).
2127 * Ex: itrunc(-1.5) = -1
2128 */
2129 LLVMValueRef
2130 lp_build_itrunc(struct lp_build_context *bld,
2131 LLVMValueRef a)
2132 {
2133 LLVMBuilderRef builder = bld->gallivm->builder;
2134 const struct lp_type type = bld->type;
2135 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2136
2137 assert(type.floating);
2138 assert(lp_check_value(type, a));
2139
2140 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2141 }
2142
2143
2144 /**
2145 * Return float (vector) rounded to nearest integer (vector). The returned
2146 * value is an integer (vector).
2147 * Ex: iround(0.9) = 1
2148 * Ex: iround(-1.5) = -2
2149 */
2150 LLVMValueRef
2151 lp_build_iround(struct lp_build_context *bld,
2152 LLVMValueRef a)
2153 {
2154 LLVMBuilderRef builder = bld->gallivm->builder;
2155 const struct lp_type type = bld->type;
2156 LLVMTypeRef int_vec_type = bld->int_vec_type;
2157 LLVMValueRef res;
2158
2159 assert(type.floating);
2160
2161 assert(lp_check_value(type, a));
2162
2163 if ((util_cpu_caps.has_sse2 &&
2164 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2165 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2166 return lp_build_iround_nearest_sse2(bld, a);
2167 }
2168 if (arch_rounding_available(type)) {
2169 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2170 }
2171 else {
2172 LLVMValueRef half;
2173
2174 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2175
2176 if (type.sign) {
2177 LLVMTypeRef vec_type = bld->vec_type;
2178 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2179 (unsigned long long)1 << (type.width - 1));
2180 LLVMValueRef sign;
2181
2182 /* get sign bit */
2183 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2184 sign = LLVMBuildAnd(builder, sign, mask, "");
2185
2186 /* sign * 0.5 */
2187 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2188 half = LLVMBuildOr(builder, sign, half, "");
2189 half = LLVMBuildBitCast(builder, half, vec_type, "");
2190 }
2191
2192 res = LLVMBuildFAdd(builder, a, half, "");
2193 }
2194
2195 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2196
2197 return res;
2198 }
2199
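/*
 * Illustrative scalar sketch (editorial addition) of the fallback path in
 * lp_build_iround above: for signed types the sign bit of 'a' is OR'd onto
 * 0.5 so the addition always moves away from zero before truncating, e.g.
 * iround(-1.5) = (int)(-1.5 + -0.5) = -2 and iround(0.9) = (int)(0.9 + 0.5) = 1.
 *
 *    int iround_emul(float a)
 *    {
 *       return (int)(a + copysignf(0.5f, a));
 *    }
 */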
2200
2201 /**
2202 * Return floor of float (vector), result is an int (vector)
2203 * Ex: ifloor(1.1) = 1
2204 * Ex: ifloor(-1.1) = -2
2205 */
2206 LLVMValueRef
2207 lp_build_ifloor(struct lp_build_context *bld,
2208 LLVMValueRef a)
2209 {
2210 LLVMBuilderRef builder = bld->gallivm->builder;
2211 const struct lp_type type = bld->type;
2212 LLVMTypeRef int_vec_type = bld->int_vec_type;
2213 LLVMValueRef res;
2214
2215 assert(type.floating);
2216 assert(lp_check_value(type, a));
2217
2218 res = a;
2219 if (type.sign) {
2220 if (arch_rounding_available(type)) {
2221 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2222 }
2223 else {
2224 struct lp_type inttype;
2225 struct lp_build_context intbld;
2226 LLVMValueRef trunc, itrunc, mask;
2227
2228 assert(type.floating);
2229 assert(lp_check_value(type, a));
2230
2231 inttype = type;
2232 inttype.floating = 0;
2233 lp_build_context_init(&intbld, bld->gallivm, inttype);
2234
2235 /* round by truncation */
2236 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2237 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2238
2239 /*
2240 * fix values if rounding is wrong (for non-special cases)
2241 * - this is the case if trunc > a
2242 * The results of doing this with NaNs, very large values etc.
2243 * are undefined but this seems to be the case anyway.
2244 */
2245 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2246 /* cheapie minus one with mask since the mask is minus one / zero */
2247 return lp_build_add(&intbld, itrunc, mask);
2248 }
2249 }
2250
2251 /* round to nearest (toward zero) */
2252 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2253
2254 return res;
2255 }
2256
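/*
 * Worked example (editorial addition) for the non-arch-rounding path above:
 * for a = -1.1, itrunc = -1 and trunc = -1.0 > a, so the comparison mask is
 * all ones, i.e. the integer value -1; adding it gives -1 + (-1) = -2, the
 * expected ifloor(-1.1). For a = 1.1, trunc = 1.0 <= a, the mask is 0 and
 * the truncated value is returned unchanged.
 */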
2257
2258 /**
2259 * Return ceiling of float (vector), returning int (vector).
2260 * Ex: iceil( 1.1) = 2
2261 * Ex: iceil(-1.1) = -1
2262 */
2263 LLVMValueRef
2264 lp_build_iceil(struct lp_build_context *bld,
2265 LLVMValueRef a)
2266 {
2267 LLVMBuilderRef builder = bld->gallivm->builder;
2268 const struct lp_type type = bld->type;
2269 LLVMTypeRef int_vec_type = bld->int_vec_type;
2270 LLVMValueRef res;
2271
2272 assert(type.floating);
2273 assert(lp_check_value(type, a));
2274
2275 if (arch_rounding_available(type)) {
2276 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2277 }
2278 else {
2279 struct lp_type inttype;
2280 struct lp_build_context intbld;
2281 LLVMValueRef trunc, itrunc, mask;
2282
2283 assert(type.floating);
2284 assert(lp_check_value(type, a));
2285
2286 inttype = type;
2287 inttype.floating = 0;
2288 lp_build_context_init(&intbld, bld->gallivm, inttype);
2289
2290 /* round by truncation */
2291 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2292 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2293
2294 /*
2295 * fix values if rounding is wrong (for non-special cases)
2296 * - this is the case if trunc < a
2297 * The results of doing this with NaNs, very large values etc.
2298 * are undefined but this seems to be the case anyway.
2299 */
2300 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2301 /* cheapie plus one with mask since the mask is minus one / zero */
2302 return lp_build_sub(&intbld, itrunc, mask);
2303 }
2304
2305 /* round to nearest (toward zero) */
2306 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2307
2308 return res;
2309 }
2310
2311
2312 /**
2313 * Combined ifloor() & fract().
2314 *
2315 * Preferred to calling the functions separately, as it will ensure that the
2316 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2317 */
2318 void
2319 lp_build_ifloor_fract(struct lp_build_context *bld,
2320 LLVMValueRef a,
2321 LLVMValueRef *out_ipart,
2322 LLVMValueRef *out_fpart)
2323 {
2324 LLVMBuilderRef builder = bld->gallivm->builder;
2325 const struct lp_type type = bld->type;
2326 LLVMValueRef ipart;
2327
2328 assert(type.floating);
2329 assert(lp_check_value(type, a));
2330
2331 if (arch_rounding_available(type)) {
2332 /*
2333 * floor() is easier.
2334 */
2335
2336 ipart = lp_build_floor(bld, a);
2337 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2338 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2339 }
2340 else {
2341 /*
2342 * ifloor() is easier.
2343 */
2344
2345 *out_ipart = lp_build_ifloor(bld, a);
2346 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2347 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2348 }
2349 }
2350
2351
2352 /**
2353 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2354 * always smaller than one.
2355 */
2356 void
2357 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2358 LLVMValueRef a,
2359 LLVMValueRef *out_ipart,
2360 LLVMValueRef *out_fpart)
2361 {
2362 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2363 *out_fpart = clamp_fract(bld, *out_fpart);
2364 }
2365
2366
2367 LLVMValueRef
2368 lp_build_sqrt(struct lp_build_context *bld,
2369 LLVMValueRef a)
2370 {
2371 LLVMBuilderRef builder = bld->gallivm->builder;
2372 const struct lp_type type = bld->type;
2373 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2374 char intrinsic[32];
2375
2376 assert(lp_check_value(type, a));
2377
2378 /* TODO: optimize the constant case */
2379
2380 assert(type.floating);
2381 if (type.length == 1) {
2382 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2383 }
2384 else {
2385 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2386 }
2387
2388 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2389 }
2390
2391
2392 /**
2393 * Do one Newton-Raphson step to improve reciprocal precision:
2394 *
2395 * x_{i+1} = x_i * (2 - a * x_i)
2396 *
2397 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2398 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2399 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2400 * halo. It would be necessary to clamp the argument to prevent this.
2401 *
2402 * See also:
2403 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2404 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2405 */
2406 static INLINE LLVMValueRef
2407 lp_build_rcp_refine(struct lp_build_context *bld,
2408 LLVMValueRef a,
2409 LLVMValueRef rcp_a)
2410 {
2411 LLVMBuilderRef builder = bld->gallivm->builder;
2412 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2413 LLVMValueRef res;
2414
2415 res = LLVMBuildFMul(builder, a, rcp_a, "");
2416 res = LLVMBuildFSub(builder, two, res, "");
2417 res = LLVMBuildFMul(builder, rcp_a, res, "");
2418
2419 return res;
2420 }
2421
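/*
 * Worked example (editorial addition): refining an approximate reciprocal of
 * a = 3.0. Starting from x_0 = 0.3 (roughly what RCPPS might return),
 * x_1 = 0.3 * (2 - 3.0 * 0.3) = 0.33, and one more step gives
 * x_2 = 0.33 * (2 - 3.0 * 0.33) = 0.3333, converging quadratically toward 1/3.
 */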
2422
2423 LLVMValueRef
2424 lp_build_rcp(struct lp_build_context *bld,
2425 LLVMValueRef a)
2426 {
2427 LLVMBuilderRef builder = bld->gallivm->builder;
2428 const struct lp_type type = bld->type;
2429
2430 assert(lp_check_value(type, a));
2431
2432 if(a == bld->zero)
2433 return bld->undef;
2434 if(a == bld->one)
2435 return bld->one;
2436 if(a == bld->undef)
2437 return bld->undef;
2438
2439 assert(type.floating);
2440
2441 if(LLVMIsConstant(a))
2442 return LLVMConstFDiv(bld->one, a);
2443
2444 /*
2445 * We don't use RCPPS because:
2446 * - it only has 10 bits of precision
2447 * - it doesn't even get the reciprocal of 1.0 exactly
2448 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2449 * - for recent processors the benefit over DIVPS is marginal, and case
2450 * dependent
2451 *
2452 * We could still use it on certain processors if benchmarks show that the
2453 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2454 * particular uses that require fewer workarounds.
2455 */
2456
2457 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2458 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2459 const unsigned num_iterations = 0;
2460 LLVMValueRef res;
2461 unsigned i;
2462 const char *intrinsic = NULL;
2463
2464 if (type.length == 4) {
2465 intrinsic = "llvm.x86.sse.rcp.ps";
2466 }
2467 else {
2468 intrinsic = "llvm.x86.avx.rcp.ps.256";
2469 }
2470
2471 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2472
2473 for (i = 0; i < num_iterations; ++i) {
2474 res = lp_build_rcp_refine(bld, a, res);
2475 }
2476
2477 return res;
2478 }
2479
2480 return LLVMBuildFDiv(builder, bld->one, a, "");
2481 }
2482
2483
2484 /**
2485 * Do one Newton-Raphson step to improve rsqrt precision:
2486 *
2487 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2488 *
2489 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2490 */
2491 static INLINE LLVMValueRef
2492 lp_build_rsqrt_refine(struct lp_build_context *bld,
2493 LLVMValueRef a,
2494 LLVMValueRef rsqrt_a)
2495 {
2496 LLVMBuilderRef builder = bld->gallivm->builder;
2497 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2498 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2499 LLVMValueRef res;
2500
2501 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2502 res = LLVMBuildFMul(builder, a, res, "");
2503 res = LLVMBuildFSub(builder, three, res, "");
2504 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2505 res = LLVMBuildFMul(builder, half, res, "");
2506
2507 return res;
2508 }
2509
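/*
 * Worked example (editorial addition): refining an approximate rsqrt of
 * a = 4.0. Starting from x_0 = 0.51,
 * x_1 = 0.5 * 0.51 * (3.0 - 4.0 * 0.51 * 0.51) = 0.4997 (approximately),
 * already very close to the exact value 0.5.
 */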
2510
2511 /**
2512 * Generate 1/sqrt(a).
2513 * Result is undefined for values < 0, infinity for +0.
2514 */
2515 LLVMValueRef
2516 lp_build_rsqrt(struct lp_build_context *bld,
2517 LLVMValueRef a)
2518 {
2519 LLVMBuilderRef builder = bld->gallivm->builder;
2520 const struct lp_type type = bld->type;
2521
2522 assert(lp_check_value(type, a));
2523
2524 assert(type.floating);
2525
2526 /*
2527 * This should be faster but all denormals will end up as infinity.
2528 */
2529 if (0 && lp_build_fast_rsqrt_available(type)) {
2530 const unsigned num_iterations = 1;
2531 LLVMValueRef res;
2532 unsigned i;
2533
2534 /* rsqrt(1.0) != 1.0 here */
2535 res = lp_build_fast_rsqrt(bld, a);
2536
2537 if (num_iterations) {
2538 /*
2539 * Newton-Raphson will result in NaN instead of infinity for zero,
2540 * and NaN instead of zero for infinity.
2541 * Also, need to ensure rsqrt(1.0) == 1.0.
2542 * All numbers smaller than FLT_MIN will result in +infinity
2543 * (rsqrtps treats all denormals as zero).
2544 */
2545 /*
2546 * Certain non-C99 compilers don't know INFINITY and might not support
2547 * hacks to evaluate it at compile time either.
2548 */
2549 const unsigned posinf_int = 0x7F800000;
2550 LLVMValueRef cmp;
2551 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2552 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2553
2554 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2555
2556 for (i = 0; i < num_iterations; ++i) {
2557 res = lp_build_rsqrt_refine(bld, a, res);
2558 }
2559 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2560 res = lp_build_select(bld, cmp, inf, res);
2561 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2562 res = lp_build_select(bld, cmp, bld->zero, res);
2563 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2564 res = lp_build_select(bld, cmp, bld->one, res);
2565 }
2566
2567 return res;
2568 }
2569
2570 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2571 }
2572
2573 /**
2574 * Report whether a fast (but inaccurate) rsqrt instruction is available.
2575 * (The caller may want to avoid calling rsqrt_fast if it's not available;
2576 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if rsqrt is
2577 * unavailable that would turn into sqrt/div/mul, so it's obviously much
2578 * better to just call sqrt directly, skipping both div and mul.)
2579 */
2580 boolean
2581 lp_build_fast_rsqrt_available(struct lp_type type)
2582 {
2583 assert(type.floating);
2584
2585 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2586 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2587 return true;
2588 }
2589 return false;
2590 }
2591
2592
2593 /**
2594 * Generate 1/sqrt(a).
2595 * Result is undefined for values < 0, infinity for +0.
2596 * Precision is limited, only ~10 bits guaranteed
2597 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2598 */
2599 LLVMValueRef
2600 lp_build_fast_rsqrt(struct lp_build_context *bld,
2601 LLVMValueRef a)
2602 {
2603 LLVMBuilderRef builder = bld->gallivm->builder;
2604 const struct lp_type type = bld->type;
2605
2606 assert(lp_check_value(type, a));
2607
2608 if (lp_build_fast_rsqrt_available(type)) {
2609 const char *intrinsic = NULL;
2610
2611 if (type.length == 4) {
2612 intrinsic = "llvm.x86.sse.rsqrt.ps";
2613 }
2614 else {
2615 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2616 }
2617 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2618 }
2619 else {
2620 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2621 }
2622 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2623 }
2624
2625
2626 /**
2627 * Generate sin(a) or cos(a) using polynomial approximation.
2628 * TODO: it might be worth recognizing sin and cos using the same source
2629 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2630 * would be way cheaper than calculating (nearly) everything twice...
2631 * Not sure it's common enough to be worth bothering with, however; the
2632 * scs opcode could also benefit from calculating both, though.
2633 */
2634 static LLVMValueRef
2635 lp_build_sin_or_cos(struct lp_build_context *bld,
2636 LLVMValueRef a,
2637 boolean cos)
2638 {
2639 struct gallivm_state *gallivm = bld->gallivm;
2640 LLVMBuilderRef b = gallivm->builder;
2641 struct lp_type int_type = lp_int_type(bld->type);
2642
2643 /*
2644 * take the absolute value,
2645 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2646 */
2647
2648 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2649 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2650
2651 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2652 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2653
2654 /*
2655 * scale by 4/Pi
2656 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2657 */
2658
2659 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2660 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2661
2662 /*
2663 * store the integer part of y in mm0
2664 * emm2 = _mm_cvttps_epi32(y);
2665 */
2666
2667 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2668
2669 /*
2670 * j=(j+1) & (~1) (see the cephes sources)
2671 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2672 */
2673
2674 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2675 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2676 /*
2677 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2678 */
2679 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2680 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2681
2682 /*
2683 * y = _mm_cvtepi32_ps(emm2);
2684 */
2685 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2686
2687 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2688 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2689 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2690 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2691
2692 /*
2693 * Argument used for poly selection and sign bit determination
2694 * is different for sin vs. cos.
2695 */
2696 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2697 emm2_and;
2698
2699 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2700 LLVMBuildNot(b, emm2_2, ""), ""),
2701 const_29, "sign_bit") :
2702 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2703 LLVMBuildShl(b, emm2_add,
2704 const_29, ""), ""),
2705 sign_mask, "sign_bit");
2706
2707 /*
2708 * get the polynomial selection mask
2709 * there is one polynomial for 0 <= x <= Pi/4
2710 * and another one for Pi/4 < x <= Pi/2
2711 * Both branches will be computed.
2712 *
2713 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2714 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2715 */
2716
2717 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2718 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2719 int_type, PIPE_FUNC_EQUAL,
2720 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2721
2722 /*
2723 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2724 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2725 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2726 */
2727 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2728 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2729 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2730
2731 /*
2732 * The magic pass: "Extended precision modular arithmetic"
2733 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2734 * xmm1 = _mm_mul_ps(y, xmm1);
2735 * xmm2 = _mm_mul_ps(y, xmm2);
2736 * xmm3 = _mm_mul_ps(y, xmm3);
2737 */
2738 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2739 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2740 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2741
2742 /*
2743 * x = _mm_add_ps(x, xmm1);
2744 * x = _mm_add_ps(x, xmm2);
2745 * x = _mm_add_ps(x, xmm3);
2746 */
2747
2748 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2749 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2750 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2751
2752 /*
2753 * Evaluate the first polynomial (0 <= x <= Pi/4)
2754 *
2755 * z = _mm_mul_ps(x,x);
2756 */
2757 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2758
2759 /*
2760 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2761 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2762 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2763 */
2764 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2765 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2766 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2767
2768 /*
2769 * y = *(v4sf*)_ps_coscof_p0;
2770 * y = _mm_mul_ps(y, z);
2771 */
2772 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2773 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2774 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2775 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2776 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2777 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2778
2779
2780 /*
2781 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2782 * y = _mm_sub_ps(y, tmp);
2783 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2784 */
2785 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2786 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2787 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2788 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2789 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2790
2791 /*
2792 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2793 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2794 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2795 */
2796 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2797 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2798 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2799
2800 /*
2801 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2802 *
2803 * y2 = *(v4sf*)_ps_sincof_p0;
2804 * y2 = _mm_mul_ps(y2, z);
2805 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2806 * y2 = _mm_mul_ps(y2, z);
2807 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2808 * y2 = _mm_mul_ps(y2, z);
2809 * y2 = _mm_mul_ps(y2, x);
2810 * y2 = _mm_add_ps(y2, x);
2811 */
2812
2813 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2814 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2815 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2816 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2817 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2818 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2819 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2820
2821 /*
2822 * select the correct result from the two polynomials
2823 * xmm3 = poly_mask;
2824 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2825 * y = _mm_andnot_ps(xmm3, y);
2826 * y = _mm_or_ps(y,y2);
2827 */
2828 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2829 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2830 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2831 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2832 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2833 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2834
2835 /*
2836 * update the sign
2837 * y = _mm_xor_ps(y, sign_bit);
2838 */
2839 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2840 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2841
2842 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2843
2844 /* clamp output to be within [-1, 1] */
2845 y_result = lp_build_clamp(bld, y_result,
2846 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2847 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2848 /* If a is -inf, inf or NaN then return NaN */
2849 y_result = lp_build_select(bld, isfinite, y_result,
2850 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2851 return y_result;
2852 }
2853
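/*
 * Editorial summary of the routine above: following the cephes/sse_mathfun
 * approach, |a| is multiplied by 4/Pi and reduced to an even integer j, so
 * the remainder x - j*Pi/4 lies in roughly [-Pi/4, Pi/4]; the quadrant bits
 * of j select between the sine-like and cosine-like polynomials and
 * determine the sign of the result, and non-finite inputs are mapped to NaN
 * at the end.
 */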
2854
2855 /**
2856 * Generate sin(a)
2857 */
2858 LLVMValueRef
2859 lp_build_sin(struct lp_build_context *bld,
2860 LLVMValueRef a)
2861 {
2862 return lp_build_sin_or_cos(bld, a, FALSE);
2863 }
2864
2865
2866 /**
2867 * Generate cos(a)
2868 */
2869 LLVMValueRef
2870 lp_build_cos(struct lp_build_context *bld,
2871 LLVMValueRef a)
2872 {
2873 return lp_build_sin_or_cos(bld, a, TRUE);
2874 }
2875
2876
2877 /**
2878 * Generate pow(x, y)
2879 */
2880 LLVMValueRef
2881 lp_build_pow(struct lp_build_context *bld,
2882 LLVMValueRef x,
2883 LLVMValueRef y)
2884 {
2885 /* TODO: optimize the constant case */
2886 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2887 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2888 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2889 __FUNCTION__);
2890 }
2891
2892 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2893 }
2894
2895
2896 /**
2897 * Generate exp(x)
2898 */
2899 LLVMValueRef
2900 lp_build_exp(struct lp_build_context *bld,
2901 LLVMValueRef x)
2902 {
2903 /* log2(e) = 1/log(2) */
2904 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2905 1.4426950408889634);
2906
2907 assert(lp_check_value(bld->type, x));
2908
2909 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2910 }
2911
2912
2913 /**
2914 * Generate log(x)
2915 * Behavior is undefined with infs, 0s and nans
2916 */
2917 LLVMValueRef
2918 lp_build_log(struct lp_build_context *bld,
2919 LLVMValueRef x)
2920 {
2921 /* log(2) */
2922 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2923 0.69314718055994529);
2924
2925 assert(lp_check_value(bld->type, x));
2926
2927 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2928 }
2929
2930 /**
2931 * Generate log(x) that handles edge cases (infs, 0s and nans)
2932 */
2933 LLVMValueRef
2934 lp_build_log_safe(struct lp_build_context *bld,
2935 LLVMValueRef x)
2936 {
2937 /* log(2) */
2938 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2939 0.69314718055994529);
2940
2941 assert(lp_check_value(bld->type, x));
2942
2943 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2944 }
2945
2946
2947 /**
2948 * Generate polynomial.
2949 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2950 */
2951 LLVMValueRef
2952 lp_build_polynomial(struct lp_build_context *bld,
2953 LLVMValueRef x,
2954 const double *coeffs,
2955 unsigned num_coeffs)
2956 {
2957 const struct lp_type type = bld->type;
2958 LLVMValueRef even = NULL, odd = NULL;
2959 LLVMValueRef x2;
2960 unsigned i;
2961
2962 assert(lp_check_value(bld->type, x));
2963
2964 /* TODO: optimize the constant case */
2965 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2966 LLVMIsConstant(x)) {
2967 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2968 __FUNCTION__);
2969 }
2970
2971 /*
2972 * Calculate odd and even terms separately to decrease data dependency
2973 * Ex:
2974 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2975 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2976 */
2977 x2 = lp_build_mul(bld, x, x);
2978
2979 for (i = num_coeffs; i--; ) {
2980 LLVMValueRef coeff;
2981
2982 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2983
2984 if (i % 2 == 0) {
2985 if (even)
2986 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2987 else
2988 even = coeff;
2989 } else {
2990 if (odd)
2991 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2992 else
2993 odd = coeff;
2994 }
2995 }
2996
2997 if (odd)
2998 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2999 else if (even)
3000 return even;
3001 else
3002 return bld->undef;
3003 }
3004
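/*
 * Illustrative scalar sketch (editorial addition) of the even/odd split used
 * above, for a degree-4 polynomial c0 + c1*x + c2*x^2 + c3*x^3 + c4*x^4:
 *
 *    float x2   = x * x;
 *    float even = c0 + x2 * (c2 + x2 * c4);
 *    float odd  = c1 + x2 * c3;
 *    float res  = even + x * odd;
 *
 * The even and odd Horner chains have no data dependency on each other, so
 * they can be evaluated in parallel before the final combine.
 */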
3005
3006 /**
3007 * Minimax polynomial fit of 2**x, in range [0, 1[
3008 */
3009 const double lp_build_exp2_polynomial[] = {
3010 #if EXP_POLY_DEGREE == 5
3011 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3012 0.693153073200168932794,
3013 0.240153617044375388211,
3014 0.0558263180532956664775,
3015 0.00898934009049466391101,
3016 0.00187757667519147912699
3017 #elif EXP_POLY_DEGREE == 4
3018 1.00000259337069434683,
3019 0.693003834469974940458,
3020 0.24144275689150793076,
3021 0.0520114606103070150235,
3022 0.0135341679161270268764
3023 #elif EXP_POLY_DEGREE == 3
3024 0.999925218562710312959,
3025 0.695833540494823811697,
3026 0.226067155427249155588,
3027 0.0780245226406372992967
3028 #elif EXP_POLY_DEGREE == 2
3029 1.00172476321474503578,
3030 0.657636275736077639316,
3031 0.33718943461968720704
3032 #else
3033 #error
3034 #endif
3035 };
3036
3037
3038 LLVMValueRef
3039 lp_build_exp2(struct lp_build_context *bld,
3040 LLVMValueRef x)
3041 {
3042 LLVMBuilderRef builder = bld->gallivm->builder;
3043 const struct lp_type type = bld->type;
3044 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3045 LLVMValueRef ipart = NULL;
3046 LLVMValueRef fpart = NULL;
3047 LLVMValueRef expipart = NULL;
3048 LLVMValueRef expfpart = NULL;
3049 LLVMValueRef res = NULL;
3050
3051 assert(lp_check_value(bld->type, x));
3052
3053 /* TODO: optimize the constant case */
3054 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3055 LLVMIsConstant(x)) {
3056 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3057 __FUNCTION__);
3058 }
3059
3060 assert(type.floating && type.width == 32);
3061
3062 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3063 * the result is INF and if it's smaller than -126.9 the result is 0 */
3064 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3065 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3066 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3067 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3068
3069 /* ipart = floor(x) */
3070 /* fpart = x - ipart */
3071 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3072
3073 /* expipart = (float) (1 << ipart) */
3074 expipart = LLVMBuildAdd(builder, ipart,
3075 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3076 expipart = LLVMBuildShl(builder, expipart,
3077 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3078 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3079
3080 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3081 Elements(lp_build_exp2_polynomial));
3082
3083 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3084
3085 return res;
3086 }
3087
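/*
 * Worked example (editorial addition) for the 2^ipart construction above:
 * for ipart = 3, (3 + 127) << 23 = 0x41000000, whose bit pattern as a float
 * is exactly 8.0 = 2^3. The polynomial then supplies 2^fpart for fpart in
 * [0, 1[, and the product of the two gives 2^x.
 */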
3088
3089
3090 /**
3091 * Extract the exponent of an IEEE-754 floating point value.
3092 *
3093 * Optionally apply an integer bias.
3094 *
3095 * Result is an integer value with
3096 *
3097 * ifloor(log2(x)) + bias
3098 */
3099 LLVMValueRef
3100 lp_build_extract_exponent(struct lp_build_context *bld,
3101 LLVMValueRef x,
3102 int bias)
3103 {
3104 LLVMBuilderRef builder = bld->gallivm->builder;
3105 const struct lp_type type = bld->type;
3106 unsigned mantissa = lp_mantissa(type);
3107 LLVMValueRef res;
3108
3109 assert(type.floating);
3110
3111 assert(lp_check_value(bld->type, x));
3112
3113 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3114
3115 res = LLVMBuildLShr(builder, x,
3116 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3117 res = LLVMBuildAnd(builder, res,
3118 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3119 res = LLVMBuildSub(builder, res,
3120 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3121
3122 return res;
3123 }
3124
3125
3126 /**
3127 * Extract the mantissa of an IEEE-754 floating point value.
3128 *
3129 * Result is a floating point value with
3130 *
3131 * x / 2**floor(log2(x))
3132 */
3133 LLVMValueRef
3134 lp_build_extract_mantissa(struct lp_build_context *bld,
3135 LLVMValueRef x)
3136 {
3137 LLVMBuilderRef builder = bld->gallivm->builder;
3138 const struct lp_type type = bld->type;
3139 unsigned mantissa = lp_mantissa(type);
3140 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3141 (1ULL << mantissa) - 1);
3142 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3143 LLVMValueRef res;
3144
3145 assert(lp_check_value(bld->type, x));
3146
3147 assert(type.floating);
3148
3149 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3150
3151 /* res = x / 2**ipart */
3152 res = LLVMBuildAnd(builder, x, mantmask, "");
3153 res = LLVMBuildOr(builder, res, one, "");
3154 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3155
3156 return res;
3157 }
3158
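/*
 * Worked example (editorial addition): for x = 12.0 (0x41400000) the stored
 * exponent field is 130, so lp_build_extract_exponent(x, 0) returns 3, and
 * lp_build_extract_mantissa replaces the exponent bits with those of 1.0,
 * yielding 1.5 = 12.0 / 2^3.
 */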
3159
3160
3161 /**
3162 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3163 * These coefficients can be generated with
3164 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3165 */
3166 const double lp_build_log2_polynomial[] = {
3167 #if LOG_POLY_DEGREE == 5
3168 2.88539008148777786488L,
3169 0.961796878841293367824L,
3170 0.577058946784739859012L,
3171 0.412914355135828735411L,
3172 0.308591899232910175289L,
3173 0.352376952300281371868L,
3174 #elif LOG_POLY_DEGREE == 4
3175 2.88539009343309178325L,
3176 0.961791550404184197881L,
3177 0.577440339438736392009L,
3178 0.403343858251329912514L,
3179 0.406718052498846252698L,
3180 #elif LOG_POLY_DEGREE == 3
3181 2.88538959748872753838L,
3182 0.961932915889597772928L,
3183 0.571118517972136195241L,
3184 0.493997535084709500285L,
3185 #else
3186 #error
3187 #endif
3188 };
3189
3190 /**
3191 * See http://www.devmaster.net/forums/showthread.php?p=43580
3192 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3193 * http://www.nezumi.demon.co.uk/consult/logx.htm
3194 *
3195 * If handle_edge_cases is true the function will perform computations
3196 * to match the required D3D10+ behavior for each of the edge cases.
3197 * That means that if input is:
3198 * - less than zero (to and including -inf) then NaN will be returned
3199 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3200 * - +infinity, then +infinity will be returned
3201 * - NaN, then NaN will be returned
3202 *
3203 * Those checks are fairly expensive so if you don't need them make sure
3204 * handle_edge_cases is false.
3205 */
3206 void
3207 lp_build_log2_approx(struct lp_build_context *bld,
3208 LLVMValueRef x,
3209 LLVMValueRef *p_exp,
3210 LLVMValueRef *p_floor_log2,
3211 LLVMValueRef *p_log2,
3212 boolean handle_edge_cases)
3213 {
3214 LLVMBuilderRef builder = bld->gallivm->builder;
3215 const struct lp_type type = bld->type;
3216 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3217 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3218
3219 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3220 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3221 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3222
3223 LLVMValueRef i = NULL;
3224 LLVMValueRef y = NULL;
3225 LLVMValueRef z = NULL;
3226 LLVMValueRef exp = NULL;
3227 LLVMValueRef mant = NULL;
3228 LLVMValueRef logexp = NULL;
3229 LLVMValueRef logmant = NULL;
3230 LLVMValueRef res = NULL;
3231
3232 assert(lp_check_value(bld->type, x));
3233
3234 if(p_exp || p_floor_log2 || p_log2) {
3235 /* TODO: optimize the constant case */
3236 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3237 LLVMIsConstant(x)) {
3238 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3239 __FUNCTION__);
3240 }
3241
3242 assert(type.floating && type.width == 32);
3243
3244 /*
3245 * We don't explicitly handle denormalized numbers. They will yield a
3246 * result in the neighbourhood of -127, which appears to be
3247 * adequate.
3248 */
3249
3250 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3251
3252 /* exp = (float) exponent(x) */
3253 exp = LLVMBuildAnd(builder, i, expmask, "");
3254 }
3255
3256 if(p_floor_log2 || p_log2) {
3257 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3258 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3259 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3260 }
3261
3262 if(p_log2) {
3263 /* mant = 1 + (float) mantissa(x) */
3264 mant = LLVMBuildAnd(builder, i, mantmask, "");
3265 mant = LLVMBuildOr(builder, mant, one, "");
3266 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3267
3268 /* y = (mant - 1) / (mant + 1) */
3269 y = lp_build_div(bld,
3270 lp_build_sub(bld, mant, bld->one),
3271 lp_build_add(bld, mant, bld->one)
3272 );
3273
3274 /* z = y^2 */
3275 z = lp_build_mul(bld, y, y);
3276
3277 /* compute P(z) */
3278 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3279 Elements(lp_build_log2_polynomial));
3280
3281 /* logmant = y * P(z) */
3282 logmant = lp_build_mul(bld, y, logmant);
3283
3284 res = lp_build_add(bld, logmant, logexp);
3285
3286 if (type.floating && handle_edge_cases) {
3287 LLVMValueRef negmask, infmask, zmask;
3288 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3289 lp_build_const_vec(bld->gallivm, type, 0.0f));
3290 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3291 lp_build_const_vec(bld->gallivm, type, 0.0f));
3292 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3293 lp_build_const_vec(bld->gallivm, type, INFINITY));
3294
3295 /* If x is equal to inf make sure we return inf */
3296 res = lp_build_select(bld, infmask,
3297 lp_build_const_vec(bld->gallivm, type, INFINITY),
3298 res);
3299 /* If x is equal to 0, return -inf */
3300 res = lp_build_select(bld, zmask,
3301 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3302 res);
3303 /* If x is nan or less than 0, return nan */
3304 res = lp_build_select(bld, negmask,
3305 lp_build_const_vec(bld->gallivm, type, NAN),
3306 res);
3307 }
3308 }
3309
3310 if(p_exp) {
3311 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3312 *p_exp = exp;
3313 }
3314
3315 if(p_floor_log2)
3316 *p_floor_log2 = logexp;
3317
3318 if(p_log2)
3319 *p_log2 = res;
3320 }
3321
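/*
 * Editorial note on the mantissa polynomial above: it relies on the identity
 * log2(m) = 2/ln(2) * atanh((m - 1)/(m + 1)); with y = (m - 1)/(m + 1) the
 * atanh series only has odd powers of y, which is why y * P(y^2) is a good
 * fit for log2 of the mantissa over [1, 2) (note the leading coefficient
 * 2.8853900... is 2/ln(2)).
 */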
3322
3323 /*
3324 * log2 implementation which doesn't have special code to
3325 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3326 * the results for those cases are undefined.
3327 */
3328 LLVMValueRef
3329 lp_build_log2(struct lp_build_context *bld,
3330 LLVMValueRef x)
3331 {
3332 LLVMValueRef res;
3333 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3334 return res;
3335 }
3336
3337 /*
3338 * Version of log2 which handles all edge cases.
3339 * Look at documentation of lp_build_log2_approx for
3340 * description of the behavior for each of the edge cases.
3341 */
3342 LLVMValueRef
3343 lp_build_log2_safe(struct lp_build_context *bld,
3344 LLVMValueRef x)
3345 {
3346 LLVMValueRef res;
3347 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3348 return res;
3349 }
3350
3351
3352 /**
3353 * Faster (and less accurate) log2.
3354 *
3355 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3356 *
3357 * Piece-wise linear approximation, with exact results when x is a
3358 * power of two.
3359 *
3360 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3361 */
3362 LLVMValueRef
3363 lp_build_fast_log2(struct lp_build_context *bld,
3364 LLVMValueRef x)
3365 {
3366 LLVMBuilderRef builder = bld->gallivm->builder;
3367 LLVMValueRef ipart;
3368 LLVMValueRef fpart;
3369
3370 assert(lp_check_value(bld->type, x));
3371
3372 assert(bld->type.floating);
3373
3374 /* ipart = floor(log2(x)) - 1 */
3375 ipart = lp_build_extract_exponent(bld, x, -1);
3376 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3377
3378 /* fpart = x / 2**ipart */
3379 fpart = lp_build_extract_mantissa(bld, x);
3380
3381 /* ipart + fpart */
3382 return LLVMBuildFAdd(builder, ipart, fpart, "");
3383 }
3384
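/*
 * Worked example (editorial addition): for x = 8.0, ipart = 3 - 1 = 2 and
 * fpart = 8.0 / 2^3 = 1.0, so the result is exactly 3.0. For x = 6.0,
 * ipart = 1 and fpart = 1.5, giving 2.5 versus the exact log2(6) = 2.585,
 * showing the piece-wise linear nature of the approximation.
 */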
3385
3386 /**
3387 * Fast implementation of iround(log2(x)).
3388 *
3389 * Not an approximation -- it should give accurate results all the time.
3390 */
3391 LLVMValueRef
3392 lp_build_ilog2(struct lp_build_context *bld,
3393 LLVMValueRef x)
3394 {
3395 LLVMBuilderRef builder = bld->gallivm->builder;
3396 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3397 LLVMValueRef ipart;
3398
3399 assert(bld->type.floating);
3400
3401 assert(lp_check_value(bld->type, x));
3402
3403 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3404 x = LLVMBuildFMul(builder, x, sqrt2, "");
3405
3406 /* ipart = floor(log2(x) + 0.5) */
3407 ipart = lp_build_extract_exponent(bld, x, 0);
3408
3409 return ipart;
3410 }
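
/*
 * Worked example (editorial addition) for lp_build_ilog2 above: ilog2(5.0)
 * should round log2(5) = 2.32 to 2. Multiplying by sqrt(2) first gives 7.07,
 * whose exponent field is still 2; for 6.0 (log2 = 2.585) the product is
 * 8.49, whose exponent is 3, so extracting the exponent after the multiply
 * behaves like iround(log2(x)).
 */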
3411
3412 LLVMValueRef
3413 lp_build_mod(struct lp_build_context *bld,
3414 LLVMValueRef x,
3415 LLVMValueRef y)
3416 {
3417 LLVMBuilderRef builder = bld->gallivm->builder;
3418 LLVMValueRef res;
3419 const struct lp_type type = bld->type;
3420
3421 assert(lp_check_value(type, x));
3422 assert(lp_check_value(type, y));
3423
3424 if (type.floating)
3425 res = LLVMBuildFRem(builder, x, y, "");
3426 else if (type.sign)
3427 res = LLVMBuildSRem(builder, x, y, "");
3428 else
3429 res = LLVMBuildURem(builder, x, y, "");
3430 return res;
3431 }
3432
3433
3434 /*
3435 * For floating inputs it creates and returns a mask
3436 * which is all 1's for channels which are NaN.
3437 * Channels inside x which are not NaN will be 0.
3438 */
3439 LLVMValueRef
3440 lp_build_isnan(struct lp_build_context *bld,
3441 LLVMValueRef x)
3442 {
3443 LLVMValueRef mask;
3444 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3445
3446 assert(bld->type.floating);
3447 assert(lp_check_value(bld->type, x));
3448
3449 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3450 "isnotnan");
3451 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3452 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3453 return mask;
3454 }
3455
3456 /* Returns all 1's for floating point numbers that are
3457 * finite, and returns all zeros for -inf,
3458 * inf and NaNs. */
3459 LLVMValueRef
3460 lp_build_isfinite(struct lp_build_context *bld,
3461 LLVMValueRef x)
3462 {
3463 LLVMBuilderRef builder = bld->gallivm->builder;
3464 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3465 struct lp_type int_type = lp_int_type(bld->type);
3466 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3467 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3468 0x7f800000);
3469
3470 if (!bld->type.floating) {
3471 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3472 }
3473 assert(bld->type.floating);
3474 assert(lp_check_value(bld->type, x));
3475 assert(bld->type.width == 32);
3476
3477 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3478 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3479 intx, infornan32);
3480 }
3481
3482 /*
3483 * Returns true if the number is nan or inf and false otherwise.
3484 * The input has to be a floating point vector.
3485 */
3486 LLVMValueRef
3487 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3488 const struct lp_type type,
3489 LLVMValueRef x)
3490 {
3491 LLVMBuilderRef builder = gallivm->builder;
3492 struct lp_type int_type = lp_int_type(type);
3493 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3494 0x7f800000);
3495 LLVMValueRef ret;
3496
3497 assert(type.floating);
3498
3499 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3500 ret = LLVMBuildAnd(builder, ret, const0, "");
3501 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3502 ret, const0);
3503
3504 return ret;
3505 }
3506
3507
3508 LLVMValueRef
3509 lp_build_fpstate_get(struct gallivm_state *gallivm)
3510 {
3511 if (util_cpu_caps.has_sse) {
3512 LLVMBuilderRef builder = gallivm->builder;
3513 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3514 gallivm,
3515 LLVMInt32TypeInContext(gallivm->context),
3516 "mxcsr_ptr");
3517 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3518 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3519 lp_build_intrinsic(builder,
3520 "llvm.x86.sse.stmxcsr",
3521 LLVMVoidTypeInContext(gallivm->context),
3522 &mxcsr_ptr8, 1);
3523 return mxcsr_ptr;
3524 }
3525 return 0;
3526 }
3527
3528 void
3529 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3530 boolean zero)
3531 {
3532 if (util_cpu_caps.has_sse) {
3533 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3534 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3535
3536 LLVMBuilderRef builder = gallivm->builder;
3537 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3538 LLVMValueRef mxcsr =
3539 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3540
3541 if (util_cpu_caps.has_daz) {
3542 /* Enable the denormals-are-zero mode */
3543 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3544 }
3545 if (zero) {
3546 mxcsr = LLVMBuildOr(builder, mxcsr,
3547 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3548 } else {
3549 mxcsr = LLVMBuildAnd(builder, mxcsr,
3550 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3551 }
3552
3553 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3554 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3555 }
3556 }
3557
3558 void
3559 lp_build_fpstate_set(struct gallivm_state *gallivm,
3560 LLVMValueRef mxcsr_ptr)
3561 {
3562 if (util_cpu_caps.has_sse) {
3563 LLVMBuilderRef builder = gallivm->builder;
3564 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3565 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3566 lp_build_intrinsic(builder,
3567 "llvm.x86.sse.ldmxcsr",
3568 LLVMVoidTypeInContext(gallivm->context),
3569 &mxcsr_ptr, 1);
3570 }
3571 }
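
/*
 * Usage note (editorial addition): a typical pattern with the fpstate
 * helpers above is to save the MXCSR state, force denormals to zero around
 * a block of generated FP code, and restore it afterwards, e.g.:
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit FP code ...
 *    lp_build_fpstate_set(gallivm, saved);
 */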