gallivm: fix pointer type for stmxcsr/ldmxcsr
[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86 * No checks for special case values of a or b = 1 or 0 are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
146 intr_size = 128;
147 if ((type.width == 8 || type.width == 16) &&
148 (type.width * type.length <= 64) &&
149 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
150 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
151 __FUNCTION__);
152 }
153 if (type.width == 8 && !type.sign) {
154 intrinsic = "llvm.x86.sse2.pminu.b";
155 }
156 else if (type.width == 16 && type.sign) {
157 intrinsic = "llvm.x86.sse2.pmins.w";
158 }
159 if (util_cpu_caps.has_sse4_1) {
160 if (type.width == 8 && type.sign) {
161 intrinsic = "llvm.x86.sse41.pminsb";
162 }
163 if (type.width == 16 && !type.sign) {
164 intrinsic = "llvm.x86.sse41.pminuw";
165 }
166 if (type.width == 32 && !type.sign) {
167 intrinsic = "llvm.x86.sse41.pminud";
168 }
169 if (type.width == 32 && type.sign) {
170 intrinsic = "llvm.x86.sse41.pminsd";
171 }
172 }
173 } else if (util_cpu_caps.has_altivec) {
174 intr_size = 128;
175 if (type.width == 8) {
176 if (!type.sign) {
177 intrinsic = "llvm.ppc.altivec.vminub";
178 } else {
179 intrinsic = "llvm.ppc.altivec.vminsb";
180 }
181 } else if (type.width == 16) {
182 if (!type.sign) {
183 intrinsic = "llvm.ppc.altivec.vminuh";
184 } else {
185 intrinsic = "llvm.ppc.altivec.vminsh";
186 }
187 } else if (type.width == 32) {
188 if (!type.sign) {
189 intrinsic = "llvm.ppc.altivec.vminuw";
190 } else {
191 intrinsic = "llvm.ppc.altivec.vminsw";
192 }
193 }
194 }
195
196 if(intrinsic) {
 197       /* We need to handle NaNs for floating point numbers. If one of the
 198        * inputs is NaN the other should be returned (required by both D3D10+
 199        * and OpenCL).
 200        * The SSE intrinsics return the second operand in case of NaN by
 201        * default, so we need special code to handle those cases.
 202        */
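      /* Concretely (assuming x is not NaN): minps(NaN, x) == x, but
       * minps(x, NaN) == NaN.  Hence for GALLIVM_NAN_RETURN_OTHER we must
       * select a whenever b is NaN, and for GALLIVM_NAN_RETURN_NAN we must
       * select a whenever a is NaN. */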
203 if (util_cpu_caps.has_sse && type.floating &&
204 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
205 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
206 LLVMValueRef isnan, max;
207 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
208 type,
209 intr_size, a, b);
210 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
211 isnan = lp_build_isnan(bld, b);
212 return lp_build_select(bld, isnan, a, max);
213 } else {
214 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
215 isnan = lp_build_isnan(bld, a);
216 return lp_build_select(bld, isnan, a, max);
217 }
218 } else {
219 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
220 type,
221 intr_size, a, b);
222 }
223 }
224
225 if (type.floating) {
226 switch (nan_behavior) {
227 case GALLIVM_NAN_RETURN_NAN: {
228 LLVMValueRef isnan = lp_build_isnan(bld, b);
229 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
230 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
231 return lp_build_select(bld, cond, a, b);
232 }
233 break;
234 case GALLIVM_NAN_RETURN_OTHER: {
235 LLVMValueRef isnan = lp_build_isnan(bld, a);
236 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
237 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
238 return lp_build_select(bld, cond, a, b);
239 }
240 break;
241 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
242 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
243 return lp_build_select(bld, cond, a, b);
244 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 return lp_build_select(bld, cond, a, b);
247 break;
248 default:
249 assert(0);
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 }
253 } else {
254 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
255 return lp_build_select(bld, cond, a, b);
256 }
257 }
258
259
260 /**
261 * Generate max(a, b)
262 * No checks for special case values of a or b = 1 or 0 are done.
263 * NaN's are handled according to the behavior specified by the
264 * nan_behavior argument.
265 */
266 static LLVMValueRef
267 lp_build_max_simple(struct lp_build_context *bld,
268 LLVMValueRef a,
269 LLVMValueRef b,
270 enum gallivm_nan_behavior nan_behavior)
271 {
272 const struct lp_type type = bld->type;
273 const char *intrinsic = NULL;
274 unsigned intr_size = 0;
275 LLVMValueRef cond;
276
277 assert(lp_check_value(type, a));
278 assert(lp_check_value(type, b));
279
280 /* TODO: optimize the constant case */
281
282 if (type.floating && util_cpu_caps.has_sse) {
283 if (type.width == 32) {
284 if (type.length == 1) {
285 intrinsic = "llvm.x86.sse.max.ss";
286 intr_size = 128;
287 }
288 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
289 intrinsic = "llvm.x86.sse.max.ps";
290 intr_size = 128;
291 }
292 else {
293 intrinsic = "llvm.x86.avx.max.ps.256";
294 intr_size = 256;
295 }
296 }
297 if (type.width == 64 && util_cpu_caps.has_sse2) {
298 if (type.length == 1) {
299 intrinsic = "llvm.x86.sse2.max.sd";
300 intr_size = 128;
301 }
302 else if (type.length == 2 || !util_cpu_caps.has_avx) {
303 intrinsic = "llvm.x86.sse2.max.pd";
304 intr_size = 128;
305 }
306 else {
307 intrinsic = "llvm.x86.avx.max.pd.256";
308 intr_size = 256;
309 }
310 }
311 }
312 else if (type.floating && util_cpu_caps.has_altivec) {
313 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
314 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
315 __FUNCTION__);
316 }
 317       if (type.width == 32 && type.length == 4) {
318 intrinsic = "llvm.ppc.altivec.vmaxfp";
319 intr_size = 128;
320 }
321 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
322 intr_size = 128;
323 if ((type.width == 8 || type.width == 16) &&
324 (type.width * type.length <= 64) &&
325 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
326 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
327 __FUNCTION__);
328 }
329 if (type.width == 8 && !type.sign) {
330 intrinsic = "llvm.x86.sse2.pmaxu.b";
331 intr_size = 128;
332 }
333 else if (type.width == 16 && type.sign) {
334 intrinsic = "llvm.x86.sse2.pmaxs.w";
335 }
336 if (util_cpu_caps.has_sse4_1) {
337 if (type.width == 8 && type.sign) {
338 intrinsic = "llvm.x86.sse41.pmaxsb";
339 }
340 if (type.width == 16 && !type.sign) {
341 intrinsic = "llvm.x86.sse41.pmaxuw";
342 }
343 if (type.width == 32 && !type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxud";
345 }
346 if (type.width == 32 && type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxsd";
348 }
349 }
350 } else if (util_cpu_caps.has_altivec) {
351 intr_size = 128;
352 if (type.width == 8) {
353 if (!type.sign) {
354 intrinsic = "llvm.ppc.altivec.vmaxub";
355 } else {
356 intrinsic = "llvm.ppc.altivec.vmaxsb";
357 }
358 } else if (type.width == 16) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxuh";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsh";
363 }
364 } else if (type.width == 32) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuw";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsw";
369 }
370 }
371 }
372
373 if(intrinsic) {
374 if (util_cpu_caps.has_sse && type.floating &&
375 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
376 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
377 LLVMValueRef isnan, min;
378 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
379 type,
380 intr_size, a, b);
381 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
382 isnan = lp_build_isnan(bld, b);
383 return lp_build_select(bld, isnan, a, min);
384 } else {
385 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
386 isnan = lp_build_isnan(bld, a);
387 return lp_build_select(bld, isnan, a, min);
388 }
389 } else {
390 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
391 type,
392 intr_size, a, b);
393 }
394 }
395
396 if (type.floating) {
397 switch (nan_behavior) {
398 case GALLIVM_NAN_RETURN_NAN: {
399 LLVMValueRef isnan = lp_build_isnan(bld, b);
400 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
401 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
402 return lp_build_select(bld, cond, a, b);
403 }
404 break;
405 case GALLIVM_NAN_RETURN_OTHER: {
406 LLVMValueRef isnan = lp_build_isnan(bld, a);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
413 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
414 return lp_build_select(bld, cond, a, b);
415 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
416 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
417 return lp_build_select(bld, cond, a, b);
418 break;
419 default:
420 assert(0);
421 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
422 return lp_build_select(bld, cond, a, b);
423 }
424 } else {
425 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
426 return lp_build_select(bld, cond, a, b);
427 }
428 }
429
430
431 /**
432 * Generate 1 - a, or ~a depending on bld->type.
433 */
434 LLVMValueRef
435 lp_build_comp(struct lp_build_context *bld,
436 LLVMValueRef a)
437 {
438 LLVMBuilderRef builder = bld->gallivm->builder;
439 const struct lp_type type = bld->type;
440
441 assert(lp_check_value(type, a));
442
443 if(a == bld->one)
444 return bld->zero;
445 if(a == bld->zero)
446 return bld->one;
447
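   /* For unsigned normalized types 1.0 is represented as all bits set, so
    * 1 - a is simply the bitwise complement ~a. */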
448 if(type.norm && !type.floating && !type.fixed && !type.sign) {
449 if(LLVMIsConstant(a))
450 return LLVMConstNot(a);
451 else
452 return LLVMBuildNot(builder, a, "");
453 }
454
455 if(LLVMIsConstant(a))
456 if (type.floating)
457 return LLVMConstFSub(bld->one, a);
458 else
459 return LLVMConstSub(bld->one, a);
460 else
461 if (type.floating)
462 return LLVMBuildFSub(builder, bld->one, a, "");
463 else
464 return LLVMBuildSub(builder, bld->one, a, "");
465 }
466
467
468 /**
469 * Generate a + b
470 */
471 LLVMValueRef
472 lp_build_add(struct lp_build_context *bld,
473 LLVMValueRef a,
474 LLVMValueRef b)
475 {
476 LLVMBuilderRef builder = bld->gallivm->builder;
477 const struct lp_type type = bld->type;
478 LLVMValueRef res;
479
480 assert(lp_check_value(type, a));
481 assert(lp_check_value(type, b));
482
483 if(a == bld->zero)
484 return b;
485 if(b == bld->zero)
486 return a;
487 if(a == bld->undef || b == bld->undef)
488 return bld->undef;
489
490 if(bld->type.norm) {
491 const char *intrinsic = NULL;
492
493 if(a == bld->one || b == bld->one)
494 return bld->one;
495
496 if (type.width * type.length == 128 &&
497 !type.floating && !type.fixed) {
498 if(util_cpu_caps.has_sse2) {
499 if(type.width == 8)
500 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
501 if(type.width == 16)
502 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
503 } else if (util_cpu_caps.has_altivec) {
504 if(type.width == 8)
505 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
506 if(type.width == 16)
507 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
508 }
509 }
510
511 if(intrinsic)
512 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
513 }
514
515 /* TODO: handle signed case */
516 if(type.norm && !type.floating && !type.fixed && !type.sign)
517 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
518
519 if(LLVMIsConstant(a) && LLVMIsConstant(b))
520 if (type.floating)
521 res = LLVMConstFAdd(a, b);
522 else
523 res = LLVMConstAdd(a, b);
524 else
525 if (type.floating)
526 res = LLVMBuildFAdd(builder, a, b, "");
527 else
528 res = LLVMBuildAdd(builder, a, b, "");
529
530 /* clamp to ceiling of 1.0 */
531 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
532 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533
534 /* XXX clamp to floor of -1 or 0??? */
535
536 return res;
537 }
538
539
540 /** Return the scalar sum of the elements of a.
541 * Should avoid this operation whenever possible.
542 */
543 LLVMValueRef
544 lp_build_horizontal_add(struct lp_build_context *bld,
545 LLVMValueRef a)
546 {
547 LLVMBuilderRef builder = bld->gallivm->builder;
548 const struct lp_type type = bld->type;
549 LLVMValueRef index, res;
550 unsigned i, length;
551 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
552 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
553 LLVMValueRef vecres, elem2;
554
555 assert(lp_check_value(type, a));
556
557 if (type.length == 1) {
558 return a;
559 }
560
561 assert(!bld->type.norm);
562
563 /*
 564    * For byte vectors this could be done much better with psadbw.
565 * Using repeated shuffle/adds here. Note with multiple vectors
566 * this can be done more efficiently as outlined in the intel
567 * optimization manual.
568 * Note: could cause data rearrangement if used with smaller element
569 * sizes.
570 */
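   /*
    * For example, for a length-8 vector the reduction below proceeds as
    *   [a0..a7] -> [a0+a4, a1+a5, a2+a6, a3+a7] -> [a0+a2+a4+a6, a1+a3+a5+a7]
    * and the final two lanes are extracted and added as scalars.
    */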
571
572 vecres = a;
573 length = type.length / 2;
574 while (length > 1) {
575 LLVMValueRef vec1, vec2;
576 for (i = 0; i < length; i++) {
577 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
578 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
579 }
580 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
581 LLVMConstVector(shuffles1, length), "");
582 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
583 LLVMConstVector(shuffles2, length), "");
584 if (type.floating) {
585 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
586 }
587 else {
588 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
589 }
590 length = length >> 1;
591 }
592
593 /* always have vector of size 2 here */
594 assert(length == 1);
595
596 index = lp_build_const_int32(bld->gallivm, 0);
597 res = LLVMBuildExtractElement(builder, vecres, index, "");
598 index = lp_build_const_int32(bld->gallivm, 1);
599 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
600
601 if (type.floating)
602 res = LLVMBuildFAdd(builder, res, elem2, "");
603 else
604 res = LLVMBuildAdd(builder, res, elem2, "");
605
606 return res;
607 }
608
609 /**
610 * Return the horizontal sums of 4 float vectors as a float4 vector.
 611  * This uses the technique outlined in the Intel Optimization Manual.
612 */
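/* Lane i of the result holds src[i][0] + src[i][1] + src[i][2] + src[i][3]. */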
613 static LLVMValueRef
614 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
615 LLVMValueRef src[4])
616 {
617 struct gallivm_state *gallivm = bld->gallivm;
618 LLVMBuilderRef builder = gallivm->builder;
619 LLVMValueRef shuffles[4];
620 LLVMValueRef tmp[4];
621 LLVMValueRef sumtmp[2], shuftmp[2];
622
623 /* lower half of regs */
624 shuffles[0] = lp_build_const_int32(gallivm, 0);
625 shuffles[1] = lp_build_const_int32(gallivm, 1);
626 shuffles[2] = lp_build_const_int32(gallivm, 4);
627 shuffles[3] = lp_build_const_int32(gallivm, 5);
628 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
629 LLVMConstVector(shuffles, 4), "");
630 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
631 LLVMConstVector(shuffles, 4), "");
632
633 /* upper half of regs */
634 shuffles[0] = lp_build_const_int32(gallivm, 2);
635 shuffles[1] = lp_build_const_int32(gallivm, 3);
636 shuffles[2] = lp_build_const_int32(gallivm, 6);
637 shuffles[3] = lp_build_const_int32(gallivm, 7);
638 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
639 LLVMConstVector(shuffles, 4), "");
640 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
641 LLVMConstVector(shuffles, 4), "");
642
643 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
644 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
645
646 shuffles[0] = lp_build_const_int32(gallivm, 0);
647 shuffles[1] = lp_build_const_int32(gallivm, 2);
648 shuffles[2] = lp_build_const_int32(gallivm, 4);
649 shuffles[3] = lp_build_const_int32(gallivm, 6);
650 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
651 LLVMConstVector(shuffles, 4), "");
652
653 shuffles[0] = lp_build_const_int32(gallivm, 1);
654 shuffles[1] = lp_build_const_int32(gallivm, 3);
655 shuffles[2] = lp_build_const_int32(gallivm, 5);
656 shuffles[3] = lp_build_const_int32(gallivm, 7);
657 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
658 LLVMConstVector(shuffles, 4), "");
659
660 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
661 }
662
663
664 /*
665 * partially horizontally add 2-4 float vectors with length nx4,
666 * i.e. only four adjacent values in each vector will be added,
667 * assuming values are really grouped in 4 which also determines
668 * output order.
669 *
670 * Return a vector of the same length as the initial vectors,
671 * with the excess elements (if any) being undefined.
672 * The element order is independent of number of input vectors.
673 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
674 * the output order thus will be
 675  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
676 */
677 LLVMValueRef
678 lp_build_hadd_partial4(struct lp_build_context *bld,
679 LLVMValueRef vectors[],
680 unsigned num_vecs)
681 {
682 struct gallivm_state *gallivm = bld->gallivm;
683 LLVMBuilderRef builder = gallivm->builder;
684 LLVMValueRef ret_vec;
685 LLVMValueRef tmp[4];
686 const char *intrinsic = NULL;
687
688 assert(num_vecs >= 2 && num_vecs <= 4);
689 assert(bld->type.floating);
690
691 /* only use this with at least 2 vectors, as it is sort of expensive
692 * (depending on cpu) and we always need two horizontal adds anyway,
693 * so a shuffle/add approach might be better.
694 */
695
696 tmp[0] = vectors[0];
697 tmp[1] = vectors[1];
698
699 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
700 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
701
702 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
703 bld->type.length == 4) {
704 intrinsic = "llvm.x86.sse3.hadd.ps";
705 }
706 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
707 bld->type.length == 8) {
708 intrinsic = "llvm.x86.avx.hadd.ps.256";
709 }
710 if (intrinsic) {
711 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 if (num_vecs > 2) {
715 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
716 lp_build_vec_type(gallivm, bld->type),
717 tmp[2], tmp[3]);
718 }
719 else {
720 tmp[1] = tmp[0];
721 }
722 return lp_build_intrinsic_binary(builder, intrinsic,
723 lp_build_vec_type(gallivm, bld->type),
724 tmp[0], tmp[1]);
725 }
726
727 if (bld->type.length == 4) {
728 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
729 }
730 else {
731 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
732 unsigned j;
733 unsigned num_iter = bld->type.length / 4;
734 struct lp_type parttype = bld->type;
735 parttype.length = 4;
736 for (j = 0; j < num_iter; j++) {
737 LLVMValueRef partsrc[4];
738 unsigned i;
739 for (i = 0; i < 4; i++) {
740 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
741 }
742 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
743 }
744 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
745 }
746 return ret_vec;
747 }
748
749 /**
750 * Generate a - b
751 */
752 LLVMValueRef
753 lp_build_sub(struct lp_build_context *bld,
754 LLVMValueRef a,
755 LLVMValueRef b)
756 {
757 LLVMBuilderRef builder = bld->gallivm->builder;
758 const struct lp_type type = bld->type;
759 LLVMValueRef res;
760
761 assert(lp_check_value(type, a));
762 assert(lp_check_value(type, b));
763
764 if(b == bld->zero)
765 return a;
766 if(a == bld->undef || b == bld->undef)
767 return bld->undef;
768 if(a == b)
769 return bld->zero;
770
771 if(bld->type.norm) {
772 const char *intrinsic = NULL;
773
774 if(b == bld->one)
775 return bld->zero;
776
777 if (type.width * type.length == 128 &&
778 !type.floating && !type.fixed) {
779 if (util_cpu_caps.has_sse2) {
780 if(type.width == 8)
781 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
782 if(type.width == 16)
783 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
784 } else if (util_cpu_caps.has_altivec) {
785 if(type.width == 8)
786 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
787 if(type.width == 16)
788 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
789 }
790 }
791
792 if(intrinsic)
793 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
794 }
795
796 /* TODO: handle signed case */
797 if(type.norm && !type.floating && !type.fixed && !type.sign)
798 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
799
800 if(LLVMIsConstant(a) && LLVMIsConstant(b))
801 if (type.floating)
802 res = LLVMConstFSub(a, b);
803 else
804 res = LLVMConstSub(a, b);
805 else
806 if (type.floating)
807 res = LLVMBuildFSub(builder, a, b, "");
808 else
809 res = LLVMBuildSub(builder, a, b, "");
810
811 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
812 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
813
814 return res;
815 }
816
817
818
819 /**
820 * Normalized multiplication.
821 *
822 * There are several approaches for (using 8-bit normalized multiplication as
823 * an example):
824 *
825 * - alpha plus one
826 *
827 * makes the following approximation to the division (Sree)
828 *
 829  *     a*b/255 ~= (a*(b + 1)) >> 8
830 *
831 * which is the fastest method that satisfies the following OpenGL criteria of
832 *
833 * 0*0 = 0 and 255*255 = 255
834 *
835 * - geometric series
836 *
837 * takes the geometric series approximation to the division
838 *
839 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
840 *
841 * in this case just the first two terms to fit in 16bit arithmetic
842 *
843 * t/255 ~= (t + (t >> 8)) >> 8
844 *
 845  * note that just by itself it doesn't satisfy the OpenGL criteria, as it
 846  * yields 255*255 = 254, so the special case b = 255 must be accounted for,
 847  * or roundoff must be used.
848 *
849 * - geometric series plus rounding
850 *
 851  * when using the geometric series division, instead of truncating the
 852  * result use roundoff in the approximation (Jim Blinn)
 853  *
 854  *     t/255 ~= (t + (t >> 8) + 0x80) >> 8
 855  *
 856  * which achieves exact results.
857 *
858 *
859 *
860 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
861 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
862 * @sa Michael Herf, The "double blend trick", May 2000,
863 * http://www.stereopsis.com/doubleblend.html
864 */
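/*
 * For illustration only: a minimal scalar sketch of the "geometric series plus
 * rounding" variant described above, assuming 8-bit unsigned normalized inputs.
 * The helper name is hypothetical and the block is compiled out; the actual
 * vector implementation is lp_build_mul_norm() below.
 */
#if 0
static inline unsigned
mul_unorm8_sketch(unsigned a, unsigned b)
{
   unsigned t = a * b;                /* 16-bit intermediate product */
   t = t + (t >> 8);                  /* geometric series: t/255 ~= (t + (t >> 8)) >> 8 */
   return (t + 0x80) >> 8;            /* Blinn's roundoff, per the formula above */
}
#endif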
865 static LLVMValueRef
866 lp_build_mul_norm(struct gallivm_state *gallivm,
867 struct lp_type wide_type,
868 LLVMValueRef a, LLVMValueRef b)
869 {
870 LLVMBuilderRef builder = gallivm->builder;
871 struct lp_build_context bld;
872 unsigned n;
873 LLVMValueRef half;
874 LLVMValueRef ab;
875
876 assert(!wide_type.floating);
877 assert(lp_check_value(wide_type, a));
878 assert(lp_check_value(wide_type, b));
879
880 lp_build_context_init(&bld, gallivm, wide_type);
881
882 n = wide_type.width / 2;
883 if (wide_type.sign) {
884 --n;
885 }
886
887 /*
888 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
889 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
890 */
891
892 /*
893 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
894 */
895
896 ab = LLVMBuildMul(builder, a, b, "");
897 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
898
899 /*
900 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
901 */
902
903 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
904 if (wide_type.sign) {
905 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
906 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
907 half = lp_build_select(&bld, sign, minus_half, half);
908 }
909 ab = LLVMBuildAdd(builder, ab, half, "");
910
911 /* Final division */
912 ab = lp_build_shr_imm(&bld, ab, n);
913
914 return ab;
915 }
916
917 /**
918 * Generate a * b
919 */
920 LLVMValueRef
921 lp_build_mul(struct lp_build_context *bld,
922 LLVMValueRef a,
923 LLVMValueRef b)
924 {
925 LLVMBuilderRef builder = bld->gallivm->builder;
926 const struct lp_type type = bld->type;
927 LLVMValueRef shift;
928 LLVMValueRef res;
929
930 assert(lp_check_value(type, a));
931 assert(lp_check_value(type, b));
932
933 if(a == bld->zero)
934 return bld->zero;
935 if(a == bld->one)
936 return b;
937 if(b == bld->zero)
938 return bld->zero;
939 if(b == bld->one)
940 return a;
941 if(a == bld->undef || b == bld->undef)
942 return bld->undef;
943
944 if (!type.floating && !type.fixed && type.norm) {
945 struct lp_type wide_type = lp_wider_type(type);
946 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
947
948 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
949 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
950
951 /* PMULLW, PSRLW, PADDW */
952 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
953 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
954
955 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
956
957 return ab;
958 }
959
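   /* For fixed point the raw product carries twice the number of fractional
    * bits, so shift it back down by width/2 (the fractional bits used here). */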
960 if(type.fixed)
961 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
962 else
963 shift = NULL;
964
965 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
966 if (type.floating)
967 res = LLVMConstFMul(a, b);
968 else
969 res = LLVMConstMul(a, b);
970 if(shift) {
971 if(type.sign)
972 res = LLVMConstAShr(res, shift);
973 else
974 res = LLVMConstLShr(res, shift);
975 }
976 }
977 else {
978 if (type.floating)
979 res = LLVMBuildFMul(builder, a, b, "");
980 else
981 res = LLVMBuildMul(builder, a, b, "");
982 if(shift) {
983 if(type.sign)
984 res = LLVMBuildAShr(builder, res, shift, "");
985 else
986 res = LLVMBuildLShr(builder, res, shift, "");
987 }
988 }
989
990 return res;
991 }
992
993
994 /**
995 * Small vector x scale multiplication optimization.
996 */
997 LLVMValueRef
998 lp_build_mul_imm(struct lp_build_context *bld,
999 LLVMValueRef a,
1000 int b)
1001 {
1002 LLVMBuilderRef builder = bld->gallivm->builder;
1003 LLVMValueRef factor;
1004
1005 assert(lp_check_value(bld->type, a));
1006
1007 if(b == 0)
1008 return bld->zero;
1009
1010 if(b == 1)
1011 return a;
1012
1013 if(b == -1)
1014 return lp_build_negate(bld, a);
1015
1016 if(b == 2 && bld->type.floating)
1017 return lp_build_add(bld, a, a);
1018
1019 if(util_is_power_of_two(b)) {
1020 unsigned shift = ffs(b) - 1;
1021
1022 if(bld->type.floating) {
1023 #if 0
1024 /*
1025 * Power of two multiplication by directly manipulating the exponent.
1026 *
1027 * XXX: This might not be always faster, it will introduce a small error
1028 * for multiplication by zero, and it will produce wrong results
1029 * for Inf and NaN.
1030 */
1031 unsigned mantissa = lp_mantissa(bld->type);
1032 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1033 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1034 a = LLVMBuildAdd(builder, a, factor, "");
1035 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1036 return a;
1037 #endif
1038 }
1039 else {
1040 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1041 return LLVMBuildShl(builder, a, factor, "");
1042 }
1043 }
1044
1045 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1046 return lp_build_mul(bld, a, factor);
1047 }
1048
1049
1050 /**
1051 * Generate a / b
1052 */
1053 LLVMValueRef
1054 lp_build_div(struct lp_build_context *bld,
1055 LLVMValueRef a,
1056 LLVMValueRef b)
1057 {
1058 LLVMBuilderRef builder = bld->gallivm->builder;
1059 const struct lp_type type = bld->type;
1060
1061 assert(lp_check_value(type, a));
1062 assert(lp_check_value(type, b));
1063
1064 if(a == bld->zero)
1065 return bld->zero;
1066 if(a == bld->one)
1067 return lp_build_rcp(bld, b);
1068 if(b == bld->zero)
1069 return bld->undef;
1070 if(b == bld->one)
1071 return a;
1072 if(a == bld->undef || b == bld->undef)
1073 return bld->undef;
1074
1075 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1076 if (type.floating)
1077 return LLVMConstFDiv(a, b);
1078 else if (type.sign)
1079 return LLVMConstSDiv(a, b);
1080 else
1081 return LLVMConstUDiv(a, b);
1082 }
1083
1084 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1085 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1086 type.floating)
1087 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1088
1089 if (type.floating)
1090 return LLVMBuildFDiv(builder, a, b, "");
1091 else if (type.sign)
1092 return LLVMBuildSDiv(builder, a, b, "");
1093 else
1094 return LLVMBuildUDiv(builder, a, b, "");
1095 }
1096
1097
1098 /**
1099 * Linear interpolation helper.
1100 *
 1101  * @param flags  LP_BLD_LERP_x flags; with LP_BLD_LERP_WIDE_NORMALIZED we are
 1102  *        interpolating normalized values, encoded in integers twice as wide.
1103 *
1104 * @sa http://www.stereopsis.com/doubleblend.html
1105 */
1106 static INLINE LLVMValueRef
1107 lp_build_lerp_simple(struct lp_build_context *bld,
1108 LLVMValueRef x,
1109 LLVMValueRef v0,
1110 LLVMValueRef v1,
1111 unsigned flags)
1112 {
1113 unsigned half_width = bld->type.width/2;
1114 LLVMBuilderRef builder = bld->gallivm->builder;
1115 LLVMValueRef delta;
1116 LLVMValueRef res;
1117
1118 assert(lp_check_value(bld->type, x));
1119 assert(lp_check_value(bld->type, v0));
1120 assert(lp_check_value(bld->type, v1));
1121
1122 delta = lp_build_sub(bld, v1, v0);
1123
1124 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1125 if (!bld->type.sign) {
1126 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1127 /*
1128 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1129 * most-significant-bit to the lowest-significant-bit, so that
1130 * later we can just divide by 2**n instead of 2**n - 1.
1131 */
1132
1133 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1134 }
1135
1136 /* (x * delta) >> n */
1137 res = lp_build_mul(bld, x, delta);
1138 res = lp_build_shr_imm(bld, res, half_width);
1139 } else {
1140 /*
1141 * The rescaling trick above doesn't work for signed numbers, so
1142 * use the 2**n - 1 divison approximation in lp_build_mul_norm
1143 * instead.
1144 */
1145 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1146 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1147 }
1148 } else {
1149 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1150 res = lp_build_mul(bld, x, delta);
1151 }
1152
1153 res = lp_build_add(bld, v0, res);
1154
1155 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1156 bld->type.fixed) {
 1157       /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
 1158       /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
 1159        * but it will be wrong for true fixed point use cases. Basically we need
 1160        * a more powerful lp_type, capable of further distinguishing the value's
 1161        * interpretation from the value's storage. */
1162 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1163 }
1164
1165 return res;
1166 }
1167
1168
1169 /**
1170 * Linear interpolation.
1171 */
1172 LLVMValueRef
1173 lp_build_lerp(struct lp_build_context *bld,
1174 LLVMValueRef x,
1175 LLVMValueRef v0,
1176 LLVMValueRef v1,
1177 unsigned flags)
1178 {
1179 const struct lp_type type = bld->type;
1180 LLVMValueRef res;
1181
1182 assert(lp_check_value(type, x));
1183 assert(lp_check_value(type, v0));
1184 assert(lp_check_value(type, v1));
1185
1186 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1187
1188 if (type.norm) {
1189 struct lp_type wide_type;
1190 struct lp_build_context wide_bld;
1191 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1192
1193 assert(type.length >= 2);
1194
1195 /*
1196 * Create a wider integer type, enough to hold the
1197 * intermediate result of the multiplication.
1198 */
1199 memset(&wide_type, 0, sizeof wide_type);
1200 wide_type.sign = type.sign;
1201 wide_type.width = type.width*2;
1202 wide_type.length = type.length/2;
1203
1204 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1205
1206 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1207 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1208 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1209
1210 /*
1211 * Lerp both halves.
1212 */
1213
1214 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1215
1216 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1217 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1218
1219 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1220 } else {
1221 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1222 }
1223
1224 return res;
1225 }
1226
1227
1228 /**
1229 * Bilinear interpolation.
1230 *
 1231  * Value indices are in v_{yx}.
1232 */
1233 LLVMValueRef
1234 lp_build_lerp_2d(struct lp_build_context *bld,
1235 LLVMValueRef x,
1236 LLVMValueRef y,
1237 LLVMValueRef v00,
1238 LLVMValueRef v01,
1239 LLVMValueRef v10,
1240 LLVMValueRef v11,
1241 unsigned flags)
1242 {
1243 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1244 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1245 return lp_build_lerp(bld, y, v0, v1, flags);
1246 }
1247
1248
1249 LLVMValueRef
1250 lp_build_lerp_3d(struct lp_build_context *bld,
1251 LLVMValueRef x,
1252 LLVMValueRef y,
1253 LLVMValueRef z,
1254 LLVMValueRef v000,
1255 LLVMValueRef v001,
1256 LLVMValueRef v010,
1257 LLVMValueRef v011,
1258 LLVMValueRef v100,
1259 LLVMValueRef v101,
1260 LLVMValueRef v110,
1261 LLVMValueRef v111,
1262 unsigned flags)
1263 {
1264 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1265 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1266 return lp_build_lerp(bld, z, v0, v1, flags);
1267 }
1268
1269
1270 /**
1271 * Generate min(a, b)
1272 * Do checks for special cases but not for nans.
1273 */
1274 LLVMValueRef
1275 lp_build_min(struct lp_build_context *bld,
1276 LLVMValueRef a,
1277 LLVMValueRef b)
1278 {
1279 assert(lp_check_value(bld->type, a));
1280 assert(lp_check_value(bld->type, b));
1281
1282 if(a == bld->undef || b == bld->undef)
1283 return bld->undef;
1284
1285 if(a == b)
1286 return a;
1287
1288 if (bld->type.norm) {
1289 if (!bld->type.sign) {
1290 if (a == bld->zero || b == bld->zero) {
1291 return bld->zero;
1292 }
1293 }
1294 if(a == bld->one)
1295 return b;
1296 if(b == bld->one)
1297 return a;
1298 }
1299
1300 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1301 }
1302
1303
1304 /**
1305 * Generate min(a, b)
1306 * NaN's are handled according to the behavior specified by the
1307 * nan_behavior argument.
1308 */
1309 LLVMValueRef
1310 lp_build_min_ext(struct lp_build_context *bld,
1311 LLVMValueRef a,
1312 LLVMValueRef b,
1313 enum gallivm_nan_behavior nan_behavior)
1314 {
1315 assert(lp_check_value(bld->type, a));
1316 assert(lp_check_value(bld->type, b));
1317
1318 if(a == bld->undef || b == bld->undef)
1319 return bld->undef;
1320
1321 if(a == b)
1322 return a;
1323
1324 if (bld->type.norm) {
1325 if (!bld->type.sign) {
1326 if (a == bld->zero || b == bld->zero) {
1327 return bld->zero;
1328 }
1329 }
1330 if(a == bld->one)
1331 return b;
1332 if(b == bld->one)
1333 return a;
1334 }
1335
1336 return lp_build_min_simple(bld, a, b, nan_behavior);
1337 }
1338
1339 /**
1340 * Generate max(a, b)
1341 * Do checks for special cases, but NaN behavior is undefined.
1342 */
1343 LLVMValueRef
1344 lp_build_max(struct lp_build_context *bld,
1345 LLVMValueRef a,
1346 LLVMValueRef b)
1347 {
1348 assert(lp_check_value(bld->type, a));
1349 assert(lp_check_value(bld->type, b));
1350
1351 if(a == bld->undef || b == bld->undef)
1352 return bld->undef;
1353
1354 if(a == b)
1355 return a;
1356
1357 if(bld->type.norm) {
1358 if(a == bld->one || b == bld->one)
1359 return bld->one;
1360 if (!bld->type.sign) {
1361 if (a == bld->zero) {
1362 return b;
1363 }
1364 if (b == bld->zero) {
1365 return a;
1366 }
1367 }
1368 }
1369
1370 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1371 }
1372
1373
1374 /**
1375 * Generate max(a, b)
1376 * Checks for special cases.
1377 * NaN's are handled according to the behavior specified by the
1378 * nan_behavior argument.
1379 */
1380 LLVMValueRef
1381 lp_build_max_ext(struct lp_build_context *bld,
1382 LLVMValueRef a,
1383 LLVMValueRef b,
1384 enum gallivm_nan_behavior nan_behavior)
1385 {
1386 assert(lp_check_value(bld->type, a));
1387 assert(lp_check_value(bld->type, b));
1388
1389 if(a == bld->undef || b == bld->undef)
1390 return bld->undef;
1391
1392 if(a == b)
1393 return a;
1394
1395 if(bld->type.norm) {
1396 if(a == bld->one || b == bld->one)
1397 return bld->one;
1398 if (!bld->type.sign) {
1399 if (a == bld->zero) {
1400 return b;
1401 }
1402 if (b == bld->zero) {
1403 return a;
1404 }
1405 }
1406 }
1407
1408 return lp_build_max_simple(bld, a, b, nan_behavior);
1409 }
1410
1411 /**
1412 * Generate clamp(a, min, max)
1413 * NaN behavior (for any of a, min, max) is undefined.
1414 * Do checks for special cases.
1415 */
1416 LLVMValueRef
1417 lp_build_clamp(struct lp_build_context *bld,
1418 LLVMValueRef a,
1419 LLVMValueRef min,
1420 LLVMValueRef max)
1421 {
1422 assert(lp_check_value(bld->type, a));
1423 assert(lp_check_value(bld->type, min));
1424 assert(lp_check_value(bld->type, max));
1425
1426 a = lp_build_min(bld, a, max);
1427 a = lp_build_max(bld, a, min);
1428 return a;
1429 }
1430
1431
1432 /**
1433 * Generate clamp(a, 0, 1)
1434 * A NaN will get converted to zero.
1435 */
1436 LLVMValueRef
1437 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1438 LLVMValueRef a)
1439 {
1440 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1441 a = lp_build_min(bld, a, bld->one);
1442 return a;
1443 }
1444
1445
1446 /**
1447 * Generate abs(a)
1448 */
1449 LLVMValueRef
1450 lp_build_abs(struct lp_build_context *bld,
1451 LLVMValueRef a)
1452 {
1453 LLVMBuilderRef builder = bld->gallivm->builder;
1454 const struct lp_type type = bld->type;
1455 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1456
1457 assert(lp_check_value(type, a));
1458
1459 if(!type.sign)
1460 return a;
1461
1462 if(type.floating) {
1463 /* Mask out the sign bit */
1464 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1465 unsigned long long absMask = ~(1ULL << (type.width - 1));
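      /* e.g. absMask is 0x7fffffff for 32-bit floats and
       * 0x7fffffffffffffff for 64-bit floats. */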
1466 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1467 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1468 a = LLVMBuildAnd(builder, a, mask, "");
1469 a = LLVMBuildBitCast(builder, a, vec_type, "");
1470 return a;
1471 }
1472
1473 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1474 switch(type.width) {
1475 case 8:
1476 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1477 case 16:
1478 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1479 case 32:
1480 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1481 }
1482 }
1483 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1484 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1485 (type.width == 8 || type.width == 16 || type.width == 32)) {
1486 debug_printf("%s: inefficient code, should split vectors manually\n",
1487 __FUNCTION__);
1488 }
1489
1490 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1491 }
1492
1493
1494 LLVMValueRef
1495 lp_build_negate(struct lp_build_context *bld,
1496 LLVMValueRef a)
1497 {
1498 LLVMBuilderRef builder = bld->gallivm->builder;
1499
1500 assert(lp_check_value(bld->type, a));
1501
1502 #if HAVE_LLVM >= 0x0207
1503 if (bld->type.floating)
1504 a = LLVMBuildFNeg(builder, a, "");
1505 else
1506 #endif
1507 a = LLVMBuildNeg(builder, a, "");
1508
1509 return a;
1510 }
1511
1512
1513 /** Return -1, 0 or +1 depending on the sign of a */
1514 LLVMValueRef
1515 lp_build_sgn(struct lp_build_context *bld,
1516 LLVMValueRef a)
1517 {
1518 LLVMBuilderRef builder = bld->gallivm->builder;
1519 const struct lp_type type = bld->type;
1520 LLVMValueRef cond;
1521 LLVMValueRef res;
1522
1523 assert(lp_check_value(type, a));
1524
1525 /* Handle non-zero case */
1526 if(!type.sign) {
1527 /* if not zero then sign must be positive */
1528 res = bld->one;
1529 }
1530 else if(type.floating) {
1531 LLVMTypeRef vec_type;
1532 LLVMTypeRef int_type;
1533 LLVMValueRef mask;
1534 LLVMValueRef sign;
1535 LLVMValueRef one;
1536 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1537
1538 int_type = lp_build_int_vec_type(bld->gallivm, type);
1539 vec_type = lp_build_vec_type(bld->gallivm, type);
1540 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1541
 1542       /* Take the sign bit and OR it into the constant 1.0 */
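      /* e.g. for a = -3.5f this ORs the sign bit into 0x3f800000 (1.0f),
       * giving 0xbf800000, i.e. -1.0f. */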
1543 sign = LLVMBuildBitCast(builder, a, int_type, "");
1544 sign = LLVMBuildAnd(builder, sign, mask, "");
1545 one = LLVMConstBitCast(bld->one, int_type);
1546 res = LLVMBuildOr(builder, sign, one, "");
1547 res = LLVMBuildBitCast(builder, res, vec_type, "");
1548 }
1549 else
1550 {
1551 /* signed int/norm/fixed point */
1552 /* could use psign with sse3 and appropriate vectors here */
1553 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1554 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1555 res = lp_build_select(bld, cond, bld->one, minus_one);
1556 }
1557
1558 /* Handle zero */
1559 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1560 res = lp_build_select(bld, cond, bld->zero, res);
1561
1562 return res;
1563 }
1564
1565
1566 /**
1567 * Set the sign of float vector 'a' according to 'sign'.
1568 * If sign==0, return abs(a).
1569 * If sign==1, return -abs(a);
1570 * Other values for sign produce undefined results.
1571 */
1572 LLVMValueRef
1573 lp_build_set_sign(struct lp_build_context *bld,
1574 LLVMValueRef a, LLVMValueRef sign)
1575 {
1576 LLVMBuilderRef builder = bld->gallivm->builder;
1577 const struct lp_type type = bld->type;
1578 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1579 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1580 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1581 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1582 ~((unsigned long long) 1 << (type.width - 1)));
1583 LLVMValueRef val, res;
1584
1585 assert(type.floating);
1586 assert(lp_check_value(type, a));
1587
1588 /* val = reinterpret_cast<int>(a) */
1589 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1590 /* val = val & mask */
1591 val = LLVMBuildAnd(builder, val, mask, "");
1592 /* sign = sign << shift */
1593 sign = LLVMBuildShl(builder, sign, shift, "");
1594 /* res = val | sign */
1595 res = LLVMBuildOr(builder, val, sign, "");
1596 /* res = reinterpret_cast<float>(res) */
1597 res = LLVMBuildBitCast(builder, res, vec_type, "");
1598
1599 return res;
1600 }
1601
1602
1603 /**
1604 * Convert vector of (or scalar) int to vector of (or scalar) float.
1605 */
1606 LLVMValueRef
1607 lp_build_int_to_float(struct lp_build_context *bld,
1608 LLVMValueRef a)
1609 {
1610 LLVMBuilderRef builder = bld->gallivm->builder;
1611 const struct lp_type type = bld->type;
1612 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1613
1614 assert(type.floating);
1615
1616 return LLVMBuildSIToFP(builder, a, vec_type, "");
1617 }
1618
1619 static boolean
1620 arch_rounding_available(const struct lp_type type)
1621 {
1622 if ((util_cpu_caps.has_sse4_1 &&
1623 (type.length == 1 || type.width*type.length == 128)) ||
1624 (util_cpu_caps.has_avx && type.width*type.length == 256))
1625 return TRUE;
1626 else if ((util_cpu_caps.has_altivec &&
1627 (type.width == 32 && type.length == 4)))
1628 return TRUE;
1629
1630 return FALSE;
1631 }
1632
1633 enum lp_build_round_mode
1634 {
1635 LP_BUILD_ROUND_NEAREST = 0,
1636 LP_BUILD_ROUND_FLOOR = 1,
1637 LP_BUILD_ROUND_CEIL = 2,
1638 LP_BUILD_ROUND_TRUNCATE = 3
1639 };
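/* These values match the SSE4.1 ROUNDxx (and AVX VROUNDxx) immediate rounding-mode
 * encoding, so 'mode' can be passed straight through as the immediate operand. */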
1640
1641 /**
1642 * Helper for SSE4.1's ROUNDxx instructions.
1643 *
 1644  * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 1645  * result is the even value (round-half-to-even); rounding 2.5 gives 2.0, not 3.0.
1646 */
1647 static INLINE LLVMValueRef
1648 lp_build_round_sse41(struct lp_build_context *bld,
1649 LLVMValueRef a,
1650 enum lp_build_round_mode mode)
1651 {
1652 LLVMBuilderRef builder = bld->gallivm->builder;
1653 const struct lp_type type = bld->type;
1654 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1655 const char *intrinsic;
1656 LLVMValueRef res;
1657
1658 assert(type.floating);
1659
1660 assert(lp_check_value(type, a));
1661 assert(util_cpu_caps.has_sse4_1);
1662
1663 if (type.length == 1) {
1664 LLVMTypeRef vec_type;
1665 LLVMValueRef undef;
1666 LLVMValueRef args[3];
1667 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1668
1669 switch(type.width) {
1670 case 32:
1671 intrinsic = "llvm.x86.sse41.round.ss";
1672 break;
1673 case 64:
1674 intrinsic = "llvm.x86.sse41.round.sd";
1675 break;
1676 default:
1677 assert(0);
1678 return bld->undef;
1679 }
1680
1681 vec_type = LLVMVectorType(bld->elem_type, 4);
1682
1683 undef = LLVMGetUndef(vec_type);
1684
1685 args[0] = undef;
1686 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1687 args[2] = LLVMConstInt(i32t, mode, 0);
1688
1689 res = lp_build_intrinsic(builder, intrinsic,
1690 vec_type, args, Elements(args));
1691
1692 res = LLVMBuildExtractElement(builder, res, index0, "");
1693 }
1694 else {
1695 if (type.width * type.length == 128) {
1696 switch(type.width) {
1697 case 32:
1698 intrinsic = "llvm.x86.sse41.round.ps";
1699 break;
1700 case 64:
1701 intrinsic = "llvm.x86.sse41.round.pd";
1702 break;
1703 default:
1704 assert(0);
1705 return bld->undef;
1706 }
1707 }
1708 else {
1709 assert(type.width * type.length == 256);
1710 assert(util_cpu_caps.has_avx);
1711
1712 switch(type.width) {
1713 case 32:
1714 intrinsic = "llvm.x86.avx.round.ps.256";
1715 break;
1716 case 64:
1717 intrinsic = "llvm.x86.avx.round.pd.256";
1718 break;
1719 default:
1720 assert(0);
1721 return bld->undef;
1722 }
1723 }
1724
1725 res = lp_build_intrinsic_binary(builder, intrinsic,
1726 bld->vec_type, a,
1727 LLVMConstInt(i32t, mode, 0));
1728 }
1729
1730 return res;
1731 }
1732
1733
1734 static INLINE LLVMValueRef
1735 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1736 LLVMValueRef a)
1737 {
1738 LLVMBuilderRef builder = bld->gallivm->builder;
1739 const struct lp_type type = bld->type;
1740 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1741 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1742 const char *intrinsic;
1743 LLVMValueRef res;
1744
1745 assert(type.floating);
1746 /* using the double precision conversions is a bit more complicated */
1747 assert(type.width == 32);
1748
1749 assert(lp_check_value(type, a));
1750 assert(util_cpu_caps.has_sse2);
1751
1752 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1753 if (type.length == 1) {
1754 LLVMTypeRef vec_type;
1755 LLVMValueRef undef;
1756 LLVMValueRef arg;
1757 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1758
1759 vec_type = LLVMVectorType(bld->elem_type, 4);
1760
1761 intrinsic = "llvm.x86.sse.cvtss2si";
1762
1763 undef = LLVMGetUndef(vec_type);
1764
1765 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1766
1767 res = lp_build_intrinsic_unary(builder, intrinsic,
1768 ret_type, arg);
1769 }
1770 else {
1771 if (type.width* type.length == 128) {
1772 intrinsic = "llvm.x86.sse2.cvtps2dq";
1773 }
1774 else {
1775 assert(type.width*type.length == 256);
1776 assert(util_cpu_caps.has_avx);
1777
1778 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1779 }
1780 res = lp_build_intrinsic_unary(builder, intrinsic,
1781 ret_type, a);
1782 }
1783
1784 return res;
1785 }
1786
1787
1788 /*
1789 */
1790 static INLINE LLVMValueRef
1791 lp_build_round_altivec(struct lp_build_context *bld,
1792 LLVMValueRef a,
1793 enum lp_build_round_mode mode)
1794 {
1795 LLVMBuilderRef builder = bld->gallivm->builder;
1796 const struct lp_type type = bld->type;
1797 const char *intrinsic = NULL;
1798
1799 assert(type.floating);
1800
1801 assert(lp_check_value(type, a));
1802 assert(util_cpu_caps.has_altivec);
1803
1804 switch (mode) {
1805 case LP_BUILD_ROUND_NEAREST:
1806 intrinsic = "llvm.ppc.altivec.vrfin";
1807 break;
1808 case LP_BUILD_ROUND_FLOOR:
1809 intrinsic = "llvm.ppc.altivec.vrfim";
1810 break;
1811 case LP_BUILD_ROUND_CEIL:
1812 intrinsic = "llvm.ppc.altivec.vrfip";
1813 break;
1814 case LP_BUILD_ROUND_TRUNCATE:
1815 intrinsic = "llvm.ppc.altivec.vrfiz";
1816 break;
1817 }
1818
1819 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1820 }
1821
1822 static INLINE LLVMValueRef
1823 lp_build_round_arch(struct lp_build_context *bld,
1824 LLVMValueRef a,
1825 enum lp_build_round_mode mode)
1826 {
1827 if (util_cpu_caps.has_sse4_1)
1828 return lp_build_round_sse41(bld, a, mode);
1829 else /* (util_cpu_caps.has_altivec) */
1830 return lp_build_round_altivec(bld, a, mode);
1831 }
1832
1833 /**
1834 * Return the integer part of a float (vector) value (== round toward zero).
1835 * The returned value is a float (vector).
1836 * Ex: trunc(-1.5) = -1.0
1837 */
1838 LLVMValueRef
1839 lp_build_trunc(struct lp_build_context *bld,
1840 LLVMValueRef a)
1841 {
1842 LLVMBuilderRef builder = bld->gallivm->builder;
1843 const struct lp_type type = bld->type;
1844
1845 assert(type.floating);
1846 assert(lp_check_value(type, a));
1847
1848 if (arch_rounding_available(type)) {
1849 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1850 }
1851 else {
1852 const struct lp_type type = bld->type;
1853 struct lp_type inttype;
1854 struct lp_build_context intbld;
 1855       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1856 LLVMValueRef trunc, res, anosign, mask;
1857 LLVMTypeRef int_vec_type = bld->int_vec_type;
1858 LLVMTypeRef vec_type = bld->vec_type;
1859
1860 assert(type.width == 32); /* might want to handle doubles at some point */
1861
1862 inttype = type;
1863 inttype.floating = 0;
1864 lp_build_context_init(&intbld, bld->gallivm, inttype);
1865
1866 /* round by truncation */
1867 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1868 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1869
1870 /* mask out sign bit */
1871 anosign = lp_build_abs(bld, a);
1872 /*
1873 * mask out all values if anosign > 2^24
1874 * This should work both for large ints (all rounding is no-op for them
1875 * because such floats are always exact) as well as special cases like
1876 * NaNs, Infs (taking advantage of the fact they use max exponent).
1877 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1878 */
1879 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1880 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1881 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1882 return lp_build_select(bld, mask, a, res);
1883 }
1884 }
1885
1886
1887 /**
1888 * Return float (vector) rounded to nearest integer (vector). The returned
1889 * value is a float (vector).
1890 * Ex: round(0.9) = 1.0
1891 * Ex: round(-1.5) = -2.0
1892 */
1893 LLVMValueRef
1894 lp_build_round(struct lp_build_context *bld,
1895 LLVMValueRef a)
1896 {
1897 LLVMBuilderRef builder = bld->gallivm->builder;
1898 const struct lp_type type = bld->type;
1899
1900 assert(type.floating);
1901 assert(lp_check_value(type, a));
1902
1903 if (arch_rounding_available(type)) {
1904 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1905 }
1906 else {
1907 const struct lp_type type = bld->type;
1908 struct lp_type inttype;
1909 struct lp_build_context intbld;
 1910       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1911 LLVMValueRef res, anosign, mask;
1912 LLVMTypeRef int_vec_type = bld->int_vec_type;
1913 LLVMTypeRef vec_type = bld->vec_type;
1914
1915 assert(type.width == 32); /* might want to handle doubles at some point */
1916
1917 inttype = type;
1918 inttype.floating = 0;
1919 lp_build_context_init(&intbld, bld->gallivm, inttype);
1920
1921 res = lp_build_iround(bld, a);
1922 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1923
1924 /* mask out sign bit */
1925 anosign = lp_build_abs(bld, a);
1926 /*
1927 * mask out all values if anosign > 2^24
1928 * This should work both for large ints (all rounding is no-op for them
1929 * because such floats are always exact) as well as special cases like
1930 * NaNs, Infs (taking advantage of the fact they use max exponent).
1931 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1932 */
1933 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1934 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1935 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1936 return lp_build_select(bld, mask, a, res);
1937 }
1938 }
1939
1940
1941 /**
1942 * Return floor of float (vector), result is a float (vector)
1943 * Ex: floor(1.1) = 1.0
1944 * Ex: floor(-1.1) = -2.0
1945 */
1946 LLVMValueRef
1947 lp_build_floor(struct lp_build_context *bld,
1948 LLVMValueRef a)
1949 {
1950 LLVMBuilderRef builder = bld->gallivm->builder;
1951 const struct lp_type type = bld->type;
1952
1953 assert(type.floating);
1954 assert(lp_check_value(type, a));
1955
1956 if (arch_rounding_available(type)) {
1957 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1958 }
1959 else {
1960 const struct lp_type type = bld->type;
1961 struct lp_type inttype;
1962 struct lp_build_context intbld;
1963 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1964 LLVMValueRef trunc, res, anosign, mask;
1965 LLVMTypeRef int_vec_type = bld->int_vec_type;
1966 LLVMTypeRef vec_type = bld->vec_type;
1967
1968 assert(type.width == 32); /* might want to handle doubles at some point */
1969
1970 inttype = type;
1971 inttype.floating = 0;
1972 lp_build_context_init(&intbld, bld->gallivm, inttype);
1973
1974 /* round by truncation */
1975 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1976 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1977
1978 if (type.sign) {
1979 LLVMValueRef tmp;
1980
1981 /*
1982 * fix values if rounding is wrong (for non-special cases)
1983 * - this is the case if trunc > a
1984 */
1985 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1986 /* tmp = trunc > a ? 1.0 : 0.0 */
1987 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1988 tmp = lp_build_and(&intbld, mask, tmp);
1989 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1990 res = lp_build_sub(bld, res, tmp);
1991 }
1992
1993 /* mask out sign bit */
1994 anosign = lp_build_abs(bld, a);
1995 /*
1996 * mask out all values if anosign > 2^24
1997 * This should work both for large ints (all rounding is no-op for them
1998 * because such floats are always exact) as well as special cases like
1999 * NaNs, Infs (taking advantage of the fact they use max exponent).
2000 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2001 */
2002 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2003 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2004 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2005 return lp_build_select(bld, mask, a, res);
2006 }
2007 }
2008
2009
2010 /**
2011 * Return ceiling of float (vector), returning float (vector).
2012 * Ex: ceil( 1.1) = 2.0
2013 * Ex: ceil(-1.1) = -1.0
2014 */
2015 LLVMValueRef
2016 lp_build_ceil(struct lp_build_context *bld,
2017 LLVMValueRef a)
2018 {
2019 LLVMBuilderRef builder = bld->gallivm->builder;
2020 const struct lp_type type = bld->type;
2021
2022 assert(type.floating);
2023 assert(lp_check_value(type, a));
2024
2025 if (arch_rounding_available(type)) {
2026 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2027 }
2028 else {
2029 const struct lp_type type = bld->type;
2030 struct lp_type inttype;
2031 struct lp_build_context intbld;
2032 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
2033 LLVMValueRef trunc, res, anosign, mask, tmp;
2034 LLVMTypeRef int_vec_type = bld->int_vec_type;
2035 LLVMTypeRef vec_type = bld->vec_type;
2036
2037 assert(type.width == 32); /* might want to handle doubles at some point */
2038
2039 inttype = type;
2040 inttype.floating = 0;
2041 lp_build_context_init(&intbld, bld->gallivm, inttype);
2042
2043 /* round by truncation */
2044 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2045 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2046
2047 /*
2048 * fix values if rounding is wrong (for non-special cases)
2049 * - this is the case if trunc < a
2050 */
2051 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2052 /* tmp = trunc < a ? 1.0 : 0.0 */
2053 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2054 tmp = lp_build_and(&intbld, mask, tmp);
2055 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2056 res = lp_build_add(bld, trunc, tmp);
2057
2058 /* mask out sign bit */
2059 anosign = lp_build_abs(bld, a);
2060 /*
2061 * mask out all values if anosign > 2^24
2062 * This should work both for large ints (all rounding is no-op for them
2063 * because such floats are always exact) as well as special cases like
2064 * NaNs, Infs (taking advantage of the fact they use max exponent).
2065 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
2066 */
2067 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2068 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2069 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2070 return lp_build_select(bld, mask, a, res);
2071 }
2072 }
2073
2074
2075 /**
2076 * Return fractional part of 'a' computed as a - floor(a)
2077 * Typically used in texture coord arithmetic.
2078 */
2079 LLVMValueRef
2080 lp_build_fract(struct lp_build_context *bld,
2081 LLVMValueRef a)
2082 {
2083 assert(bld->type.floating);
2084 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2085 }
2086
2087
2088 /**
2089 * Prevent returning a fractional part of 1.0 for very small negative values of
2090 * 'a' by clamping against 0.99999(9).
2091 */
2092 static inline LLVMValueRef
2093 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2094 {
2095 LLVMValueRef max;
2096
2097 /* this is the largest number smaller than 1.0 representable as float */
2098 max = lp_build_const_vec(bld->gallivm, bld->type,
2099 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2100 return lp_build_min(bld, fract, max);
2101 }
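/*
 * Worked example for the constant above, assuming 32-bit floats
 * (lp_mantissa() == 23): the clamp value is 1.0 - 2^-24 = 0.99999994...,
 * i.e. the largest float strictly below 1.0, so the clamped fract can
 * never round up to exactly 1.0.
 */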
2102
2103
2104 /**
2105 * Same as lp_build_fract, but guarantees that the result is always smaller
2106 * than one.
2107 */
2108 LLVMValueRef
2109 lp_build_fract_safe(struct lp_build_context *bld,
2110 LLVMValueRef a)
2111 {
2112 return clamp_fract(bld, lp_build_fract(bld, a));
2113 }
2114
2115
2116 /**
2117 * Return the integer part of a float (vector) value (== round toward zero).
2118 * The returned value is an integer (vector).
2119 * Ex: itrunc(-1.5) = -1
2120 */
2121 LLVMValueRef
2122 lp_build_itrunc(struct lp_build_context *bld,
2123 LLVMValueRef a)
2124 {
2125 LLVMBuilderRef builder = bld->gallivm->builder;
2126 const struct lp_type type = bld->type;
2127 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2128
2129 assert(type.floating);
2130 assert(lp_check_value(type, a));
2131
2132 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2133 }
2134
2135
2136 /**
2137 * Return float (vector) rounded to nearest integer (vector). The returned
2138 * value is an integer (vector).
2139 * Ex: iround(0.9) = 1
2140 * Ex: iround(-1.5) = -2
2141 */
2142 LLVMValueRef
2143 lp_build_iround(struct lp_build_context *bld,
2144 LLVMValueRef a)
2145 {
2146 LLVMBuilderRef builder = bld->gallivm->builder;
2147 const struct lp_type type = bld->type;
2148 LLVMTypeRef int_vec_type = bld->int_vec_type;
2149 LLVMValueRef res;
2150
2151 assert(type.floating);
2152
2153 assert(lp_check_value(type, a));
2154
2155 if ((util_cpu_caps.has_sse2 &&
2156 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2157 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2158 return lp_build_iround_nearest_sse2(bld, a);
2159 }
2160 if (arch_rounding_available(type)) {
2161 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2162 }
2163 else {
2164 LLVMValueRef half;
2165
2166 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2167
2168 if (type.sign) {
2169 LLVMTypeRef vec_type = bld->vec_type;
2170 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2171 (unsigned long long)1 << (type.width - 1));
2172 LLVMValueRef sign;
2173
2174 /* get sign bit */
2175 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2176 sign = LLVMBuildAnd(builder, sign, mask, "");
2177
2178 /* sign * 0.5 */
2179 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2180 half = LLVMBuildOr(builder, sign, half, "");
2181 half = LLVMBuildBitCast(builder, half, vec_type, "");
2182 }
2183
2184 res = LLVMBuildFAdd(builder, a, half, "");
2185 }
2186
2187 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2188
2189 return res;
2190 }
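/*
 * Scalar sketch of the fallback path above (illustrative only, assuming
 * 32-bit floats small enough for exact int conversion): add 0.5 carrying
 * the sign of 'a', then truncate toward zero.
 */
static inline int
iround_scalar_sketch(float a)
{
   float half = (a < 0.0f) ? -0.5f : 0.5f;   /* 0.5 with the sign of a */
   return (int)(a + half);                   /* fptosi truncates toward zero */
}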
2191
2192
2193 /**
2194 * Return floor of float (vector), result is an int (vector)
2195 * Ex: ifloor(1.1) = 1
2196 * Ex: ifloor(-1.1) = -2
2197 */
2198 LLVMValueRef
2199 lp_build_ifloor(struct lp_build_context *bld,
2200 LLVMValueRef a)
2201 {
2202 LLVMBuilderRef builder = bld->gallivm->builder;
2203 const struct lp_type type = bld->type;
2204 LLVMTypeRef int_vec_type = bld->int_vec_type;
2205 LLVMValueRef res;
2206
2207 assert(type.floating);
2208 assert(lp_check_value(type, a));
2209
2210 res = a;
2211 if (type.sign) {
2212 if (arch_rounding_available(type)) {
2213 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2214 }
2215 else {
2216 struct lp_type inttype;
2217 struct lp_build_context intbld;
2218 LLVMValueRef trunc, itrunc, mask;
2219
2220 assert(type.floating);
2221 assert(lp_check_value(type, a));
2222
2223 inttype = type;
2224 inttype.floating = 0;
2225 lp_build_context_init(&intbld, bld->gallivm, inttype);
2226
2227 /* round by truncation */
2228 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2229 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2230
2231 /*
2232 * fix values if rounding is wrong (for non-special cases)
2233 * - this is the case if trunc > a
2234 * The results of doing this with NaNs, very large values etc.
2235 * are undefined but this seems to be the case anyway.
2236 */
2237 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2238 /* cheapie minus one with mask since the mask is minus one / zero */
2239 return lp_build_add(&intbld, itrunc, mask);
2240 }
2241 }
2242
2243 /* round to nearest (toward zero) */
2244 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2245
2246 return res;
2247 }
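/*
 * Scalar sketch of the non-arch path above (illustrative only): truncate,
 * then use the all-ones (-1) / all-zeros compare mask to subtract one
 * exactly where truncation rounded toward zero instead of down.
 */
static inline int
ifloor_scalar_sketch(float a)
{
   int itrunc = (int)a;                        /* round toward zero */
   int mask = ((float)itrunc > a) ? -1 : 0;    /* vector code gets this from lp_build_cmp */
   return itrunc + mask;                       /* adding -1 subtracts one */
}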
2248
2249
2250 /**
2251 * Return ceiling of float (vector), returning int (vector).
2252 * Ex: iceil( 1.1) = 2
2253 * Ex: iceil(-1.1) = -1
2254 */
2255 LLVMValueRef
2256 lp_build_iceil(struct lp_build_context *bld,
2257 LLVMValueRef a)
2258 {
2259 LLVMBuilderRef builder = bld->gallivm->builder;
2260 const struct lp_type type = bld->type;
2261 LLVMTypeRef int_vec_type = bld->int_vec_type;
2262 LLVMValueRef res;
2263
2264 assert(type.floating);
2265 assert(lp_check_value(type, a));
2266
2267 if (arch_rounding_available(type)) {
2268 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2269 }
2270 else {
2271 struct lp_type inttype;
2272 struct lp_build_context intbld;
2273 LLVMValueRef trunc, itrunc, mask;
2274
2275 assert(type.floating);
2276 assert(lp_check_value(type, a));
2277
2278 inttype = type;
2279 inttype.floating = 0;
2280 lp_build_context_init(&intbld, bld->gallivm, inttype);
2281
2282 /* round by truncation */
2283 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2284 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2285
2286 /*
2287 * fix values if rounding is wrong (for non-special cases)
2288 * - this is the case if trunc < a
2289 * The results of doing this with NaNs, very large values etc.
2290 * are undefined but this seems to be the case anyway.
2291 */
2292 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2293 /* cheapie plus one with mask since the mask is minus one / zero */
2294 return lp_build_sub(&intbld, itrunc, mask);
2295 }
2296
2297 /* round to nearest (toward zero) */
2298 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2299
2300 return res;
2301 }
2302
2303
2304 /**
2305 * Combined ifloor() & fract().
2306 *
2307 * Preferred to calling the functions separately, as it will ensure that the
2308 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2309 */
2310 void
2311 lp_build_ifloor_fract(struct lp_build_context *bld,
2312 LLVMValueRef a,
2313 LLVMValueRef *out_ipart,
2314 LLVMValueRef *out_fpart)
2315 {
2316 LLVMBuilderRef builder = bld->gallivm->builder;
2317 const struct lp_type type = bld->type;
2318 LLVMValueRef ipart;
2319
2320 assert(type.floating);
2321 assert(lp_check_value(type, a));
2322
2323 if (arch_rounding_available(type)) {
2324 /*
2325 * floor() is easier.
2326 */
2327
2328 ipart = lp_build_floor(bld, a);
2329 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2330 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2331 }
2332 else {
2333 /*
2334 * ifloor() is easier.
2335 */
2336
2337 *out_ipart = lp_build_ifloor(bld, a);
2338 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2339 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2340 }
2341 }
2342
2343
2344 /**
2345 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2346 * always smaller than one.
2347 */
2348 void
2349 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2350 LLVMValueRef a,
2351 LLVMValueRef *out_ipart,
2352 LLVMValueRef *out_fpart)
2353 {
2354 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2355 *out_fpart = clamp_fract(bld, *out_fpart);
2356 }
2357
2358
2359 LLVMValueRef
2360 lp_build_sqrt(struct lp_build_context *bld,
2361 LLVMValueRef a)
2362 {
2363 LLVMBuilderRef builder = bld->gallivm->builder;
2364 const struct lp_type type = bld->type;
2365 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2366 char intrinsic[32];
2367
2368 assert(lp_check_value(type, a));
2369
2370 /* TODO: optimize the constant case */
2371
2372 assert(type.floating);
2373 if (type.length == 1) {
2374 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2375 }
2376 else {
2377 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2378 }
2379
2380 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2381 }
2382
2383
2384 /**
2385 * Do one Newton-Raphson step to improve reciprocal precision:
2386 *
2387 * x_{i+1} = x_i * (2 - a * x_i)
2388 *
2389 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2390 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2391 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2392 * halo. It would be necessary to clamp the argument to prevent this.
2393 *
2394 * See also:
2395 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2396 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2397 */
2398 static INLINE LLVMValueRef
2399 lp_build_rcp_refine(struct lp_build_context *bld,
2400 LLVMValueRef a,
2401 LLVMValueRef rcp_a)
2402 {
2403 LLVMBuilderRef builder = bld->gallivm->builder;
2404 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2405 LLVMValueRef res;
2406
2407 res = LLVMBuildFMul(builder, a, rcp_a, "");
2408 res = LLVMBuildFSub(builder, two, res, "");
2409 res = LLVMBuildFMul(builder, rcp_a, res, "");
2410
2411 return res;
2412 }
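/*
 * Scalar equivalent of the refinement step above (illustrative only): each
 * Newton-Raphson iteration roughly doubles the number of correct bits of an
 * initial reciprocal estimate.
 */
static inline float
rcp_refine_scalar_sketch(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);
}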
2413
2414
2415 LLVMValueRef
2416 lp_build_rcp(struct lp_build_context *bld,
2417 LLVMValueRef a)
2418 {
2419 LLVMBuilderRef builder = bld->gallivm->builder;
2420 const struct lp_type type = bld->type;
2421
2422 assert(lp_check_value(type, a));
2423
2424 if(a == bld->zero)
2425 return bld->undef;
2426 if(a == bld->one)
2427 return bld->one;
2428 if(a == bld->undef)
2429 return bld->undef;
2430
2431 assert(type.floating);
2432
2433 if(LLVMIsConstant(a))
2434 return LLVMConstFDiv(bld->one, a);
2435
2436 /*
2437 * We don't use RCPPS because:
2438 * - it only has 10 bits of precision
2439 * - it doesn't even get the reciprocal of 1.0 exactly
2440 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2441 * - for recent processors the benefit over DIVPS is marginal and case
2442 * dependent
2443 *
2444 * We could still use it on certain processors if benchmarks show that the
2445 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2446 * particular uses that require fewer workarounds.
2447 */
2448
2449 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2450 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2451 const unsigned num_iterations = 0;
2452 LLVMValueRef res;
2453 unsigned i;
2454 const char *intrinsic = NULL;
2455
2456 if (type.length == 4) {
2457 intrinsic = "llvm.x86.sse.rcp.ps";
2458 }
2459 else {
2460 intrinsic = "llvm.x86.avx.rcp.ps.256";
2461 }
2462
2463 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2464
2465 for (i = 0; i < num_iterations; ++i) {
2466 res = lp_build_rcp_refine(bld, a, res);
2467 }
2468
2469 return res;
2470 }
2471
2472 return LLVMBuildFDiv(builder, bld->one, a, "");
2473 }
2474
2475
2476 /**
2477 * Do one Newton-Raphson step to improve rsqrt precision:
2478 *
2479 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2480 *
2481 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2482 */
2483 static INLINE LLVMValueRef
2484 lp_build_rsqrt_refine(struct lp_build_context *bld,
2485 LLVMValueRef a,
2486 LLVMValueRef rsqrt_a)
2487 {
2488 LLVMBuilderRef builder = bld->gallivm->builder;
2489 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2490 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2491 LLVMValueRef res;
2492
2493 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2494 res = LLVMBuildFMul(builder, a, res, "");
2495 res = LLVMBuildFSub(builder, three, res, "");
2496 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2497 res = LLVMBuildFMul(builder, half, res, "");
2498
2499 return res;
2500 }
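/*
 * Scalar equivalent of the rsqrt refinement step above (illustrative only).
 */
static inline float
rsqrt_refine_scalar_sketch(float a, float rsqrt_a)
{
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}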
2501
2502
2503 /**
2504 * Generate 1/sqrt(a).
2505 * Result is undefined for values < 0, infinity for +0.
2506 */
2507 LLVMValueRef
2508 lp_build_rsqrt(struct lp_build_context *bld,
2509 LLVMValueRef a)
2510 {
2511 LLVMBuilderRef builder = bld->gallivm->builder;
2512 const struct lp_type type = bld->type;
2513
2514 assert(lp_check_value(type, a));
2515
2516 assert(type.floating);
2517
2518 /*
2519 * This should be faster but all denormals will end up as infinity.
2520 */
2521 if (0 && lp_build_fast_rsqrt_available(type)) {
2522 const unsigned num_iterations = 1;
2523 LLVMValueRef res;
2524 unsigned i;
2525
2526 /* rsqrt(1.0) != 1.0 here */
2527 res = lp_build_fast_rsqrt(bld, a);
2528
2529 if (num_iterations) {
2530 /*
2531 * Newton-Raphson will result in NaN instead of infinity for zero,
2532 * and NaN instead of zero for infinity.
2533 * Also, need to ensure rsqrt(1.0) == 1.0.
2534 * All numbers smaller than FLT_MIN will result in +infinity
2535 * (rsqrtps treats all denormals as zero).
2536 */
2537 /*
2538 * Certain non-c99 compilers don't know INFINITY and might not support
2539 * hacks to evaluate it at compile time either.
2540 */
2541 const unsigned posinf_int = 0x7F800000;
2542 LLVMValueRef cmp;
2543 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2544 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2545
2546 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2547
2548 for (i = 0; i < num_iterations; ++i) {
2549 res = lp_build_rsqrt_refine(bld, a, res);
2550 }
2551 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2552 res = lp_build_select(bld, cmp, inf, res);
2553 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2554 res = lp_build_select(bld, cmp, bld->zero, res);
2555 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2556 res = lp_build_select(bld, cmp, bld->one, res);
2557 }
2558
2559 return res;
2560 }
2561
2562 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2563 }
2564
2565 /**
2566 * Check whether a fast (but inaccurate) rsqrt instruction is available
2567 * (the caller may want to avoid calling rsqrt_fast if it's not available;
2568 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but without the
2569 * instruction that would turn into sqrt/div/mul, so it is obviously
2570 * better to just call sqrt, skipping both the div and the mul).
2571 */
2572 boolean
2573 lp_build_fast_rsqrt_available(struct lp_type type)
2574 {
2575 assert(type.floating);
2576
2577 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2578 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2579 return true;
2580 }
2581 return false;
2582 }
2583
2584
2585 /**
2586 * Generate 1/sqrt(a).
2587 * Result is undefined for values < 0, infinity for +0.
2588 * Precision is limited, only ~10 bits guaranteed
2589 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2590 */
2591 LLVMValueRef
2592 lp_build_fast_rsqrt(struct lp_build_context *bld,
2593 LLVMValueRef a)
2594 {
2595 LLVMBuilderRef builder = bld->gallivm->builder;
2596 const struct lp_type type = bld->type;
2597
2598 assert(lp_check_value(type, a));
2599
2600 if (lp_build_fast_rsqrt_available(type)) {
2601 const char *intrinsic = NULL;
2602
2603 if (type.length == 4) {
2604 intrinsic = "llvm.x86.sse.rsqrt.ps";
2605 }
2606 else {
2607 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2608 }
2609 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2610 }
2611 else {
2612 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2613 }
2614 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2615 }
2616
2617
2618 /**
2619 * Generate sin(a) or cos(a) using polynomial approximation.
2620 * TODO: it might be worth recognizing sin and cos using same source
2621 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2622 * would be way cheaper than calculating (nearly) everything twice...
2623 * Not sure it's common enough to be worth bothering, however; the scs
2624 * opcode could also benefit from calculating both, though.
2625 */
2626 static LLVMValueRef
2627 lp_build_sin_or_cos(struct lp_build_context *bld,
2628 LLVMValueRef a,
2629 boolean cos)
2630 {
2631 struct gallivm_state *gallivm = bld->gallivm;
2632 LLVMBuilderRef b = gallivm->builder;
2633 struct lp_type int_type = lp_int_type(bld->type);
2634
2635 /*
2636 * take the absolute value,
2637 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2638 */
2639
2640 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2641 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2642
2643 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2644 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2645
2646 /*
2647 * scale by 4/Pi
2648 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2649 */
2650
2651 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2652 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2653
2654 /*
2655 * store the integer part of y in mm0
2656 * emm2 = _mm_cvttps_epi32(y);
2657 */
2658
2659 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2660
2661 /*
2662 * j=(j+1) & (~1) (see the cephes sources)
2663 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2664 */
2665
2666 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2667 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2668 /*
2669 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2670 */
2671 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2672 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2673
2674 /*
2675 * y = _mm_cvtepi32_ps(emm2);
2676 */
2677 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2678
2679 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2680 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2681 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2682 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2683
2684 /*
2685 * Argument used for poly selection and sign bit determination
2686 * is different for sin vs. cos.
2687 */
2688 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2689 emm2_and;
2690
2691 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2692 LLVMBuildNot(b, emm2_2, ""), ""),
2693 const_29, "sign_bit") :
2694 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2695 LLVMBuildShl(b, emm2_add,
2696 const_29, ""), ""),
2697 sign_mask, "sign_bit");
2698
2699 /*
2700 * get the polynom selection mask
2701 * there is one polynom for 0 <= x <= Pi/4
2702 * and another one for Pi/4<x<=Pi/2
2703 * Both branches will be computed.
2704 *
2705 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2706 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2707 */
2708
2709 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2710 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2711 int_type, PIPE_FUNC_EQUAL,
2712 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2713
2714 /*
2715 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2716 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2717 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2718 */
2719 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2720 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2721 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2722
2723 /*
2724 * The magic pass: "Extended precision modular arithmetic"
2725 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2726 * xmm1 = _mm_mul_ps(y, xmm1);
2727 * xmm2 = _mm_mul_ps(y, xmm2);
2728 * xmm3 = _mm_mul_ps(y, xmm3);
2729 */
2730 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2731 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2732 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2733
2734 /*
2735 * x = _mm_add_ps(x, xmm1);
2736 * x = _mm_add_ps(x, xmm2);
2737 * x = _mm_add_ps(x, xmm3);
2738 */
2739
2740 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2741 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2742 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2743
2744 /*
2745 * Evaluate the first polynom (0 <= x <= Pi/4)
2746 *
2747 * z = _mm_mul_ps(x,x);
2748 */
2749 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2750
2751 /*
2752 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2753 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2754 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2755 */
2756 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2757 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2758 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2759
2760 /*
2761 * y = *(v4sf*)_ps_coscof_p0;
2762 * y = _mm_mul_ps(y, z);
2763 */
2764 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2765 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2766 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2767 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2768 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2769 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2770
2771
2772 /*
2773 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2774 * y = _mm_sub_ps(y, tmp);
2775 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2776 */
2777 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2778 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2779 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2780 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2781 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2782
2783 /*
2784 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2785 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2786 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2787 */
2788 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2789 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2790 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2791
2792 /*
2793 * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
2794 *
2795 * y2 = *(v4sf*)_ps_sincof_p0;
2796 * y2 = _mm_mul_ps(y2, z);
2797 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2798 * y2 = _mm_mul_ps(y2, z);
2799 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2800 * y2 = _mm_mul_ps(y2, z);
2801 * y2 = _mm_mul_ps(y2, x);
2802 * y2 = _mm_add_ps(y2, x);
2803 */
2804
2805 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2806 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2807 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2808 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2809 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2810 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2811 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2812
2813 /*
2814 * select the correct result from the two polynoms
2815 * xmm3 = poly_mask;
2816 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2817 * y = _mm_andnot_ps(xmm3, y);
2818 * y = _mm_or_ps(y,y2);
2819 */
2820 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2821 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2822 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2823 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2824 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2825 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2826
2827 /*
2828 * update the sign
2829 * y = _mm_xor_ps(y, sign_bit);
2830 */
2831 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2832 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2833
2834 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2835
2836 /* clamp output to be within [-1, 1] */
2837 y_result = lp_build_clamp(bld, y_result,
2838 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2839 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2840 /* If a is -inf, inf or NaN then return NaN */
2841 y_result = lp_build_select(bld, isfinite, y_result,
2842 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2843 return y_result;
2844 }
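/*
 * Summary of the pipeline above: |a| is scaled by 4/Pi and converted to an
 * integer quadrant index j (with (j+1) & ~1 applied as in the cephes code);
 * the argument is then reduced by subtracting j times Pi/4, split across
 * DP1/DP2/DP3 for extra precision; one of the two polynomials (sin-like or
 * cos-like) is selected per lane from the quadrant bits, and the sign bit
 * derived from the quadrant (and, for sin, from the input sign) is xor'ed
 * back in.  The result is finally clamped to [-1, 1] and forced to NaN for
 * non-finite inputs.
 */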
2845
2846
2847 /**
2848 * Generate sin(a)
2849 */
2850 LLVMValueRef
2851 lp_build_sin(struct lp_build_context *bld,
2852 LLVMValueRef a)
2853 {
2854 return lp_build_sin_or_cos(bld, a, FALSE);
2855 }
2856
2857
2858 /**
2859 * Generate cos(a)
2860 */
2861 LLVMValueRef
2862 lp_build_cos(struct lp_build_context *bld,
2863 LLVMValueRef a)
2864 {
2865 return lp_build_sin_or_cos(bld, a, TRUE);
2866 }
2867
2868
2869 /**
2870 * Generate pow(x, y)
2871 */
2872 LLVMValueRef
2873 lp_build_pow(struct lp_build_context *bld,
2874 LLVMValueRef x,
2875 LLVMValueRef y)
2876 {
2877 /* TODO: optimize the constant case */
2878 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2879 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2880 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2881 __FUNCTION__);
2882 }
2883
2884 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2885 }
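/*
 * Worked example of the identity used above: pow(x, y) = exp2(y * log2(x)),
 * e.g. pow(2.0, 10.0) = exp2(10.0 * 1.0) = 1024.0 (up to the precision of
 * the exp2/log2 approximations).
 */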
2886
2887
2888 /**
2889 * Generate exp(x)
2890 */
2891 LLVMValueRef
2892 lp_build_exp(struct lp_build_context *bld,
2893 LLVMValueRef x)
2894 {
2895 /* log2(e) = 1/log(2) */
2896 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2897 1.4426950408889634);
2898
2899 assert(lp_check_value(bld->type, x));
2900
2901 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2902 }
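/*
 * Worked example: exp(x) = exp2(x * log2(e)), so exp(1.0) = exp2(1.4426950...)
 * ~= 2.7182817, matching e up to the accuracy of the exp2 polynomial.
 */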
2903
2904
2905 /**
2906 * Generate log(x)
2907 * Behavior is undefined with infs, 0s and nans
2908 */
2909 LLVMValueRef
2910 lp_build_log(struct lp_build_context *bld,
2911 LLVMValueRef x)
2912 {
2913 /* log(2) */
2914 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2915 0.69314718055994529);
2916
2917 assert(lp_check_value(bld->type, x));
2918
2919 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2920 }
2921
2922 /**
2923 * Generate log(x) that handles edge cases (infs, 0s and nans)
2924 */
2925 LLVMValueRef
2926 lp_build_log_safe(struct lp_build_context *bld,
2927 LLVMValueRef x)
2928 {
2929 /* log(2) */
2930 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2931 0.69314718055994529);
2932
2933 assert(lp_check_value(bld->type, x));
2934
2935 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2936 }
2937
2938
2939 /**
2940 * Generate polynomial.
2941 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2942 */
2943 LLVMValueRef
2944 lp_build_polynomial(struct lp_build_context *bld,
2945 LLVMValueRef x,
2946 const double *coeffs,
2947 unsigned num_coeffs)
2948 {
2949 const struct lp_type type = bld->type;
2950 LLVMValueRef even = NULL, odd = NULL;
2951 LLVMValueRef x2;
2952 unsigned i;
2953
2954 assert(lp_check_value(bld->type, x));
2955
2956 /* TODO: optimize the constant case */
2957 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2958 LLVMIsConstant(x)) {
2959 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2960 __FUNCTION__);
2961 }
2962
2963 /*
2964 * Calculate odd and even terms separately to decrease data dependency
2965 * Ex:
2966 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2967 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2968 */
2969 x2 = lp_build_mul(bld, x, x);
2970
2971 for (i = num_coeffs; i--; ) {
2972 LLVMValueRef coeff;
2973
2974 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2975
2976 if (i % 2 == 0) {
2977 if (even)
2978 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2979 else
2980 even = coeff;
2981 } else {
2982 if (odd)
2983 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2984 else
2985 odd = coeff;
2986 }
2987 }
2988
2989 if (odd)
2990 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2991 else if (even)
2992 return even;
2993 else
2994 return bld->undef;
2995 }
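/*
 * Scalar sketch of the even/odd split above (illustrative only): the two
 * shorter Horner chains in x^2 run independently, roughly halving the
 * dependency chain compared to a single Horner evaluation.
 */
static inline double
polynomial_scalar_sketch(double x, const double *coeffs, unsigned num_coeffs)
{
   double x2 = x * x;
   double even = 0.0, odd = 0.0;
   unsigned i;
   for (i = num_coeffs; i--; ) {
      if (i % 2 == 0)
         even = coeffs[i] + x2 * even;   /* c[0] + x^2*c[2] + x^4*c[4] + ... */
      else
         odd = coeffs[i] + x2 * odd;     /* c[1] + x^2*c[3] + x^4*c[5] + ... */
   }
   return x * odd + even;
}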
2996
2997
2998 /**
2999 * Minimax polynomial fit of 2**x, in range [0, 1[
3000 */
3001 const double lp_build_exp2_polynomial[] = {
3002 #if EXP_POLY_DEGREE == 5
3003 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3004 0.693153073200168932794,
3005 0.240153617044375388211,
3006 0.0558263180532956664775,
3007 0.00898934009049466391101,
3008 0.00187757667519147912699
3009 #elif EXP_POLY_DEGREE == 4
3010 1.00000259337069434683,
3011 0.693003834469974940458,
3012 0.24144275689150793076,
3013 0.0520114606103070150235,
3014 0.0135341679161270268764
3015 #elif EXP_POLY_DEGREE == 3
3016 0.999925218562710312959,
3017 0.695833540494823811697,
3018 0.226067155427249155588,
3019 0.0780245226406372992967
3020 #elif EXP_POLY_DEGREE == 2
3021 1.00172476321474503578,
3022 0.657636275736077639316,
3023 0.33718943461968720704
3024 #else
3025 #error
3026 #endif
3027 };
3028
3029
3030 LLVMValueRef
3031 lp_build_exp2(struct lp_build_context *bld,
3032 LLVMValueRef x)
3033 {
3034 LLVMBuilderRef builder = bld->gallivm->builder;
3035 const struct lp_type type = bld->type;
3036 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3037 LLVMValueRef ipart = NULL;
3038 LLVMValueRef fpart = NULL;
3039 LLVMValueRef expipart = NULL;
3040 LLVMValueRef expfpart = NULL;
3041 LLVMValueRef res = NULL;
3042
3043 assert(lp_check_value(bld->type, x));
3044
3045
3046 /* TODO: optimize the constant case */
3047 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3048 LLVMIsConstant(x)) {
3049 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3050 __FUNCTION__);
3051 }
3052
3053 assert(type.floating && type.width == 32);
3054
3055 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3056 * the result is INF and if it's smaller than -126.9 the result is 0. */
3057 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3058 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
3059 x = lp_build_max(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x);
3060
3061 /* ipart = floor(x) */
3062 /* fpart = x - ipart */
3063 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3064
3065
3066
3067 /* expipart = (float) (1 << ipart) */
3068 expipart = LLVMBuildAdd(builder, ipart,
3069 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3070 expipart = LLVMBuildShl(builder, expipart,
3071 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3072 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3073
3074
3075 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3076 Elements(lp_build_exp2_polynomial));
3077
3078 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3079
3080
3081 return res;
3082 }
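/*
 * Scalar sketch of the decomposition above (illustrative only; assumes
 * 32-bit IEEE floats and x already clamped as above): 2^x = 2^ipart * 2^fpart,
 * where 2^ipart is built directly in the float exponent field and 2^fpart
 * comes from the polynomial of the fractional part (the degree-2
 * coefficients from the table above are used here just for brevity).
 */
static inline float
exp2_scalar_sketch(float x)
{
   union { int i; float f; } expipart;
   int ipart;
   float fpart, expfpart;

   ipart = (int)x;
   if ((float)ipart > x)
      ipart--;                                /* floor for negative x */
   fpart = x - (float)ipart;                  /* in [0, 1) */

   expipart.i = (ipart + 127) << 23;          /* exponent bits only -> 2^ipart */
   expfpart = 1.00172476f + fpart * (0.65763628f + fpart * 0.33718943f);
   return expipart.f * expfpart;
}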
3083
3084
3085
3086 /**
3087 * Extract the exponent of an IEEE-754 floating point value.
3088 *
3089 * Optionally apply an integer bias.
3090 *
3091 * Result is an integer value with
3092 *
3093 * ifloor(log2(x)) + bias
3094 */
3095 LLVMValueRef
3096 lp_build_extract_exponent(struct lp_build_context *bld,
3097 LLVMValueRef x,
3098 int bias)
3099 {
3100 LLVMBuilderRef builder = bld->gallivm->builder;
3101 const struct lp_type type = bld->type;
3102 unsigned mantissa = lp_mantissa(type);
3103 LLVMValueRef res;
3104
3105 assert(type.floating);
3106
3107 assert(lp_check_value(bld->type, x));
3108
3109 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3110
3111 res = LLVMBuildLShr(builder, x,
3112 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3113 res = LLVMBuildAnd(builder, res,
3114 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3115 res = LLVMBuildSub(builder, res,
3116 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3117
3118 return res;
3119 }
3120
3121
3122 /**
3123 * Extract the mantissa of a floating point value.
3124 *
3125 * Result is a floating point value with
3126 *
3127 * x / 2**floor(log2(x))
3128 */
3129 LLVMValueRef
3130 lp_build_extract_mantissa(struct lp_build_context *bld,
3131 LLVMValueRef x)
3132 {
3133 LLVMBuilderRef builder = bld->gallivm->builder;
3134 const struct lp_type type = bld->type;
3135 unsigned mantissa = lp_mantissa(type);
3136 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3137 (1ULL << mantissa) - 1);
3138 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3139 LLVMValueRef res;
3140
3141 assert(lp_check_value(bld->type, x));
3142
3143 assert(type.floating);
3144
3145 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3146
3147 /* res = x / 2**ipart */
3148 res = LLVMBuildAnd(builder, x, mantmask, "");
3149 res = LLVMBuildOr(builder, res, one, "");
3150 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3151
3152 return res;
3153 }
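/*
 * Scalar sketch of the two helpers above (illustrative only, 32-bit IEEE
 * floats, assumes 'unsigned' is 32 bits): bits 23..30 hold the exponent
 * biased by 127; forcing that field to the bias while keeping the mantissa
 * bits yields x / 2**floor(log2(x)), a value in [1, 2).
 */
static inline float
extract_mantissa_scalar_sketch(float x)
{
   union { unsigned u; float f; } v;
   v.f = x;
   v.u = (v.u & 0x007fffffu) | 0x3f800000u;   /* keep mantissa, set exponent to 0 */
   return v.f;
}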
3154
3155
3156
3157 /**
3158 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3159 * These coefficients can be generated with
3160 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3161 */
3162 const double lp_build_log2_polynomial[] = {
3163 #if LOG_POLY_DEGREE == 5
3164 2.88539008148777786488L,
3165 0.961796878841293367824L,
3166 0.577058946784739859012L,
3167 0.412914355135828735411L,
3168 0.308591899232910175289L,
3169 0.352376952300281371868L,
3170 #elif LOG_POLY_DEGREE == 4
3171 2.88539009343309178325L,
3172 0.961791550404184197881L,
3173 0.577440339438736392009L,
3174 0.403343858251329912514L,
3175 0.406718052498846252698L,
3176 #elif LOG_POLY_DEGREE == 3
3177 2.88538959748872753838L,
3178 0.961932915889597772928L,
3179 0.571118517972136195241L,
3180 0.493997535084709500285L,
3181 #else
3182 #error
3183 #endif
3184 };
3185
3186 /**
3187 * See http://www.devmaster.net/forums/showthread.php?p=43580
3188 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3189 * http://www.nezumi.demon.co.uk/consult/logx.htm
3190 *
3191 * If handle_edge_cases is true the function will perform computations
3192 * to match the required D3D10+ behavior for each of the edge cases.
3193 * That means that if input is:
3194 * - less than zero (to and including -inf) then NaN will be returned
3195 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3196 * - +infinity, then +infinity will be returned
3197 * - NaN, then NaN will be returned
3198 *
3199 * Those checks are fairly expensive so if you don't need them make sure
3200 * handle_edge_cases is false.
3201 */
3202 void
3203 lp_build_log2_approx(struct lp_build_context *bld,
3204 LLVMValueRef x,
3205 LLVMValueRef *p_exp,
3206 LLVMValueRef *p_floor_log2,
3207 LLVMValueRef *p_log2,
3208 boolean handle_edge_cases)
3209 {
3210 LLVMBuilderRef builder = bld->gallivm->builder;
3211 const struct lp_type type = bld->type;
3212 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3213 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3214
3215 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3216 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3217 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3218
3219 LLVMValueRef i = NULL;
3220 LLVMValueRef y = NULL;
3221 LLVMValueRef z = NULL;
3222 LLVMValueRef exp = NULL;
3223 LLVMValueRef mant = NULL;
3224 LLVMValueRef logexp = NULL;
3225 LLVMValueRef logmant = NULL;
3226 LLVMValueRef res = NULL;
3227
3228 assert(lp_check_value(bld->type, x));
3229
3230 if(p_exp || p_floor_log2 || p_log2) {
3231 /* TODO: optimize the constant case */
3232 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3233 LLVMIsConstant(x)) {
3234 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3235 __FUNCTION__);
3236 }
3237
3238 assert(type.floating && type.width == 32);
3239
3240 /*
3241 * We don't explicitly handle denormalized numbers. They will yield a
3242 * result in the neighbourhood of -127, which appears to be adequate
3243 * enough.
3244 */
3245
3246 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3247
3248 /* exp = (float) exponent(x) */
3249 exp = LLVMBuildAnd(builder, i, expmask, "");
3250 }
3251
3252 if(p_floor_log2 || p_log2) {
3253 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3254 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3255 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3256 }
3257
3258 if(p_log2) {
3259 /* mant = 1 + (float) mantissa(x) */
3260 mant = LLVMBuildAnd(builder, i, mantmask, "");
3261 mant = LLVMBuildOr(builder, mant, one, "");
3262 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3263
3264 /* y = (mant - 1) / (mant + 1) */
3265 y = lp_build_div(bld,
3266 lp_build_sub(bld, mant, bld->one),
3267 lp_build_add(bld, mant, bld->one)
3268 );
3269
3270 /* z = y^2 */
3271 z = lp_build_mul(bld, y, y);
3272
3273 /* compute P(z) */
3274 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3275 Elements(lp_build_log2_polynomial));
3276
3277 /* logmant = y * P(z) */
3278 logmant = lp_build_mul(bld, y, logmant);
3279
3280 res = lp_build_add(bld, logmant, logexp);
3281
3282 if (type.floating && handle_edge_cases) {
3283 LLVMValueRef negmask, infmask, zmask;
3284 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3285 lp_build_const_vec(bld->gallivm, type, 0.0f));
3286 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3287 lp_build_const_vec(bld->gallivm, type, 0.0f));
3288 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3289 lp_build_const_vec(bld->gallivm, type, INFINITY));
3290
3291 /* If x is equal to inf make sure we return inf */
3292 res = lp_build_select(bld, infmask,
3293 lp_build_const_vec(bld->gallivm, type, INFINITY),
3294 res);
3295 /* If x is equal to 0, return -inf */
3296 res = lp_build_select(bld, zmask,
3297 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3298 res);
3299 /* If x is nan or less than 0, return nan */
3300 res = lp_build_select(bld, negmask,
3301 lp_build_const_vec(bld->gallivm, type, NAN),
3302 res);
3303 }
3304 }
3305
3306 if(p_exp) {
3307 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3308 *p_exp = exp;
3309 }
3310
3311 if(p_floor_log2)
3312 *p_floor_log2 = logexp;
3313
3314 if(p_log2)
3315 *p_log2 = res;
3316 }
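/*
 * Scalar sketch of the approximation above (illustrative only, 32-bit IEEE
 * floats, no edge-case handling): log2(x) = exponent + y * P(y^2) with
 * y = (mant - 1) / (mant + 1), using the same coefficient table.
 */
static inline float
log2_scalar_sketch(float x)
{
   union { unsigned u; float f; } v;
   float exponent, mant, y, z, p;
   unsigned i;

   v.f = x;
   exponent = (float)((int)((v.u >> 23) & 0xff) - 127);
   v.u = (v.u & 0x007fffffu) | 0x3f800000u;    /* mant in [1, 2) */
   mant = v.f;

   y = (mant - 1.0f) / (mant + 1.0f);
   z = y * y;
   p = 0.0f;
   for (i = Elements(lp_build_log2_polynomial); i--; )
      p = (float)lp_build_log2_polynomial[i] + z * p;

   return exponent + y * p;
}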
3317
3318
3319 /*
3320 * log2 implementation which doesn't have special code to
3321 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3322 * the results for those cases are undefined.
3323 */
3324 LLVMValueRef
3325 lp_build_log2(struct lp_build_context *bld,
3326 LLVMValueRef x)
3327 {
3328 LLVMValueRef res;
3329 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3330 return res;
3331 }
3332
3333 /*
3334 * Version of log2 which handles all edge cases.
3335 * Look at documentation of lp_build_log2_approx for
3336 * description of the behavior for each of the edge cases.
3337 */
3338 LLVMValueRef
3339 lp_build_log2_safe(struct lp_build_context *bld,
3340 LLVMValueRef x)
3341 {
3342 LLVMValueRef res;
3343 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3344 return res;
3345 }
3346
3347
3348 /**
3349 * Faster (and less accurate) log2.
3350 *
3351 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3352 *
3353 * Piece-wise linear approximation, with exact results when x is a
3354 * power of two.
3355 *
3356 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3357 */
3358 LLVMValueRef
3359 lp_build_fast_log2(struct lp_build_context *bld,
3360 LLVMValueRef x)
3361 {
3362 LLVMBuilderRef builder = bld->gallivm->builder;
3363 LLVMValueRef ipart;
3364 LLVMValueRef fpart;
3365
3366 assert(lp_check_value(bld->type, x));
3367
3368 assert(bld->type.floating);
3369
3370 /* ipart = floor(log2(x)) - 1 */
3371 ipart = lp_build_extract_exponent(bld, x, -1);
3372 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3373
3374 /* fpart = x / 2**ipart */
3375 fpart = lp_build_extract_mantissa(bld, x);
3376
3377 /* ipart + fpart */
3378 return LLVMBuildFAdd(builder, ipart, fpart, "");
3379 }
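/*
 * Worked example of the piece-wise linear approximation above:
 * fast_log2(4.0) = (2 - 1) + 4.0/4.0 = 2.0 (exact, power of two), while
 * fast_log2(6.0) = (2 - 1) + 6.0/4.0 = 2.5 versus the true log2(6) ~= 2.585.
 */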
3380
3381
3382 /**
3383 * Fast implementation of iround(log2(x)).
3384 *
3385 * Not an approximation -- it should give accurate results all the time.
3386 */
3387 LLVMValueRef
3388 lp_build_ilog2(struct lp_build_context *bld,
3389 LLVMValueRef x)
3390 {
3391 LLVMBuilderRef builder = bld->gallivm->builder;
3392 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3393 LLVMValueRef ipart;
3394
3395 assert(bld->type.floating);
3396
3397 assert(lp_check_value(bld->type, x));
3398
3399 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3400 x = LLVMBuildFMul(builder, x, sqrt2, "");
3401
3402 /* ipart = floor(log2(x) + 0.5) */
3403 ipart = lp_build_extract_exponent(bld, x, 0);
3404
3405 return ipart;
3406 }
3407
3408 LLVMValueRef
3409 lp_build_mod(struct lp_build_context *bld,
3410 LLVMValueRef x,
3411 LLVMValueRef y)
3412 {
3413 LLVMBuilderRef builder = bld->gallivm->builder;
3414 LLVMValueRef res;
3415 const struct lp_type type = bld->type;
3416
3417 assert(lp_check_value(type, x));
3418 assert(lp_check_value(type, y));
3419
3420 if (type.floating)
3421 res = LLVMBuildFRem(builder, x, y, "");
3422 else if (type.sign)
3423 res = LLVMBuildSRem(builder, x, y, "");
3424 else
3425 res = LLVMBuildURem(builder, x, y, "");
3426 return res;
3427 }
3428
3429
3430 /*
3431 * For floating inputs it creates and returns a mask
3432 * which is all 1's for channels which are NaN.
3433 * Channels inside x which are not NaN will be 0.
3434 */
3435 LLVMValueRef
3436 lp_build_isnan(struct lp_build_context *bld,
3437 LLVMValueRef x)
3438 {
3439 LLVMValueRef mask;
3440 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3441
3442 assert(bld->type.floating);
3443 assert(lp_check_value(bld->type, x));
3444
3445 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3446 "isnotnan");
3447 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3448 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3449 return mask;
3450 }
3451
3452 /* Returns all 1's for floating point numbers that are
3453 * finite numbers and returns all zeros for -inf,
3454 * inf and nan's */
3455 LLVMValueRef
3456 lp_build_isfinite(struct lp_build_context *bld,
3457 LLVMValueRef x)
3458 {
3459 LLVMBuilderRef builder = bld->gallivm->builder;
3460 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3461 struct lp_type int_type = lp_int_type(bld->type);
3462 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3463 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3464 0x7f800000);
3465
3466 if (!bld->type.floating) {
3467 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3468 }
3469 assert(bld->type.floating);
3470 assert(lp_check_value(bld->type, x));
3471 assert(bld->type.width == 32);
3472
3473 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3474 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3475 intx, infornan32);
3476 }
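/*
 * Scalar sketch of the test above (illustrative only, 32-bit IEEE floats,
 * assumes 'unsigned' is 32 bits): a float is finite iff its exponent field
 * is not all ones.
 */
static inline int
isfinite_scalar_sketch(float x)
{
   union { unsigned u; float f; } v;
   v.f = x;
   return (v.u & 0x7f800000u) != 0x7f800000u;
}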
3477
3478 /*
3479 * Returns true if the number is nan or inf and false otherwise.
3480 * The input has to be a floating point vector.
3481 */
3482 LLVMValueRef
3483 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3484 const struct lp_type type,
3485 LLVMValueRef x)
3486 {
3487 LLVMBuilderRef builder = gallivm->builder;
3488 struct lp_type int_type = lp_int_type(type);
3489 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3490 0x7f800000);
3491 LLVMValueRef ret;
3492
3493 assert(type.floating);
3494
3495 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3496 ret = LLVMBuildAnd(builder, ret, const0, "");
3497 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3498 ret, const0);
3499
3500 return ret;
3501 }
3502
3503
3504 LLVMValueRef
3505 lp_build_fpstate_get(struct gallivm_state *gallivm)
3506 {
3507 if (util_cpu_caps.has_sse) {
3508 LLVMBuilderRef builder = gallivm->builder;
3509 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3510 gallivm,
3511 LLVMInt32TypeInContext(gallivm->context),
3512 "mxcsr_ptr");
3513 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3514 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3515 lp_build_intrinsic(builder,
3516 "llvm.x86.sse.stmxcsr",
3517 LLVMVoidTypeInContext(gallivm->context),
3518 &mxcsr_ptr8, 1);
3519 return mxcsr_ptr;
3520 }
3521 return 0;
3522 }
3523
3524 void
3525 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3526 boolean zero)
3527 {
3528 if (util_cpu_caps.has_sse) {
3529 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3530 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3531
3532 LLVMBuilderRef builder = gallivm->builder;
3533 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3534 LLVMValueRef mxcsr =
3535 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3536
3537 if (util_cpu_caps.has_daz) {
3538 /* Enable denormals-are-zero mode */
3539 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3540 }
3541 if (zero) {
3542 mxcsr = LLVMBuildOr(builder, mxcsr,
3543 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3544 } else {
3545 mxcsr = LLVMBuildAnd(builder, mxcsr,
3546 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3547 }
3548
3549 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3550 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3551 }
3552 }
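/*
 * For reference, a host-side sketch of the same MXCSR manipulation the
 * generated code performs (illustrative only, requires SSE at compile time;
 * the hypothetical helper below is not used anywhere):
 */
#if defined(PIPE_ARCH_SSE)
static inline void
set_denorms_zero_host_sketch(boolean zero)
{
   unsigned mxcsr = _mm_getcsr();
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK;         /* FTZ bit */
   if (util_cpu_caps.has_daz)
      daz_ftz |= _MM_DENORMALS_ZERO_MASK;          /* DAZ bit, if supported */
   if (zero)
      mxcsr |= daz_ftz;
   else
      mxcsr &= ~daz_ftz;
   _mm_setcsr(mxcsr);
}
#endif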
3553
3554 void
3555 lp_build_fpstate_set(struct gallivm_state *gallivm,
3556 LLVMValueRef mxcsr_ptr)
3557 {
3558 if (util_cpu_caps.has_sse) {
3559 LLVMBuilderRef builder = gallivm->builder;
3560 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3561 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3562 lp_build_intrinsic(builder,
3563 "llvm.x86.sse.ldmxcsr",
3564 LLVMVoidTypeInContext(gallivm->context),
3565 &mxcsr_ptr, 1);
3566 }
3567 }