gallivm,draw,llvmpipe: Remove support for versions of LLVM prior to 3.1.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31  * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86  * No checks for special-case values of a or b (0 or 1) are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
146 intr_size = 128;
147 if ((type.width == 8 || type.width == 16) &&
148 (type.width * type.length <= 64) &&
149 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
150 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
151 __FUNCTION__);
152 }
153 if (type.width == 8 && !type.sign) {
154 intrinsic = "llvm.x86.sse2.pminu.b";
155 }
156 else if (type.width == 16 && type.sign) {
157 intrinsic = "llvm.x86.sse2.pmins.w";
158 }
159 if (util_cpu_caps.has_sse4_1) {
160 if (type.width == 8 && type.sign) {
161 intrinsic = "llvm.x86.sse41.pminsb";
162 }
163 if (type.width == 16 && !type.sign) {
164 intrinsic = "llvm.x86.sse41.pminuw";
165 }
166 if (type.width == 32 && !type.sign) {
167 intrinsic = "llvm.x86.sse41.pminud";
168 }
169 if (type.width == 32 && type.sign) {
170 intrinsic = "llvm.x86.sse41.pminsd";
171 }
172 }
173 } else if (util_cpu_caps.has_altivec) {
174 intr_size = 128;
175 if (type.width == 8) {
176 if (!type.sign) {
177 intrinsic = "llvm.ppc.altivec.vminub";
178 } else {
179 intrinsic = "llvm.ppc.altivec.vminsb";
180 }
181 } else if (type.width == 16) {
182 if (!type.sign) {
183 intrinsic = "llvm.ppc.altivec.vminuh";
184 } else {
185 intrinsic = "llvm.ppc.altivec.vminsh";
186 }
187 } else if (type.width == 32) {
188 if (!type.sign) {
189 intrinsic = "llvm.ppc.altivec.vminuw";
190 } else {
191 intrinsic = "llvm.ppc.altivec.vminsw";
192 }
193 }
194 }
195
196 if(intrinsic) {
197 /* We need to handle nan's for floating point numbers. If one of the
198 * inputs is nan the other should be returned (required by both D3D10+
199 * and OpenCL).
200        * The SSE intrinsics return the second operand if either operand is NaN,
201        * so we need special code to handle those cases.
202 */
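      /*
       * Concretely, minps(NaN, x) yields x while minps(x, NaN) yields NaN
       * (the second operand is returned whenever either operand is NaN),
       * hence the isnan/select fixup below.
       */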
203 if (util_cpu_caps.has_sse && type.floating &&
204 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
205 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
206 LLVMValueRef isnan, max;
207 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
208 type,
209 intr_size, a, b);
210 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
211 isnan = lp_build_isnan(bld, b);
212 return lp_build_select(bld, isnan, a, max);
213 } else {
214 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
215 isnan = lp_build_isnan(bld, a);
216 return lp_build_select(bld, isnan, a, max);
217 }
218 } else {
219 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
220 type,
221 intr_size, a, b);
222 }
223 }
224
225 if (type.floating) {
226 switch (nan_behavior) {
227 case GALLIVM_NAN_RETURN_NAN: {
228 LLVMValueRef isnan = lp_build_isnan(bld, b);
229 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
230 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
231 return lp_build_select(bld, cond, a, b);
232 }
233 break;
234 case GALLIVM_NAN_RETURN_OTHER: {
235 LLVMValueRef isnan = lp_build_isnan(bld, a);
236 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
237 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
238 return lp_build_select(bld, cond, a, b);
239 }
240 break;
241 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
242 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
243 return lp_build_select(bld, cond, a, b);
244 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 return lp_build_select(bld, cond, a, b);
247 break;
248 default:
249 assert(0);
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 }
253 } else {
254 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
255 return lp_build_select(bld, cond, a, b);
256 }
257 }
258
259
260 /**
261 * Generate max(a, b)
262  * No checks for special-case values of a or b (0 or 1) are done.
263 * NaN's are handled according to the behavior specified by the
264 * nan_behavior argument.
265 */
266 static LLVMValueRef
267 lp_build_max_simple(struct lp_build_context *bld,
268 LLVMValueRef a,
269 LLVMValueRef b,
270 enum gallivm_nan_behavior nan_behavior)
271 {
272 const struct lp_type type = bld->type;
273 const char *intrinsic = NULL;
274 unsigned intr_size = 0;
275 LLVMValueRef cond;
276
277 assert(lp_check_value(type, a));
278 assert(lp_check_value(type, b));
279
280 /* TODO: optimize the constant case */
281
282 if (type.floating && util_cpu_caps.has_sse) {
283 if (type.width == 32) {
284 if (type.length == 1) {
285 intrinsic = "llvm.x86.sse.max.ss";
286 intr_size = 128;
287 }
288 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
289 intrinsic = "llvm.x86.sse.max.ps";
290 intr_size = 128;
291 }
292 else {
293 intrinsic = "llvm.x86.avx.max.ps.256";
294 intr_size = 256;
295 }
296 }
297 if (type.width == 64 && util_cpu_caps.has_sse2) {
298 if (type.length == 1) {
299 intrinsic = "llvm.x86.sse2.max.sd";
300 intr_size = 128;
301 }
302 else if (type.length == 2 || !util_cpu_caps.has_avx) {
303 intrinsic = "llvm.x86.sse2.max.pd";
304 intr_size = 128;
305 }
306 else {
307 intrinsic = "llvm.x86.avx.max.pd.256";
308 intr_size = 256;
309 }
310 }
311 }
312 else if (type.floating && util_cpu_caps.has_altivec) {
313 if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
314 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
315 __FUNCTION__);
316 }
317       if (type.width == 32 && type.length == 4) {
318 intrinsic = "llvm.ppc.altivec.vmaxfp";
319 intr_size = 128;
320 }
321 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
322 intr_size = 128;
323 if ((type.width == 8 || type.width == 16) &&
324 (type.width * type.length <= 64) &&
325 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
326 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
327 __FUNCTION__);
328 }
329 if (type.width == 8 && !type.sign) {
330 intrinsic = "llvm.x86.sse2.pmaxu.b";
331 intr_size = 128;
332 }
333 else if (type.width == 16 && type.sign) {
334 intrinsic = "llvm.x86.sse2.pmaxs.w";
335 }
336 if (util_cpu_caps.has_sse4_1) {
337 if (type.width == 8 && type.sign) {
338 intrinsic = "llvm.x86.sse41.pmaxsb";
339 }
340 if (type.width == 16 && !type.sign) {
341 intrinsic = "llvm.x86.sse41.pmaxuw";
342 }
343 if (type.width == 32 && !type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxud";
345 }
346 if (type.width == 32 && type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxsd";
348 }
349 }
350 } else if (util_cpu_caps.has_altivec) {
351 intr_size = 128;
352 if (type.width == 8) {
353 if (!type.sign) {
354 intrinsic = "llvm.ppc.altivec.vmaxub";
355 } else {
356 intrinsic = "llvm.ppc.altivec.vmaxsb";
357 }
358 } else if (type.width == 16) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxuh";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsh";
363 }
364 } else if (type.width == 32) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuw";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsw";
369 }
370 }
371 }
372
373 if(intrinsic) {
374 if (util_cpu_caps.has_sse && type.floating &&
375 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
376 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
377 LLVMValueRef isnan, min;
378 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
379 type,
380 intr_size, a, b);
381 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
382 isnan = lp_build_isnan(bld, b);
383 return lp_build_select(bld, isnan, a, min);
384 } else {
385 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
386 isnan = lp_build_isnan(bld, a);
387 return lp_build_select(bld, isnan, a, min);
388 }
389 } else {
390 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
391 type,
392 intr_size, a, b);
393 }
394 }
395
396 if (type.floating) {
397 switch (nan_behavior) {
398 case GALLIVM_NAN_RETURN_NAN: {
399 LLVMValueRef isnan = lp_build_isnan(bld, b);
400 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
401 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
402 return lp_build_select(bld, cond, a, b);
403 }
404 break;
405 case GALLIVM_NAN_RETURN_OTHER: {
406 LLVMValueRef isnan = lp_build_isnan(bld, a);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
413 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
414 return lp_build_select(bld, cond, a, b);
415 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
416 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
417 return lp_build_select(bld, cond, a, b);
418 break;
419 default:
420 assert(0);
421 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
422 return lp_build_select(bld, cond, a, b);
423 }
424 } else {
425 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
426 return lp_build_select(bld, cond, a, b);
427 }
428 }
429
430
431 /**
432 * Generate 1 - a, or ~a depending on bld->type.
433 */
434 LLVMValueRef
435 lp_build_comp(struct lp_build_context *bld,
436 LLVMValueRef a)
437 {
438 LLVMBuilderRef builder = bld->gallivm->builder;
439 const struct lp_type type = bld->type;
440
441 assert(lp_check_value(type, a));
442
443 if(a == bld->one)
444 return bld->zero;
445 if(a == bld->zero)
446 return bld->one;
447
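   /*
    * For unsigned normalized types 1.0 is represented by all-ones bits
    * (e.g. 0xff for 8 bits), so 1 - a is simply the bitwise complement.
    */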
448 if(type.norm && !type.floating && !type.fixed && !type.sign) {
449 if(LLVMIsConstant(a))
450 return LLVMConstNot(a);
451 else
452 return LLVMBuildNot(builder, a, "");
453 }
454
455 if(LLVMIsConstant(a))
456 if (type.floating)
457 return LLVMConstFSub(bld->one, a);
458 else
459 return LLVMConstSub(bld->one, a);
460 else
461 if (type.floating)
462 return LLVMBuildFSub(builder, bld->one, a, "");
463 else
464 return LLVMBuildSub(builder, bld->one, a, "");
465 }
466
467
468 /**
469 * Generate a + b
470 */
471 LLVMValueRef
472 lp_build_add(struct lp_build_context *bld,
473 LLVMValueRef a,
474 LLVMValueRef b)
475 {
476 LLVMBuilderRef builder = bld->gallivm->builder;
477 const struct lp_type type = bld->type;
478 LLVMValueRef res;
479
480 assert(lp_check_value(type, a));
481 assert(lp_check_value(type, b));
482
483 if(a == bld->zero)
484 return b;
485 if(b == bld->zero)
486 return a;
487 if(a == bld->undef || b == bld->undef)
488 return bld->undef;
489
490 if(bld->type.norm) {
491 const char *intrinsic = NULL;
492
493 if(a == bld->one || b == bld->one)
494 return bld->one;
495
496 if (type.width * type.length == 128 &&
497 !type.floating && !type.fixed) {
498 if(util_cpu_caps.has_sse2) {
499 if(type.width == 8)
500 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
501 if(type.width == 16)
502 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
503 } else if (util_cpu_caps.has_altivec) {
504 if(type.width == 8)
505 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
506 if(type.width == 16)
507 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
508 }
509 }
510
511 if(intrinsic)
512 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
513 }
514
515 /* TODO: handle signed case */
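   /*
    * Unsigned saturation without intrinsics: clamp a to (1 - b) first,
    * so the plain addition below can never wrap past 1.0.
    */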
516 if(type.norm && !type.floating && !type.fixed && !type.sign)
517 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
518
519 if(LLVMIsConstant(a) && LLVMIsConstant(b))
520 if (type.floating)
521 res = LLVMConstFAdd(a, b);
522 else
523 res = LLVMConstAdd(a, b);
524 else
525 if (type.floating)
526 res = LLVMBuildFAdd(builder, a, b, "");
527 else
528 res = LLVMBuildAdd(builder, a, b, "");
529
530 /* clamp to ceiling of 1.0 */
531 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
532 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533
534 /* XXX clamp to floor of -1 or 0??? */
535
536 return res;
537 }
538
539
540 /** Return the scalar sum of the elements of a.
541 * Should avoid this operation whenever possible.
542 */
543 LLVMValueRef
544 lp_build_horizontal_add(struct lp_build_context *bld,
545 LLVMValueRef a)
546 {
547 LLVMBuilderRef builder = bld->gallivm->builder;
548 const struct lp_type type = bld->type;
549 LLVMValueRef index, res;
550 unsigned i, length;
551 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
552 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
553 LLVMValueRef vecres, elem2;
554
555 assert(lp_check_value(type, a));
556
557 if (type.length == 1) {
558 return a;
559 }
560
561 assert(!bld->type.norm);
562
563 /*
564     * For byte vectors we could do much better with psadbw;
565     * we use repeated shuffle/adds here. Note with multiple vectors
566 * this can be done more efficiently as outlined in the intel
567 * optimization manual.
568 * Note: could cause data rearrangement if used with smaller element
569 * sizes.
570 */
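   /*
    * Illustrative reduction for an 8-wide vector {a0,...,a7}:
    *   {a0,a1,a2,a3} + {a4,a5,a6,a7} -> {s0,s1,s2,s3}
    *   {s0,s1} + {s2,s3}             -> {t0,t1}
    * followed by a scalar t0 + t1 on the extracted elements below.
    */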
571
572 vecres = a;
573 length = type.length / 2;
574 while (length > 1) {
575 LLVMValueRef vec1, vec2;
576 for (i = 0; i < length; i++) {
577 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
578 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
579 }
580 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
581 LLVMConstVector(shuffles1, length), "");
582 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
583 LLVMConstVector(shuffles2, length), "");
584 if (type.floating) {
585 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
586 }
587 else {
588 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
589 }
590 length = length >> 1;
591 }
592
593 /* always have vector of size 2 here */
594 assert(length == 1);
595
596 index = lp_build_const_int32(bld->gallivm, 0);
597 res = LLVMBuildExtractElement(builder, vecres, index, "");
598 index = lp_build_const_int32(bld->gallivm, 1);
599 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
600
601 if (type.floating)
602 res = LLVMBuildFAdd(builder, res, elem2, "");
603 else
604 res = LLVMBuildAdd(builder, res, elem2, "");
605
606 return res;
607 }
608
609 /**
610 * Return the horizontal sums of 4 float vectors as a float4 vector.
611 * This uses the technique as outlined in Intel Optimization Manual.
612 */
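/*
 * Illustrative data flow, with src[0] = {x0,x1,x2,x3} ... src[3] = {w0,w1,w2,w3}:
 *   tmp:    {x0,x1,y0,y1} {x2,x3,y2,y3} {z0,z1,w0,w1} {z2,z3,w2,w3}
 *   sumtmp: {x0+x2, x1+x3, y0+y2, y1+y3} {z0+z2, z1+z3, w0+w2, w1+w3}
 *   result: {sum(x), sum(y), sum(z), sum(w)}
 *
 * Scalar sketch of the overall effect (illustrative only, not compiled):
 */
#if 0
static void
hadd4x4f_ref(const float src[4][4], float result[4])
{
   for (unsigned i = 0; i < 4; i++)
      result[i] = src[i][0] + src[i][1] + src[i][2] + src[i][3];
}
#endif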
613 static LLVMValueRef
614 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
615 LLVMValueRef src[4])
616 {
617 struct gallivm_state *gallivm = bld->gallivm;
618 LLVMBuilderRef builder = gallivm->builder;
619 LLVMValueRef shuffles[4];
620 LLVMValueRef tmp[4];
621 LLVMValueRef sumtmp[2], shuftmp[2];
622
623 /* lower half of regs */
624 shuffles[0] = lp_build_const_int32(gallivm, 0);
625 shuffles[1] = lp_build_const_int32(gallivm, 1);
626 shuffles[2] = lp_build_const_int32(gallivm, 4);
627 shuffles[3] = lp_build_const_int32(gallivm, 5);
628 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
629 LLVMConstVector(shuffles, 4), "");
630 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
631 LLVMConstVector(shuffles, 4), "");
632
633 /* upper half of regs */
634 shuffles[0] = lp_build_const_int32(gallivm, 2);
635 shuffles[1] = lp_build_const_int32(gallivm, 3);
636 shuffles[2] = lp_build_const_int32(gallivm, 6);
637 shuffles[3] = lp_build_const_int32(gallivm, 7);
638 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
639 LLVMConstVector(shuffles, 4), "");
640 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
641 LLVMConstVector(shuffles, 4), "");
642
643 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
644 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
645
646 shuffles[0] = lp_build_const_int32(gallivm, 0);
647 shuffles[1] = lp_build_const_int32(gallivm, 2);
648 shuffles[2] = lp_build_const_int32(gallivm, 4);
649 shuffles[3] = lp_build_const_int32(gallivm, 6);
650 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
651 LLVMConstVector(shuffles, 4), "");
652
653 shuffles[0] = lp_build_const_int32(gallivm, 1);
654 shuffles[1] = lp_build_const_int32(gallivm, 3);
655 shuffles[2] = lp_build_const_int32(gallivm, 5);
656 shuffles[3] = lp_build_const_int32(gallivm, 7);
657 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
658 LLVMConstVector(shuffles, 4), "");
659
660 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
661 }
662
663
664 /*
665 * partially horizontally add 2-4 float vectors with length nx4,
666 * i.e. only four adjacent values in each vector will be added,
667 * assuming values are really grouped in 4 which also determines
668 * output order.
669 *
670 * Return a vector of the same length as the initial vectors,
671 * with the excess elements (if any) being undefined.
672 * The element order is independent of number of input vectors.
673 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
674 * the output order thus will be
675 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef
676 */
677 LLVMValueRef
678 lp_build_hadd_partial4(struct lp_build_context *bld,
679 LLVMValueRef vectors[],
680 unsigned num_vecs)
681 {
682 struct gallivm_state *gallivm = bld->gallivm;
683 LLVMBuilderRef builder = gallivm->builder;
684 LLVMValueRef ret_vec;
685 LLVMValueRef tmp[4];
686 const char *intrinsic = NULL;
687
688 assert(num_vecs >= 2 && num_vecs <= 4);
689 assert(bld->type.floating);
690
691 /* only use this with at least 2 vectors, as it is sort of expensive
692 * (depending on cpu) and we always need two horizontal adds anyway,
693 * so a shuffle/add approach might be better.
694 */
695
696 tmp[0] = vectors[0];
697 tmp[1] = vectors[1];
698
699 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
700 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
701
702 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
703 bld->type.length == 4) {
704 intrinsic = "llvm.x86.sse3.hadd.ps";
705 }
706 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
707 bld->type.length == 8) {
708 intrinsic = "llvm.x86.avx.hadd.ps.256";
709 }
710 if (intrinsic) {
711 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
712 lp_build_vec_type(gallivm, bld->type),
713 tmp[0], tmp[1]);
714 if (num_vecs > 2) {
715 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
716 lp_build_vec_type(gallivm, bld->type),
717 tmp[2], tmp[3]);
718 }
719 else {
720 tmp[1] = tmp[0];
721 }
722 return lp_build_intrinsic_binary(builder, intrinsic,
723 lp_build_vec_type(gallivm, bld->type),
724 tmp[0], tmp[1]);
725 }
726
727 if (bld->type.length == 4) {
728 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
729 }
730 else {
731 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
732 unsigned j;
733 unsigned num_iter = bld->type.length / 4;
734 struct lp_type parttype = bld->type;
735 parttype.length = 4;
736 for (j = 0; j < num_iter; j++) {
737 LLVMValueRef partsrc[4];
738 unsigned i;
739 for (i = 0; i < 4; i++) {
740 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
741 }
742 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
743 }
744 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
745 }
746 return ret_vec;
747 }
748
749 /**
750 * Generate a - b
751 */
752 LLVMValueRef
753 lp_build_sub(struct lp_build_context *bld,
754 LLVMValueRef a,
755 LLVMValueRef b)
756 {
757 LLVMBuilderRef builder = bld->gallivm->builder;
758 const struct lp_type type = bld->type;
759 LLVMValueRef res;
760
761 assert(lp_check_value(type, a));
762 assert(lp_check_value(type, b));
763
764 if(b == bld->zero)
765 return a;
766 if(a == bld->undef || b == bld->undef)
767 return bld->undef;
768 if(a == b)
769 return bld->zero;
770
771 if(bld->type.norm) {
772 const char *intrinsic = NULL;
773
774 if(b == bld->one)
775 return bld->zero;
776
777 if (type.width * type.length == 128 &&
778 !type.floating && !type.fixed) {
779 if (util_cpu_caps.has_sse2) {
780 if(type.width == 8)
781 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
782 if(type.width == 16)
783 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
784 } else if (util_cpu_caps.has_altivec) {
785 if(type.width == 8)
786 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
787 if(type.width == 16)
788 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
789 }
790 }
791
792 if(intrinsic)
793 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
794 }
795
796 /* TODO: handle signed case */
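   /*
    * Unsigned saturation without intrinsics: raise a to at least b first,
    * so the plain subtraction below can never go below zero.
    */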
797 if(type.norm && !type.floating && !type.fixed && !type.sign)
798 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
799
800 if(LLVMIsConstant(a) && LLVMIsConstant(b))
801 if (type.floating)
802 res = LLVMConstFSub(a, b);
803 else
804 res = LLVMConstSub(a, b);
805 else
806 if (type.floating)
807 res = LLVMBuildFSub(builder, a, b, "");
808 else
809 res = LLVMBuildSub(builder, a, b, "");
810
811 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
812 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
813
814 return res;
815 }
816
817
818
819 /**
820 * Normalized multiplication.
821 *
822 * There are several approaches for (using 8-bit normalized multiplication as
823 * an example):
824 *
825 * - alpha plus one
826 *
827 * makes the following approximation to the division (Sree)
828 *
829  *     a*b/255 ~= (a*(b + 1)) >> 8
830 *
831 * which is the fastest method that satisfies the following OpenGL criteria of
832 *
833 * 0*0 = 0 and 255*255 = 255
834 *
835 * - geometric series
836 *
837 * takes the geometric series approximation to the division
838 *
839 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
840 *
841 * in this case just the first two terms to fit in 16bit arithmetic
842 *
843 * t/255 ~= (t + (t >> 8)) >> 8
844 *
845  * note that just by itself it doesn't satisfy the OpenGL criteria, as it
846  * yields 255*255 = 254, so the special case b = 255 must be accounted for, or
847  * rounding must be used.
848 *
849 * - geometric series plus rounding
850 *
851  * when using the geometric series division, instead of truncating the result,
852  * use rounding in the approximation (Jim Blinn)
853 *
854 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
855 *
856  * achieving exact results.
857 *
858 *
859 *
860 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
861 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
862 * @sa Michael Herf, The "double blend trick", May 2000,
863 * http://www.stereopsis.com/doubleblend.html
864 */
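/*
 * Illustrative scalar sketch (not compiled) of the "geometric series plus
 * rounding" variant above, mirroring what lp_build_mul_norm emits for
 * unsigned 8-bit values widened to 16 bits. E.g. a = b = 255 gives
 * t = 65025 and (65025 + (65025 >> 8) + 0x80) >> 8 = 255.
 */
#if 0
static uint8_t
mul_norm_u8_ref(uint8_t a, uint8_t b)
{
   unsigned t = a * b;
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
}
#endif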
865 static LLVMValueRef
866 lp_build_mul_norm(struct gallivm_state *gallivm,
867 struct lp_type wide_type,
868 LLVMValueRef a, LLVMValueRef b)
869 {
870 LLVMBuilderRef builder = gallivm->builder;
871 struct lp_build_context bld;
872 unsigned n;
873 LLVMValueRef half;
874 LLVMValueRef ab;
875
876 assert(!wide_type.floating);
877 assert(lp_check_value(wide_type, a));
878 assert(lp_check_value(wide_type, b));
879
880 lp_build_context_init(&bld, gallivm, wide_type);
881
882 n = wide_type.width / 2;
883 if (wide_type.sign) {
884 --n;
885 }
886
887 /*
888 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
889 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
890 */
891
892 /*
893 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
894 */
895
896 ab = LLVMBuildMul(builder, a, b, "");
897 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
898
899 /*
900 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
901 */
902
903 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
904 if (wide_type.sign) {
905 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
906 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
907 half = lp_build_select(&bld, sign, minus_half, half);
908 }
909 ab = LLVMBuildAdd(builder, ab, half, "");
910
911 /* Final division */
912 ab = lp_build_shr_imm(&bld, ab, n);
913
914 return ab;
915 }
916
917 /**
918 * Generate a * b
919 */
920 LLVMValueRef
921 lp_build_mul(struct lp_build_context *bld,
922 LLVMValueRef a,
923 LLVMValueRef b)
924 {
925 LLVMBuilderRef builder = bld->gallivm->builder;
926 const struct lp_type type = bld->type;
927 LLVMValueRef shift;
928 LLVMValueRef res;
929
930 assert(lp_check_value(type, a));
931 assert(lp_check_value(type, b));
932
933 if(a == bld->zero)
934 return bld->zero;
935 if(a == bld->one)
936 return b;
937 if(b == bld->zero)
938 return bld->zero;
939 if(b == bld->one)
940 return a;
941 if(a == bld->undef || b == bld->undef)
942 return bld->undef;
943
944 if (!type.floating && !type.fixed && type.norm) {
945 struct lp_type wide_type = lp_wider_type(type);
946 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
947
948 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
949 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
950
951 /* PMULLW, PSRLW, PADDW */
952 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
953 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
954
955 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
956
957 return ab;
958 }
959
960 if(type.fixed)
961 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
962 else
963 shift = NULL;
964
965 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
966 if (type.floating)
967 res = LLVMConstFMul(a, b);
968 else
969 res = LLVMConstMul(a, b);
970 if(shift) {
971 if(type.sign)
972 res = LLVMConstAShr(res, shift);
973 else
974 res = LLVMConstLShr(res, shift);
975 }
976 }
977 else {
978 if (type.floating)
979 res = LLVMBuildFMul(builder, a, b, "");
980 else
981 res = LLVMBuildMul(builder, a, b, "");
982 if(shift) {
983 if(type.sign)
984 res = LLVMBuildAShr(builder, res, shift, "");
985 else
986 res = LLVMBuildLShr(builder, res, shift, "");
987 }
988 }
989
990 return res;
991 }
992
993
994 /**
995 * Small vector x scale multiplication optimization.
996 */
997 LLVMValueRef
998 lp_build_mul_imm(struct lp_build_context *bld,
999 LLVMValueRef a,
1000 int b)
1001 {
1002 LLVMBuilderRef builder = bld->gallivm->builder;
1003 LLVMValueRef factor;
1004
1005 assert(lp_check_value(bld->type, a));
1006
1007 if(b == 0)
1008 return bld->zero;
1009
1010 if(b == 1)
1011 return a;
1012
1013 if(b == -1)
1014 return lp_build_negate(bld, a);
1015
1016 if(b == 2 && bld->type.floating)
1017 return lp_build_add(bld, a, a);
1018
1019 if(util_is_power_of_two(b)) {
1020 unsigned shift = ffs(b) - 1;
1021
1022 if(bld->type.floating) {
1023 #if 0
1024 /*
1025 * Power of two multiplication by directly manipulating the exponent.
1026 *
1027 * XXX: This might not be always faster, it will introduce a small error
1028 * for multiplication by zero, and it will produce wrong results
1029 * for Inf and NaN.
1030 */
1031 unsigned mantissa = lp_mantissa(bld->type);
1032 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1033 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1034 a = LLVMBuildAdd(builder, a, factor, "");
1035 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1036 return a;
1037 #endif
1038 }
1039 else {
1040 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1041 return LLVMBuildShl(builder, a, factor, "");
1042 }
1043 }
1044
1045 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1046 return lp_build_mul(bld, a, factor);
1047 }
1048
1049
1050 /**
1051 * Generate a / b
1052 */
1053 LLVMValueRef
1054 lp_build_div(struct lp_build_context *bld,
1055 LLVMValueRef a,
1056 LLVMValueRef b)
1057 {
1058 LLVMBuilderRef builder = bld->gallivm->builder;
1059 const struct lp_type type = bld->type;
1060
1061 assert(lp_check_value(type, a));
1062 assert(lp_check_value(type, b));
1063
1064 if(a == bld->zero)
1065 return bld->zero;
1066 if(a == bld->one)
1067 return lp_build_rcp(bld, b);
1068 if(b == bld->zero)
1069 return bld->undef;
1070 if(b == bld->one)
1071 return a;
1072 if(a == bld->undef || b == bld->undef)
1073 return bld->undef;
1074
1075 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1076 if (type.floating)
1077 return LLVMConstFDiv(a, b);
1078 else if (type.sign)
1079 return LLVMConstSDiv(a, b);
1080 else
1081 return LLVMConstUDiv(a, b);
1082 }
1083
1084 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1085 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1086 type.floating)
1087 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1088
1089 if (type.floating)
1090 return LLVMBuildFDiv(builder, a, b, "");
1091 else if (type.sign)
1092 return LLVMBuildSDiv(builder, a, b, "");
1093 else
1094 return LLVMBuildUDiv(builder, a, b, "");
1095 }
1096
1097
1098 /**
1099 * Linear interpolation helper.
1100 *
1101  * @param flags   LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1102  *                interpolating normalized values encoded in integers twice as wide.
1103 *
1104 * @sa http://www.stereopsis.com/doubleblend.html
1105 */
1106 static INLINE LLVMValueRef
1107 lp_build_lerp_simple(struct lp_build_context *bld,
1108 LLVMValueRef x,
1109 LLVMValueRef v0,
1110 LLVMValueRef v1,
1111 unsigned flags)
1112 {
1113 unsigned half_width = bld->type.width/2;
1114 LLVMBuilderRef builder = bld->gallivm->builder;
1115 LLVMValueRef delta;
1116 LLVMValueRef res;
1117
1118 assert(lp_check_value(bld->type, x));
1119 assert(lp_check_value(bld->type, v0));
1120 assert(lp_check_value(bld->type, v1));
1121
1122 delta = lp_build_sub(bld, v1, v0);
1123
1124 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1125 if (!bld->type.sign) {
1126 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1127 /*
1128 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1129 * most-significant-bit to the lowest-significant-bit, so that
1130 * later we can just divide by 2**n instead of 2**n - 1.
1131 */
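            /*
             * E.g. with 8-bit weights widened to 16-bit lanes (half_width = 8):
             * x = 255 becomes 255 + (255 >> 7) = 256, so the shift by 8 below
             * divides by 256 exactly rather than approximating a divide by 255.
             */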
1132
1133 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1134 }
1135
1136 /* (x * delta) >> n */
1137 res = lp_build_mul(bld, x, delta);
1138 res = lp_build_shr_imm(bld, res, half_width);
1139 } else {
1140 /*
1141 * The rescaling trick above doesn't work for signed numbers, so
1142          * use the 2**n - 1 division approximation in lp_build_mul_norm
1143 * instead.
1144 */
1145 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1146 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1147 }
1148 } else {
1149 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1150 res = lp_build_mul(bld, x, delta);
1151 }
1152
1153 res = lp_build_add(bld, v0, res);
1154
1155 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1156 bld->type.fixed) {
1157 /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1158 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1159 * but it will be wrong for true fixed point use cases. Basically we need
1160 * a more powerful lp_type, capable of further distinguishing the values
1161 * interpretation from the value storage. */
1162 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1163 }
1164
1165 return res;
1166 }
1167
1168
1169 /**
1170 * Linear interpolation.
1171 */
1172 LLVMValueRef
1173 lp_build_lerp(struct lp_build_context *bld,
1174 LLVMValueRef x,
1175 LLVMValueRef v0,
1176 LLVMValueRef v1,
1177 unsigned flags)
1178 {
1179 const struct lp_type type = bld->type;
1180 LLVMValueRef res;
1181
1182 assert(lp_check_value(type, x));
1183 assert(lp_check_value(type, v0));
1184 assert(lp_check_value(type, v1));
1185
1186 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1187
1188 if (type.norm) {
1189 struct lp_type wide_type;
1190 struct lp_build_context wide_bld;
1191 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1192
1193 assert(type.length >= 2);
1194
1195 /*
1196 * Create a wider integer type, enough to hold the
1197 * intermediate result of the multiplication.
1198 */
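      /*
       * E.g. a 16 x uint8 normalized vector is split into two 8 x uint16
       * halves below; each half is lerped in the wider type and the two
       * results are packed back into the original type at the end.
       */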
1199 memset(&wide_type, 0, sizeof wide_type);
1200 wide_type.sign = type.sign;
1201 wide_type.width = type.width*2;
1202 wide_type.length = type.length/2;
1203
1204 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1205
1206 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1207 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1208 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1209
1210 /*
1211 * Lerp both halves.
1212 */
1213
1214 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1215
1216 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1217 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1218
1219 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1220 } else {
1221 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1222 }
1223
1224 return res;
1225 }
1226
1227
1228 /**
1229 * Bilinear interpolation.
1230 *
1231  * Value indices are in v_{yx} order.
1232 */
1233 LLVMValueRef
1234 lp_build_lerp_2d(struct lp_build_context *bld,
1235 LLVMValueRef x,
1236 LLVMValueRef y,
1237 LLVMValueRef v00,
1238 LLVMValueRef v01,
1239 LLVMValueRef v10,
1240 LLVMValueRef v11,
1241 unsigned flags)
1242 {
1243 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1244 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1245 return lp_build_lerp(bld, y, v0, v1, flags);
1246 }
1247
1248
1249 LLVMValueRef
1250 lp_build_lerp_3d(struct lp_build_context *bld,
1251 LLVMValueRef x,
1252 LLVMValueRef y,
1253 LLVMValueRef z,
1254 LLVMValueRef v000,
1255 LLVMValueRef v001,
1256 LLVMValueRef v010,
1257 LLVMValueRef v011,
1258 LLVMValueRef v100,
1259 LLVMValueRef v101,
1260 LLVMValueRef v110,
1261 LLVMValueRef v111,
1262 unsigned flags)
1263 {
1264 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1265 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1266 return lp_build_lerp(bld, z, v0, v1, flags);
1267 }
1268
1269
1270 /**
1271 * Generate min(a, b)
1272 * Do checks for special cases but not for nans.
1273 */
1274 LLVMValueRef
1275 lp_build_min(struct lp_build_context *bld,
1276 LLVMValueRef a,
1277 LLVMValueRef b)
1278 {
1279 assert(lp_check_value(bld->type, a));
1280 assert(lp_check_value(bld->type, b));
1281
1282 if(a == bld->undef || b == bld->undef)
1283 return bld->undef;
1284
1285 if(a == b)
1286 return a;
1287
1288 if (bld->type.norm) {
1289 if (!bld->type.sign) {
1290 if (a == bld->zero || b == bld->zero) {
1291 return bld->zero;
1292 }
1293 }
1294 if(a == bld->one)
1295 return b;
1296 if(b == bld->one)
1297 return a;
1298 }
1299
1300 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1301 }
1302
1303
1304 /**
1305 * Generate min(a, b)
1306 * NaN's are handled according to the behavior specified by the
1307 * nan_behavior argument.
1308 */
1309 LLVMValueRef
1310 lp_build_min_ext(struct lp_build_context *bld,
1311 LLVMValueRef a,
1312 LLVMValueRef b,
1313 enum gallivm_nan_behavior nan_behavior)
1314 {
1315 assert(lp_check_value(bld->type, a));
1316 assert(lp_check_value(bld->type, b));
1317
1318 if(a == bld->undef || b == bld->undef)
1319 return bld->undef;
1320
1321 if(a == b)
1322 return a;
1323
1324 if (bld->type.norm) {
1325 if (!bld->type.sign) {
1326 if (a == bld->zero || b == bld->zero) {
1327 return bld->zero;
1328 }
1329 }
1330 if(a == bld->one)
1331 return b;
1332 if(b == bld->one)
1333 return a;
1334 }
1335
1336 return lp_build_min_simple(bld, a, b, nan_behavior);
1337 }
1338
1339 /**
1340 * Generate max(a, b)
1341 * Do checks for special cases, but NaN behavior is undefined.
1342 */
1343 LLVMValueRef
1344 lp_build_max(struct lp_build_context *bld,
1345 LLVMValueRef a,
1346 LLVMValueRef b)
1347 {
1348 assert(lp_check_value(bld->type, a));
1349 assert(lp_check_value(bld->type, b));
1350
1351 if(a == bld->undef || b == bld->undef)
1352 return bld->undef;
1353
1354 if(a == b)
1355 return a;
1356
1357 if(bld->type.norm) {
1358 if(a == bld->one || b == bld->one)
1359 return bld->one;
1360 if (!bld->type.sign) {
1361 if (a == bld->zero) {
1362 return b;
1363 }
1364 if (b == bld->zero) {
1365 return a;
1366 }
1367 }
1368 }
1369
1370 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1371 }
1372
1373
1374 /**
1375 * Generate max(a, b)
1376 * Checks for special cases.
1377 * NaN's are handled according to the behavior specified by the
1378 * nan_behavior argument.
1379 */
1380 LLVMValueRef
1381 lp_build_max_ext(struct lp_build_context *bld,
1382 LLVMValueRef a,
1383 LLVMValueRef b,
1384 enum gallivm_nan_behavior nan_behavior)
1385 {
1386 assert(lp_check_value(bld->type, a));
1387 assert(lp_check_value(bld->type, b));
1388
1389 if(a == bld->undef || b == bld->undef)
1390 return bld->undef;
1391
1392 if(a == b)
1393 return a;
1394
1395 if(bld->type.norm) {
1396 if(a == bld->one || b == bld->one)
1397 return bld->one;
1398 if (!bld->type.sign) {
1399 if (a == bld->zero) {
1400 return b;
1401 }
1402 if (b == bld->zero) {
1403 return a;
1404 }
1405 }
1406 }
1407
1408 return lp_build_max_simple(bld, a, b, nan_behavior);
1409 }
1410
1411 /**
1412 * Generate clamp(a, min, max)
1413 * NaN behavior (for any of a, min, max) is undefined.
1414 * Do checks for special cases.
1415 */
1416 LLVMValueRef
1417 lp_build_clamp(struct lp_build_context *bld,
1418 LLVMValueRef a,
1419 LLVMValueRef min,
1420 LLVMValueRef max)
1421 {
1422 assert(lp_check_value(bld->type, a));
1423 assert(lp_check_value(bld->type, min));
1424 assert(lp_check_value(bld->type, max));
1425
1426 a = lp_build_min(bld, a, max);
1427 a = lp_build_max(bld, a, min);
1428 return a;
1429 }
1430
1431
1432 /**
1433 * Generate clamp(a, 0, 1)
1434 * A NaN will get converted to zero.
1435 */
1436 LLVMValueRef
1437 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1438 LLVMValueRef a)
1439 {
1440 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1441 a = lp_build_min(bld, a, bld->one);
1442 return a;
1443 }
1444
1445
1446 /**
1447 * Generate abs(a)
1448 */
1449 LLVMValueRef
1450 lp_build_abs(struct lp_build_context *bld,
1451 LLVMValueRef a)
1452 {
1453 LLVMBuilderRef builder = bld->gallivm->builder;
1454 const struct lp_type type = bld->type;
1455 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1456
1457 assert(lp_check_value(type, a));
1458
1459 if(!type.sign)
1460 return a;
1461
1462 if(type.floating) {
1463 /* Mask out the sign bit */
1464 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1465 unsigned long long absMask = ~(1ULL << (type.width - 1));
1466 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1467 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1468 a = LLVMBuildAnd(builder, a, mask, "");
1469 a = LLVMBuildBitCast(builder, a, vec_type, "");
1470 return a;
1471 }
1472
1473 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1474 switch(type.width) {
1475 case 8:
1476 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1477 case 16:
1478 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1479 case 32:
1480 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1481 }
1482 }
1483 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1484 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1485 (type.width == 8 || type.width == 16 || type.width == 32)) {
1486 debug_printf("%s: inefficient code, should split vectors manually\n",
1487 __FUNCTION__);
1488 }
1489
1490 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1491 }
1492
1493
1494 LLVMValueRef
1495 lp_build_negate(struct lp_build_context *bld,
1496 LLVMValueRef a)
1497 {
1498 LLVMBuilderRef builder = bld->gallivm->builder;
1499
1500 assert(lp_check_value(bld->type, a));
1501
1502 if (bld->type.floating)
1503 a = LLVMBuildFNeg(builder, a, "");
1504 else
1505 a = LLVMBuildNeg(builder, a, "");
1506
1507 return a;
1508 }
1509
1510
1511 /** Return -1, 0 or +1 depending on the sign of a */
1512 LLVMValueRef
1513 lp_build_sgn(struct lp_build_context *bld,
1514 LLVMValueRef a)
1515 {
1516 LLVMBuilderRef builder = bld->gallivm->builder;
1517 const struct lp_type type = bld->type;
1518 LLVMValueRef cond;
1519 LLVMValueRef res;
1520
1521 assert(lp_check_value(type, a));
1522
1523 /* Handle non-zero case */
1524 if(!type.sign) {
1525 /* if not zero then sign must be positive */
1526 res = bld->one;
1527 }
1528 else if(type.floating) {
1529 LLVMTypeRef vec_type;
1530 LLVMTypeRef int_type;
1531 LLVMValueRef mask;
1532 LLVMValueRef sign;
1533 LLVMValueRef one;
1534 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1535
1536 int_type = lp_build_int_vec_type(bld->gallivm, type);
1537 vec_type = lp_build_vec_type(bld->gallivm, type);
1538 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1539
1540       /* Take the sign bit of 'a' and OR it into the constant 1.0 */
1541 sign = LLVMBuildBitCast(builder, a, int_type, "");
1542 sign = LLVMBuildAnd(builder, sign, mask, "");
1543 one = LLVMConstBitCast(bld->one, int_type);
1544 res = LLVMBuildOr(builder, sign, one, "");
1545 res = LLVMBuildBitCast(builder, res, vec_type, "");
1546 }
1547 else
1548 {
1549 /* signed int/norm/fixed point */
1550 /* could use psign with sse3 and appropriate vectors here */
1551 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1552 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1553 res = lp_build_select(bld, cond, bld->one, minus_one);
1554 }
1555
1556 /* Handle zero */
1557 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1558 res = lp_build_select(bld, cond, bld->zero, res);
1559
1560 return res;
1561 }
1562
1563
1564 /**
1565 * Set the sign of float vector 'a' according to 'sign'.
1566 * If sign==0, return abs(a).
1567 * If sign==1, return -abs(a);
1568 * Other values for sign produce undefined results.
1569 */
1570 LLVMValueRef
1571 lp_build_set_sign(struct lp_build_context *bld,
1572 LLVMValueRef a, LLVMValueRef sign)
1573 {
1574 LLVMBuilderRef builder = bld->gallivm->builder;
1575 const struct lp_type type = bld->type;
1576 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1577 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1578 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1579 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1580 ~((unsigned long long) 1 << (type.width - 1)));
1581 LLVMValueRef val, res;
1582
1583 assert(type.floating);
1584 assert(lp_check_value(type, a));
1585
1586 /* val = reinterpret_cast<int>(a) */
1587 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1588 /* val = val & mask */
1589 val = LLVMBuildAnd(builder, val, mask, "");
1590 /* sign = sign << shift */
1591 sign = LLVMBuildShl(builder, sign, shift, "");
1592 /* res = val | sign */
1593 res = LLVMBuildOr(builder, val, sign, "");
1594 /* res = reinterpret_cast<float>(res) */
1595 res = LLVMBuildBitCast(builder, res, vec_type, "");
1596
1597 return res;
1598 }
1599
1600
1601 /**
1602 * Convert vector of (or scalar) int to vector of (or scalar) float.
1603 */
1604 LLVMValueRef
1605 lp_build_int_to_float(struct lp_build_context *bld,
1606 LLVMValueRef a)
1607 {
1608 LLVMBuilderRef builder = bld->gallivm->builder;
1609 const struct lp_type type = bld->type;
1610 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1611
1612 assert(type.floating);
1613
1614 return LLVMBuildSIToFP(builder, a, vec_type, "");
1615 }
1616
1617 static boolean
1618 arch_rounding_available(const struct lp_type type)
1619 {
1620 if ((util_cpu_caps.has_sse4_1 &&
1621 (type.length == 1 || type.width*type.length == 128)) ||
1622 (util_cpu_caps.has_avx && type.width*type.length == 256))
1623 return TRUE;
1624 else if ((util_cpu_caps.has_altivec &&
1625 (type.width == 32 && type.length == 4)))
1626 return TRUE;
1627
1628 return FALSE;
1629 }
1630
1631 enum lp_build_round_mode
1632 {
1633 LP_BUILD_ROUND_NEAREST = 0,
1634 LP_BUILD_ROUND_FLOOR = 1,
1635 LP_BUILD_ROUND_CEIL = 2,
1636 LP_BUILD_ROUND_TRUNCATE = 3
1637 };
1638
1639 /**
1640 * Helper for SSE4.1's ROUNDxx instructions.
1641 *
1642 * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
1643 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1644 */
1645 static INLINE LLVMValueRef
1646 lp_build_round_sse41(struct lp_build_context *bld,
1647 LLVMValueRef a,
1648 enum lp_build_round_mode mode)
1649 {
1650 LLVMBuilderRef builder = bld->gallivm->builder;
1651 const struct lp_type type = bld->type;
1652 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1653 const char *intrinsic;
1654 LLVMValueRef res;
1655
1656 assert(type.floating);
1657
1658 assert(lp_check_value(type, a));
1659 assert(util_cpu_caps.has_sse4_1);
1660
1661 if (type.length == 1) {
1662 LLVMTypeRef vec_type;
1663 LLVMValueRef undef;
1664 LLVMValueRef args[3];
1665 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1666
1667 switch(type.width) {
1668 case 32:
1669 intrinsic = "llvm.x86.sse41.round.ss";
1670 break;
1671 case 64:
1672 intrinsic = "llvm.x86.sse41.round.sd";
1673 break;
1674 default:
1675 assert(0);
1676 return bld->undef;
1677 }
1678
1679 vec_type = LLVMVectorType(bld->elem_type, 4);
1680
1681 undef = LLVMGetUndef(vec_type);
1682
1683 args[0] = undef;
1684 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1685 args[2] = LLVMConstInt(i32t, mode, 0);
1686
1687 res = lp_build_intrinsic(builder, intrinsic,
1688 vec_type, args, Elements(args));
1689
1690 res = LLVMBuildExtractElement(builder, res, index0, "");
1691 }
1692 else {
1693 if (type.width * type.length == 128) {
1694 switch(type.width) {
1695 case 32:
1696 intrinsic = "llvm.x86.sse41.round.ps";
1697 break;
1698 case 64:
1699 intrinsic = "llvm.x86.sse41.round.pd";
1700 break;
1701 default:
1702 assert(0);
1703 return bld->undef;
1704 }
1705 }
1706 else {
1707 assert(type.width * type.length == 256);
1708 assert(util_cpu_caps.has_avx);
1709
1710 switch(type.width) {
1711 case 32:
1712 intrinsic = "llvm.x86.avx.round.ps.256";
1713 break;
1714 case 64:
1715 intrinsic = "llvm.x86.avx.round.pd.256";
1716 break;
1717 default:
1718 assert(0);
1719 return bld->undef;
1720 }
1721 }
1722
1723 res = lp_build_intrinsic_binary(builder, intrinsic,
1724 bld->vec_type, a,
1725 LLVMConstInt(i32t, mode, 0));
1726 }
1727
1728 return res;
1729 }
1730
1731
1732 static INLINE LLVMValueRef
1733 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1734 LLVMValueRef a)
1735 {
1736 LLVMBuilderRef builder = bld->gallivm->builder;
1737 const struct lp_type type = bld->type;
1738 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1739 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1740 const char *intrinsic;
1741 LLVMValueRef res;
1742
1743 assert(type.floating);
1744 /* using the double precision conversions is a bit more complicated */
1745 assert(type.width == 32);
1746
1747 assert(lp_check_value(type, a));
1748 assert(util_cpu_caps.has_sse2);
1749
1750 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1751 if (type.length == 1) {
1752 LLVMTypeRef vec_type;
1753 LLVMValueRef undef;
1754 LLVMValueRef arg;
1755 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1756
1757 vec_type = LLVMVectorType(bld->elem_type, 4);
1758
1759 intrinsic = "llvm.x86.sse.cvtss2si";
1760
1761 undef = LLVMGetUndef(vec_type);
1762
1763 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1764
1765 res = lp_build_intrinsic_unary(builder, intrinsic,
1766 ret_type, arg);
1767 }
1768 else {
1769 if (type.width* type.length == 128) {
1770 intrinsic = "llvm.x86.sse2.cvtps2dq";
1771 }
1772 else {
1773 assert(type.width*type.length == 256);
1774 assert(util_cpu_caps.has_avx);
1775
1776 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1777 }
1778 res = lp_build_intrinsic_unary(builder, intrinsic,
1779 ret_type, a);
1780 }
1781
1782 return res;
1783 }
1784
1785
1786 /* Round helper using AltiVec's vrfi* (round to floating-point integer) instructions.
1787  */
1788 static INLINE LLVMValueRef
1789 lp_build_round_altivec(struct lp_build_context *bld,
1790 LLVMValueRef a,
1791 enum lp_build_round_mode mode)
1792 {
1793 LLVMBuilderRef builder = bld->gallivm->builder;
1794 const struct lp_type type = bld->type;
1795 const char *intrinsic = NULL;
1796
1797 assert(type.floating);
1798
1799 assert(lp_check_value(type, a));
1800 assert(util_cpu_caps.has_altivec);
1801
1802 switch (mode) {
1803 case LP_BUILD_ROUND_NEAREST:
1804 intrinsic = "llvm.ppc.altivec.vrfin";
1805 break;
1806 case LP_BUILD_ROUND_FLOOR:
1807 intrinsic = "llvm.ppc.altivec.vrfim";
1808 break;
1809 case LP_BUILD_ROUND_CEIL:
1810 intrinsic = "llvm.ppc.altivec.vrfip";
1811 break;
1812 case LP_BUILD_ROUND_TRUNCATE:
1813 intrinsic = "llvm.ppc.altivec.vrfiz";
1814 break;
1815 }
1816
1817 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1818 }
1819
1820 static INLINE LLVMValueRef
1821 lp_build_round_arch(struct lp_build_context *bld,
1822 LLVMValueRef a,
1823 enum lp_build_round_mode mode)
1824 {
1825 if (util_cpu_caps.has_sse4_1)
1826 return lp_build_round_sse41(bld, a, mode);
1827 else /* (util_cpu_caps.has_altivec) */
1828 return lp_build_round_altivec(bld, a, mode);
1829 }
1830
1831 /**
1832 * Return the integer part of a float (vector) value (== round toward zero).
1833 * The returned value is a float (vector).
1834 * Ex: trunc(-1.5) = -1.0
1835 */
1836 LLVMValueRef
1837 lp_build_trunc(struct lp_build_context *bld,
1838 LLVMValueRef a)
1839 {
1840 LLVMBuilderRef builder = bld->gallivm->builder;
1841 const struct lp_type type = bld->type;
1842
1843 assert(type.floating);
1844 assert(lp_check_value(type, a));
1845
1846 if (arch_rounding_available(type)) {
1847 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1848 }
1849 else {
1850 const struct lp_type type = bld->type;
1851 struct lp_type inttype;
1852 struct lp_build_context intbld;
1853       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1854 LLVMValueRef trunc, res, anosign, mask;
1855 LLVMTypeRef int_vec_type = bld->int_vec_type;
1856 LLVMTypeRef vec_type = bld->vec_type;
1857
1858 assert(type.width == 32); /* might want to handle doubles at some point */
1859
1860 inttype = type;
1861 inttype.floating = 0;
1862 lp_build_context_init(&intbld, bld->gallivm, inttype);
1863
1864 /* round by truncation */
1865 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1866 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1867
1868 /* mask out sign bit */
1869 anosign = lp_build_abs(bld, a);
1870 /*
1871 * mask out all values if anosign > 2^24
1872 * This should work both for large ints (all rounding is no-op for them
1873 * because such floats are always exact) as well as special cases like
1874 * NaNs, Infs (taking advantage of the fact they use max exponent).
1875        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1876 */
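      /*
       * Note the comparison below operates on the raw IEEE bits reinterpreted
       * as integers; for the non-negative values we have here (sign already
       * masked off) that ordering matches the floating-point ordering.
       */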
1877 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1878 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1879 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1880 return lp_build_select(bld, mask, a, res);
1881 }
1882 }
1883
1884
1885 /**
1886 * Return float (vector) rounded to nearest integer (vector). The returned
1887 * value is a float (vector).
1888 * Ex: round(0.9) = 1.0
1889 * Ex: round(-1.5) = -2.0
1890 */
1891 LLVMValueRef
1892 lp_build_round(struct lp_build_context *bld,
1893 LLVMValueRef a)
1894 {
1895 LLVMBuilderRef builder = bld->gallivm->builder;
1896 const struct lp_type type = bld->type;
1897
1898 assert(type.floating);
1899 assert(lp_check_value(type, a));
1900
1901 if (arch_rounding_available(type)) {
1902 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1903 }
1904 else {
1905 const struct lp_type type = bld->type;
1906 struct lp_type inttype;
1907 struct lp_build_context intbld;
1908       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1909 LLVMValueRef res, anosign, mask;
1910 LLVMTypeRef int_vec_type = bld->int_vec_type;
1911 LLVMTypeRef vec_type = bld->vec_type;
1912
1913 assert(type.width == 32); /* might want to handle doubles at some point */
1914
1915 inttype = type;
1916 inttype.floating = 0;
1917 lp_build_context_init(&intbld, bld->gallivm, inttype);
1918
1919 res = lp_build_iround(bld, a);
1920 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1921
1922 /* mask out sign bit */
1923 anosign = lp_build_abs(bld, a);
1924 /*
1925 * mask out all values if anosign > 2^24
1926 * This should work both for large ints (all rounding is no-op for them
1927 * because such floats are always exact) as well as special cases like
1928 * NaNs, Infs (taking advantage of the fact they use max exponent).
1929 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1930 */
1931 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1932 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1933 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1934 return lp_build_select(bld, mask, a, res);
1935 }
1936 }
1937
1938
1939 /**
1940 * Return floor of float (vector), result is a float (vector)
1941 * Ex: floor(1.1) = 1.0
1942 * Ex: floor(-1.1) = -2.0
1943 */
1944 LLVMValueRef
1945 lp_build_floor(struct lp_build_context *bld,
1946 LLVMValueRef a)
1947 {
1948 LLVMBuilderRef builder = bld->gallivm->builder;
1949 const struct lp_type type = bld->type;
1950
1951 assert(type.floating);
1952 assert(lp_check_value(type, a));
1953
1954 if (arch_rounding_available(type)) {
1955 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1956 }
1957 else {
1958 const struct lp_type type = bld->type;
1959 struct lp_type inttype;
1960 struct lp_build_context intbld;
1961 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
1962 LLVMValueRef trunc, res, anosign, mask;
1963 LLVMTypeRef int_vec_type = bld->int_vec_type;
1964 LLVMTypeRef vec_type = bld->vec_type;
1965
1966 assert(type.width == 32); /* might want to handle doubles at some point */
1967
1968 inttype = type;
1969 inttype.floating = 0;
1970 lp_build_context_init(&intbld, bld->gallivm, inttype);
1971
1972 /* round by truncation */
1973 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1974 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1975
1976 if (type.sign) {
1977 LLVMValueRef tmp;
1978
1979 /*
1980 * fix values if rounding is wrong (for non-special cases)
1981 * - this is the case if trunc > a
1982 */
1983 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1984 /* tmp = trunc > a ? 1.0 : 0.0 */
1985 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1986 tmp = lp_build_and(&intbld, mask, tmp);
1987 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1988 res = lp_build_sub(bld, res, tmp);
1989 }
1990
1991 /* mask out sign bit */
1992 anosign = lp_build_abs(bld, a);
1993 /*
1994 * mask out all values if anosign > 2^24
1995 * This should work both for large ints (all rounding is no-op for them
1996 * because such floats are always exact) as well as special cases like
1997 * NaNs, Infs (taking advantage of the fact they use max exponent).
1998 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1999 */
2000 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2001 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2002 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2003 return lp_build_select(bld, mask, a, res);
2004 }
2005 }
2006
2007
2008 /**
2009 * Return ceiling of float (vector), returning float (vector).
2010 * Ex: ceil( 1.1) = 2.0
2011 * Ex: ceil(-1.1) = -1.0
2012 */
2013 LLVMValueRef
2014 lp_build_ceil(struct lp_build_context *bld,
2015 LLVMValueRef a)
2016 {
2017 LLVMBuilderRef builder = bld->gallivm->builder;
2018 const struct lp_type type = bld->type;
2019
2020 assert(type.floating);
2021 assert(lp_check_value(type, a));
2022
2023 if (arch_rounding_available(type)) {
2024 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2025 }
2026 else {
2027 const struct lp_type type = bld->type;
2028 struct lp_type inttype;
2029 struct lp_build_context intbld;
2030 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 16777216.0); /* 2^24 */
2031 LLVMValueRef trunc, res, anosign, mask, tmp;
2032 LLVMTypeRef int_vec_type = bld->int_vec_type;
2033 LLVMTypeRef vec_type = bld->vec_type;
2034
2035 assert(type.width == 32); /* might want to handle doubles at some point */
2036
2037 inttype = type;
2038 inttype.floating = 0;
2039 lp_build_context_init(&intbld, bld->gallivm, inttype);
2040
2041 /* round by truncation */
2042 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2043 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2044
2045 /*
2046 * fix values if rounding is wrong (for non-special cases)
2047 * - this is the case if trunc < a
2048 */
2049 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2050 /* tmp = trunc < a ? 1.0 : 0.0 */
2051 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2052 tmp = lp_build_and(&intbld, mask, tmp);
2053 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2054 res = lp_build_add(bld, trunc, tmp);
2055
2056 /* mask out sign bit */
2057 anosign = lp_build_abs(bld, a);
2058 /*
2059 * mask out all values if anosign > 2^24
2060 * This should work both for large ints (all rounding is no-op for them
2061 * because such floats are always exact) as well as special cases like
2062 * NaNs, Infs (taking advantage of the fact they use max exponent).
2063 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2064 */
2065 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2066 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2067 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2068 return lp_build_select(bld, mask, a, res);
2069 }
2070 }
2071
2072
2073 /**
2074 * Return fractional part of 'a' computed as a - floor(a)
2075 * Typically used in texture coord arithmetic.
2076 */
2077 LLVMValueRef
2078 lp_build_fract(struct lp_build_context *bld,
2079 LLVMValueRef a)
2080 {
2081 assert(bld->type.floating);
2082 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2083 }
2084
2085
2086 /**
2087 * Prevent returning a fractional part of 1.0 for very small negative values of
2088 * 'a' by clamping against 0.99999(9).
2089 */
2090 static inline LLVMValueRef
2091 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2092 {
2093 LLVMValueRef max;
2094
2095 /* this is the largest number smaller than 1.0 representable as float */
2096 max = lp_build_const_vec(bld->gallivm, bld->type,
2097 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2098 return lp_build_min(bld, fract, max);
2099 }
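/*
 * For the common 32-bit float case this clamp value works out to
 * 1.0 - 2^-24 (0x3f7fffff, ~0.99999994), the largest float strictly
 * below 1.0.
 */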
2100
2101
2102 /**
2103 * Same as lp_build_fract, but guarantees that the result is always smaller
2104 * than one.
2105 */
2106 LLVMValueRef
2107 lp_build_fract_safe(struct lp_build_context *bld,
2108 LLVMValueRef a)
2109 {
2110 return clamp_fract(bld, lp_build_fract(bld, a));
2111 }
2112
2113
2114 /**
2115 * Return the integer part of a float (vector) value (== round toward zero).
2116 * The returned value is an integer (vector).
2117 * Ex: itrunc(-1.5) = -1
2118 */
2119 LLVMValueRef
2120 lp_build_itrunc(struct lp_build_context *bld,
2121 LLVMValueRef a)
2122 {
2123 LLVMBuilderRef builder = bld->gallivm->builder;
2124 const struct lp_type type = bld->type;
2125 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2126
2127 assert(type.floating);
2128 assert(lp_check_value(type, a));
2129
2130 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2131 }
2132
2133
2134 /**
2135 * Return float (vector) rounded to nearest integer (vector). The returned
2136 * value is an integer (vector).
2137 * Ex: iround(0.9) = 1
2138 * Ex: iround(-1.5) = -2
2139 */
2140 LLVMValueRef
2141 lp_build_iround(struct lp_build_context *bld,
2142 LLVMValueRef a)
2143 {
2144 LLVMBuilderRef builder = bld->gallivm->builder;
2145 const struct lp_type type = bld->type;
2146 LLVMTypeRef int_vec_type = bld->int_vec_type;
2147 LLVMValueRef res;
2148
2149 assert(type.floating);
2150
2151 assert(lp_check_value(type, a));
2152
2153 if ((util_cpu_caps.has_sse2 &&
2154 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2155 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2156 return lp_build_iround_nearest_sse2(bld, a);
2157 }
2158 if (arch_rounding_available(type)) {
2159 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2160 }
2161 else {
2162 LLVMValueRef half;
2163
2164 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2165
2166 if (type.sign) {
2167 LLVMTypeRef vec_type = bld->vec_type;
2168 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2169 (unsigned long long)1 << (type.width - 1));
2170 LLVMValueRef sign;
2171
2172 /* get sign bit */
2173 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2174 sign = LLVMBuildAnd(builder, sign, mask, "");
2175
2176 /* sign * 0.5 */
2177 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2178 half = LLVMBuildOr(builder, sign, half, "");
2179 half = LLVMBuildBitCast(builder, half, vec_type, "");
2180 }
2181
2182 res = LLVMBuildFAdd(builder, a, half, "");
2183 }
2184
2185 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2186
2187 return res;
2188 }
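/*
 * Note on halfway cases: the generic fallback above rounds them away from
 * zero (add +/-0.5, then truncate), whereas the SSE2/SSE4.1 paths follow the
 * hardware round-to-nearest mode (ties-to-even by default), so results can
 * differ for inputs lying exactly halfway between two integers.
 */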
2189
2190
2191 /**
2192 * Return floor of float (vector), result is an int (vector)
2193 * Ex: ifloor(1.1) = 1
2194 * Ex: ifloor(-1.1) = -2
2195 */
2196 LLVMValueRef
2197 lp_build_ifloor(struct lp_build_context *bld,
2198 LLVMValueRef a)
2199 {
2200 LLVMBuilderRef builder = bld->gallivm->builder;
2201 const struct lp_type type = bld->type;
2202 LLVMTypeRef int_vec_type = bld->int_vec_type;
2203 LLVMValueRef res;
2204
2205 assert(type.floating);
2206 assert(lp_check_value(type, a));
2207
2208 res = a;
2209 if (type.sign) {
2210 if (arch_rounding_available(type)) {
2211 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2212 }
2213 else {
2214 struct lp_type inttype;
2215 struct lp_build_context intbld;
2216 LLVMValueRef trunc, itrunc, mask;
2217
2218 assert(type.floating);
2219 assert(lp_check_value(type, a));
2220
2221 inttype = type;
2222 inttype.floating = 0;
2223 lp_build_context_init(&intbld, bld->gallivm, inttype);
2224
2225 /* round by truncation */
2226 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2227 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2228
2229 /*
2230 * fix values if rounding is wrong (for non-special cases)
2231 * - this is the case if trunc > a
2232 * The results of doing this with NaNs, very large values etc.
2233 * are bogus, but those inputs are undefined for this function anyway.
2234 */
2235 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2236 /* cheapie minus one with mask since the mask is minus one / zero */
2237 return lp_build_add(&intbld, itrunc, mask);
2238 }
2239 }
2240
2241 /* convert to int; truncation is fine since res is integral or non-negative here */
2242 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2243
2244 return res;
2245 }
2246
2247
2248 /**
2249 * Return ceiling of float (vector), returning int (vector).
2250 * Ex: iceil( 1.1) = 2
2251 * Ex: iceil(-1.1) = -1
2252 */
2253 LLVMValueRef
2254 lp_build_iceil(struct lp_build_context *bld,
2255 LLVMValueRef a)
2256 {
2257 LLVMBuilderRef builder = bld->gallivm->builder;
2258 const struct lp_type type = bld->type;
2259 LLVMTypeRef int_vec_type = bld->int_vec_type;
2260 LLVMValueRef res;
2261
2262 assert(type.floating);
2263 assert(lp_check_value(type, a));
2264
2265 if (arch_rounding_available(type)) {
2266 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2267 }
2268 else {
2269 struct lp_type inttype;
2270 struct lp_build_context intbld;
2271 LLVMValueRef trunc, itrunc, mask;
2272
2273 assert(type.floating);
2274 assert(lp_check_value(type, a));
2275
2276 inttype = type;
2277 inttype.floating = 0;
2278 lp_build_context_init(&intbld, bld->gallivm, inttype);
2279
2280 /* round by truncation */
2281 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2282 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2283
2284 /*
2285 * fix values if rounding is wrong (for non-special cases)
2286 * - this is the case if trunc < a
2287 * The results of doing this with NaNs, very large values etc.
2288 * are bogus, but those inputs are undefined for this function anyway.
2289 */
2290 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2291 /* cheapie plus one with mask since the mask is minus one / zero */
2292 return lp_build_sub(&intbld, itrunc, mask);
2293 }
2294
2295 /* convert to int; truncation is fine since res is already integral here */
2296 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2297
2298 return res;
2299 }
2300
2301
2302 /**
2303 * Combined ifloor() & fract().
2304 *
2305 * Preferred to calling the functions separately, as it will ensure that the
2306 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2307 */
2308 void
2309 lp_build_ifloor_fract(struct lp_build_context *bld,
2310 LLVMValueRef a,
2311 LLVMValueRef *out_ipart,
2312 LLVMValueRef *out_fpart)
2313 {
2314 LLVMBuilderRef builder = bld->gallivm->builder;
2315 const struct lp_type type = bld->type;
2316 LLVMValueRef ipart;
2317
2318 assert(type.floating);
2319 assert(lp_check_value(type, a));
2320
2321 if (arch_rounding_available(type)) {
2322 /*
2323 * floor() is easier.
2324 */
2325
2326 ipart = lp_build_floor(bld, a);
2327 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2328 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2329 }
2330 else {
2331 /*
2332 * ifloor() is easier.
2333 */
2334
2335 *out_ipart = lp_build_ifloor(bld, a);
2336 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2337 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2338 }
2339 }
2340
2341
2342 /**
2343 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2344 * always smaller than one.
2345 */
2346 void
2347 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2348 LLVMValueRef a,
2349 LLVMValueRef *out_ipart,
2350 LLVMValueRef *out_fpart)
2351 {
2352 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2353 *out_fpart = clamp_fract(bld, *out_fpart);
2354 }
2355
2356
2357 LLVMValueRef
2358 lp_build_sqrt(struct lp_build_context *bld,
2359 LLVMValueRef a)
2360 {
2361 LLVMBuilderRef builder = bld->gallivm->builder;
2362 const struct lp_type type = bld->type;
2363 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2364 char intrinsic[32];
2365
2366 assert(lp_check_value(type, a));
2367
2368 /* TODO: optimize the constant case */
2369
2370 assert(type.floating);
2371 if (type.length == 1) {
2372 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2373 }
2374 else {
2375 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2376 }
2377
2378 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2379 }
2380
2381
2382 /**
2383 * Do one Newton-Raphson step to improve reciprocal precision:
2384 *
2385 * x_{i+1} = x_i * (2 - a * x_i)
2386 *
2387 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2388 * +/-Inf, giving NaN instead. Certain applications rely on the conformant
2389 * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
2390 * the Earth's halo. It would be necessary to clamp the argument to prevent this.
2391 *
2392 * See also:
2393 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2394 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2395 */
2396 static INLINE LLVMValueRef
2397 lp_build_rcp_refine(struct lp_build_context *bld,
2398 LLVMValueRef a,
2399 LLVMValueRef rcp_a)
2400 {
2401 LLVMBuilderRef builder = bld->gallivm->builder;
2402 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2403 LLVMValueRef res;
2404
2405 res = LLVMBuildFMul(builder, a, rcp_a, "");
2406 res = LLVMBuildFSub(builder, two, res, "");
2407 res = LLVMBuildFMul(builder, rcp_a, res, "");
2408
2409 return res;
2410 }
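/*
 * Numeric illustration of the step above (the relative error roughly squares
 * on each iteration): for a = 3.0 and an initial estimate x_0 = 0.3 (~10%
 * low), x_1 = 0.3 * (2 - 3.0 * 0.3) = 0.33, i.e. only ~1% low; a further step
 * would land within ~0.01% of 1/3.
 */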
2411
2412
2413 LLVMValueRef
2414 lp_build_rcp(struct lp_build_context *bld,
2415 LLVMValueRef a)
2416 {
2417 LLVMBuilderRef builder = bld->gallivm->builder;
2418 const struct lp_type type = bld->type;
2419
2420 assert(lp_check_value(type, a));
2421
2422 if(a == bld->zero)
2423 return bld->undef;
2424 if(a == bld->one)
2425 return bld->one;
2426 if(a == bld->undef)
2427 return bld->undef;
2428
2429 assert(type.floating);
2430
2431 if(LLVMIsConstant(a))
2432 return LLVMConstFDiv(bld->one, a);
2433
2434 /*
2435 * We don't use RCPPS because:
2436 * - it only has 10 bits of precision
2437 * - it doesn't even get the reciprocal of 1.0 exactly
2438 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2439 * - for recent processors the benefit over DIVPS is marginal and
2440 *   case dependent
2441 *
2442 * We could still use it on certain processors if benchmarks show that the
2443 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2444 * particular uses that require fewer workarounds.
2445 */
2446
2447 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2448 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2449 const unsigned num_iterations = 0;
2450 LLVMValueRef res;
2451 unsigned i;
2452 const char *intrinsic = NULL;
2453
2454 if (type.length == 4) {
2455 intrinsic = "llvm.x86.sse.rcp.ps";
2456 }
2457 else {
2458 intrinsic = "llvm.x86.avx.rcp.ps.256";
2459 }
2460
2461 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2462
2463 for (i = 0; i < num_iterations; ++i) {
2464 res = lp_build_rcp_refine(bld, a, res);
2465 }
2466
2467 return res;
2468 }
2469
2470 return LLVMBuildFDiv(builder, bld->one, a, "");
2471 }
2472
2473
2474 /**
2475 * Do one Newton-Raphson step to improve rsqrt precision:
2476 *
2477 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2478 *
2479 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2480 */
2481 static INLINE LLVMValueRef
2482 lp_build_rsqrt_refine(struct lp_build_context *bld,
2483 LLVMValueRef a,
2484 LLVMValueRef rsqrt_a)
2485 {
2486 LLVMBuilderRef builder = bld->gallivm->builder;
2487 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2488 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2489 LLVMValueRef res;
2490
2491 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2492 res = LLVMBuildFMul(builder, a, res, "");
2493 res = LLVMBuildFSub(builder, three, res, "");
2494 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2495 res = LLVMBuildFMul(builder, half, res, "");
2496
2497 return res;
2498 }
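/*
 * Numeric illustration: for a = 4.0 (exact 1/sqrt(a) = 0.5) and an initial
 * estimate x_0 = 0.49, one step gives
 *    x_1 = 0.5 * 0.49 * (3 - 4.0 * 0.49 * 0.49) ~= 0.4997
 * so a ~2% error shrinks to ~0.06% (the error is roughly 1.5*e^2 per step).
 */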
2499
2500
2501 /**
2502 * Generate 1/sqrt(a).
2503 * Result is undefined for values < 0, infinity for +0.
2504 */
2505 LLVMValueRef
2506 lp_build_rsqrt(struct lp_build_context *bld,
2507 LLVMValueRef a)
2508 {
2509 LLVMBuilderRef builder = bld->gallivm->builder;
2510 const struct lp_type type = bld->type;
2511
2512 assert(lp_check_value(type, a));
2513
2514 assert(type.floating);
2515
2516 /*
2517 * This should be faster but all denormals will end up as infinity.
2518 */
2519 if (0 && lp_build_fast_rsqrt_available(type)) {
2520 const unsigned num_iterations = 1;
2521 LLVMValueRef res;
2522 unsigned i;
2523
2524 /* rsqrt(1.0) != 1.0 here */
2525 res = lp_build_fast_rsqrt(bld, a);
2526
2527 if (num_iterations) {
2528 /*
2529 * Newton-Raphson will result in NaN instead of infinity for zero,
2530 * and NaN instead of zero for infinity.
2531 * Also, need to ensure rsqrt(1.0) == 1.0.
2532 * All numbers smaller than FLT_MIN will result in +infinity
2533 * (rsqrtps treats all denormals as zero).
2534 */
2535 /*
2536 * Certain non-C99 compilers don't know INFINITY and might not support
2537 * hacks to evaluate it at compile time either.
2538 */
2539 const unsigned posinf_int = 0x7F800000;
2540 LLVMValueRef cmp;
2541 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2542 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2543
2544 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2545
2546 for (i = 0; i < num_iterations; ++i) {
2547 res = lp_build_rsqrt_refine(bld, a, res);
2548 }
2549 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2550 res = lp_build_select(bld, cmp, inf, res);
2551 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2552 res = lp_build_select(bld, cmp, bld->zero, res);
2553 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2554 res = lp_build_select(bld, cmp, bld->one, res);
2555 }
2556
2557 return res;
2558 }
2559
2560 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2561 }
2562
2563 /**
2564 * Return whether a fast (inaccurate) rsqrt instruction is available.
2565 * (A caller may want to avoid calling rsqrt_fast if it's not available:
2566 * e.g. x^0.5 can be done as rsqrt_fast(x) * x, but if the instruction is
2567 * unavailable that expands to sqrt/div/mul, so it is obviously better to
2568 * just call sqrt directly, skipping both the div and the mul.)
2569 */
2570 boolean
2571 lp_build_fast_rsqrt_available(struct lp_type type)
2572 {
2573 assert(type.floating);
2574
2575 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2576 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2577 return true;
2578 }
2579 return false;
2580 }
2581
2582
2583 /**
2584 * Generate 1/sqrt(a).
2585 * Result is undefined for values < 0, infinity for +0.
2586 * Precision is limited, only ~10 bits guaranteed
2587 * (rsqrt(1.0) may not be exactly 1.0, denorms may be flushed to 0).
2588 */
2589 LLVMValueRef
2590 lp_build_fast_rsqrt(struct lp_build_context *bld,
2591 LLVMValueRef a)
2592 {
2593 LLVMBuilderRef builder = bld->gallivm->builder;
2594 const struct lp_type type = bld->type;
2595
2596 assert(lp_check_value(type, a));
2597
2598 if (lp_build_fast_rsqrt_available(type)) {
2599 const char *intrinsic = NULL;
2600
2601 if (type.length == 4) {
2602 intrinsic = "llvm.x86.sse.rsqrt.ps";
2603 }
2604 else {
2605 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2606 }
2607 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2608 }
2609 else {
2610 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2611 }
2612 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2613 }
2614
2615
2616 /**
2617 * Generate sin(a) or cos(a) using polynomial approximation.
2618 * TODO: it might be worth recognizing sin and cos using same source
2619 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2620 * would be way cheaper than calculating (nearly) everything twice...
2621 * Not sure it's common enough to be worth bothering however, scs
2622 * opcode could also benefit from calculating both though.
2623 */
2624 static LLVMValueRef
2625 lp_build_sin_or_cos(struct lp_build_context *bld,
2626 LLVMValueRef a,
2627 boolean cos)
2628 {
2629 struct gallivm_state *gallivm = bld->gallivm;
2630 LLVMBuilderRef b = gallivm->builder;
2631 struct lp_type int_type = lp_int_type(bld->type);
2632
2633 /*
2634 * take the absolute value,
2635 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2636 */
2637
2638 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2639 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2640
2641 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2642 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2643
2644 /*
2645 * scale by 4/Pi
2646 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2647 */
2648
2649 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2650 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2651
2652 /*
2653 * store the integer part of y in mm0
2654 * emm2 = _mm_cvttps_epi32(y);
2655 */
2656
2657 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2658
2659 /*
2660 * j=(j+1) & (~1) (see the cephes sources)
2661 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2662 */
2663
2664 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2665 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2666 /*
2667 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2668 */
2669 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2670 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2671
2672 /*
2673 * y = _mm_cvtepi32_ps(emm2);
2674 */
2675 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2676
2677 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2678 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2679 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2680 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2681
2682 /*
2683 * Argument used for poly selection and sign bit determination
2684 * is different for sin vs. cos.
2685 */
2686 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2687 emm2_and;
2688
2689 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2690 LLVMBuildNot(b, emm2_2, ""), ""),
2691 const_29, "sign_bit") :
2692 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2693 LLVMBuildShl(b, emm2_add,
2694 const_29, ""), ""),
2695 sign_mask, "sign_bit");
2696
2697 /*
2698 * get the polynomial selection mask
2699 * there is one polynomial for 0 <= x <= Pi/4
2700 * and another one for Pi/4 < x <= Pi/2
2701 * Both branches will be computed.
2702 *
2703 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2704 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2705 */
2706
2707 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2708 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2709 int_type, PIPE_FUNC_EQUAL,
2710 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2711
2712 /*
2713 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2714 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2715 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2716 */
2717 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2718 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2719 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2720
2721 /*
2722 * The magic pass: "Extended precision modular arithmetic"
2723 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2724 * xmm1 = _mm_mul_ps(y, xmm1);
2725 * xmm2 = _mm_mul_ps(y, xmm2);
2726 * xmm3 = _mm_mul_ps(y, xmm3);
2727 */
2728 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2729 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2730 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2731
2732 /*
2733 * x = _mm_add_ps(x, xmm1);
2734 * x = _mm_add_ps(x, xmm2);
2735 * x = _mm_add_ps(x, xmm3);
2736 */
2737
2738 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2739 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2740 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2741
2742 /*
2743 * Evaluate the first polynomial (0 <= x <= Pi/4)
2744 *
2745 * z = _mm_mul_ps(x,x);
2746 */
2747 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2748
2749 /*
2750 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2751 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2752 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2753 */
2754 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2755 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2756 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2757
2758 /*
2759 * y = *(v4sf*)_ps_coscof_p0;
2760 * y = _mm_mul_ps(y, z);
2761 */
2762 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2763 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2764 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2765 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2766 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2767 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2768
2769
2770 /*
2771 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2772 * y = _mm_sub_ps(y, tmp);
2773 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2774 */
2775 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2776 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2777 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2778 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2779 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2780
2781 /*
2782 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2783 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2784 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2785 */
2786 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2787 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2788 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2789
2790 /*
2791 * Evaluate the second polynomial (0 <= x <= Pi/4)
2792 *
2793 * y2 = *(v4sf*)_ps_sincof_p0;
2794 * y2 = _mm_mul_ps(y2, z);
2795 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2796 * y2 = _mm_mul_ps(y2, z);
2797 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2798 * y2 = _mm_mul_ps(y2, z);
2799 * y2 = _mm_mul_ps(y2, x);
2800 * y2 = _mm_add_ps(y2, x);
2801 */
2802
2803 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2804 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2805 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2806 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2807 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2808 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2809 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2810
2811 /*
2812 * select the correct result from the two polynomials
2813 * xmm3 = poly_mask;
2814 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2815 * y = _mm_andnot_ps(xmm3, y);
2816 * y = _mm_or_ps(y,y2);
2817 */
2818 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2819 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2820 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2821 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2822 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2823 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2824
2825 /*
2826 * update the sign
2827 * y = _mm_xor_ps(y, sign_bit);
2828 */
2829 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2830 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2831
2832 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2833
2834 /* clamp output to be within [-1, 1] */
2835 y_result = lp_build_clamp(bld, y_result,
2836 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2837 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2838 /* If a is -inf, inf or NaN then return NaN */
2839 y_result = lp_build_select(bld, isfinite, y_result,
2840 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2841 return y_result;
2842 }
2843
2844
2845 /**
2846 * Generate sin(a)
2847 */
2848 LLVMValueRef
2849 lp_build_sin(struct lp_build_context *bld,
2850 LLVMValueRef a)
2851 {
2852 return lp_build_sin_or_cos(bld, a, FALSE);
2853 }
2854
2855
2856 /**
2857 * Generate cos(a)
2858 */
2859 LLVMValueRef
2860 lp_build_cos(struct lp_build_context *bld,
2861 LLVMValueRef a)
2862 {
2863 return lp_build_sin_or_cos(bld, a, TRUE);
2864 }
2865
2866
2867 /**
2868 * Generate pow(x, y)
2869 */
2870 LLVMValueRef
2871 lp_build_pow(struct lp_build_context *bld,
2872 LLVMValueRef x,
2873 LLVMValueRef y)
2874 {
2875 /* TODO: optimize the constant case */
2876 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2877 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2878 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2879 __FUNCTION__);
2880 }
2881
2882 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2883 }
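/*
 * Note: since this expands to exp2(y * log2(x)) using the non-safe log2,
 * results for x <= 0, infinities or NaNs do not in general match IEEE pow()
 * semantics; callers that care about those inputs must handle them
 * themselves.
 */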
2884
2885
2886 /**
2887 * Generate exp(x)
2888 */
2889 LLVMValueRef
2890 lp_build_exp(struct lp_build_context *bld,
2891 LLVMValueRef x)
2892 {
2893 /* log2(e) = 1/log(2) */
2894 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2895 1.4426950408889634);
2896
2897 assert(lp_check_value(bld->type, x));
2898
2899 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2900 }
2901
2902
2903 /**
2904 * Generate log(x)
2905 * Behavior is undefined with infs, 0s and nans
2906 */
2907 LLVMValueRef
2908 lp_build_log(struct lp_build_context *bld,
2909 LLVMValueRef x)
2910 {
2911 /* log(2) */
2912 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2913 0.69314718055994529);
2914
2915 assert(lp_check_value(bld->type, x));
2916
2917 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2918 }
2919
2920 /**
2921 * Generate log(x) that handles edge cases (infs, 0s and nans)
2922 */
2923 LLVMValueRef
2924 lp_build_log_safe(struct lp_build_context *bld,
2925 LLVMValueRef x)
2926 {
2927 /* log(2) */
2928 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2929 0.69314718055994529);
2930
2931 assert(lp_check_value(bld->type, x));
2932
2933 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2934 }
2935
2936
2937 /**
2938 * Generate polynomial.
2939 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2940 */
2941 LLVMValueRef
2942 lp_build_polynomial(struct lp_build_context *bld,
2943 LLVMValueRef x,
2944 const double *coeffs,
2945 unsigned num_coeffs)
2946 {
2947 const struct lp_type type = bld->type;
2948 LLVMValueRef even = NULL, odd = NULL;
2949 LLVMValueRef x2;
2950 unsigned i;
2951
2952 assert(lp_check_value(bld->type, x));
2953
2954 /* TODO: optimize the constant case */
2955 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2956 LLVMIsConstant(x)) {
2957 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2958 __FUNCTION__);
2959 }
2960
2961 /*
2962 * Calculate odd and even terms separately to decrease data dependency
2963 * Ex:
2964 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2965 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2966 */
2967 x2 = lp_build_mul(bld, x, x);
2968
2969 for (i = num_coeffs; i--; ) {
2970 LLVMValueRef coeff;
2971
2972 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2973
2974 if (i % 2 == 0) {
2975 if (even)
2976 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2977 else
2978 even = coeff;
2979 } else {
2980 if (odd)
2981 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2982 else
2983 odd = coeff;
2984 }
2985 }
2986
2987 if (odd)
2988 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2989 else if (even)
2990 return even;
2991 else
2992 return bld->undef;
2993 }
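/*
 * For example, with four coefficients the loop above produces
 *    even = c0 + x2*c2,   odd = c1 + x2*c3,   result = odd*x + even
 * which equals c0 + c1*x + c2*x^2 + c3*x^3, but as two short dependency
 * chains instead of one long Horner chain.
 */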
2994
2995
2996 /**
2997 * Minimax polynomial fit of 2**x, in range [0, 1[
2998 */
2999 const double lp_build_exp2_polynomial[] = {
3000 #if EXP_POLY_DEGREE == 5
3001 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3002 0.693153073200168932794,
3003 0.240153617044375388211,
3004 0.0558263180532956664775,
3005 0.00898934009049466391101,
3006 0.00187757667519147912699
3007 #elif EXP_POLY_DEGREE == 4
3008 1.00000259337069434683,
3009 0.693003834469974940458,
3010 0.24144275689150793076,
3011 0.0520114606103070150235,
3012 0.0135341679161270268764
3013 #elif EXP_POLY_DEGREE == 3
3014 0.999925218562710312959,
3015 0.695833540494823811697,
3016 0.226067155427249155588,
3017 0.0780245226406372992967
3018 #elif EXP_POLY_DEGREE == 2
3019 1.00172476321474503578,
3020 0.657636275736077639316,
3021 0.33718943461968720704
3022 #else
3023 #error
3024 #endif
3025 };
3026
3027
3028 LLVMValueRef
3029 lp_build_exp2(struct lp_build_context *bld,
3030 LLVMValueRef x)
3031 {
3032 LLVMBuilderRef builder = bld->gallivm->builder;
3033 const struct lp_type type = bld->type;
3034 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3035 LLVMValueRef ipart = NULL;
3036 LLVMValueRef fpart = NULL;
3037 LLVMValueRef expipart = NULL;
3038 LLVMValueRef expfpart = NULL;
3039 LLVMValueRef res = NULL;
3040
3041 assert(lp_check_value(bld->type, x));
3042
3043
3044 /* TODO: optimize the constant case */
3045 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3046 LLVMIsConstant(x)) {
3047 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3048 __FUNCTION__);
3049 }
3050
3051 assert(type.floating && type.width == 32);
3052
3053 /* We want to preserve NaN and make sure that for exp2, if x > 128,
3054 * the result is INF and if it's smaller than -126.9 the result is 0 */
3055 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3056 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
3057 x = lp_build_max(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), x);
3058
3059 /* ipart = floor(x) */
3060 /* fpart = x - ipart */
3061 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3062
3063
3064
3065 /* expipart = (float) (1 << ipart) */
3066 expipart = LLVMBuildAdd(builder, ipart,
3067 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3068 expipart = LLVMBuildShl(builder, expipart,
3069 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3070 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3071
3072
3073 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3074 Elements(lp_build_exp2_polynomial));
3075
3076 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3077
3078
3079 return res;
3080 }
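/*
 * Illustration of the split, e.g. for x = 3.7: ipart = 3, fpart = 0.7.
 * expipart is formed by putting (3 + 127) into the float exponent field,
 * which reinterprets as 8.0 = 2^3; expfpart approximates 2^0.7 ~= 1.6245
 * via the polynomial, giving a product of ~12.996 ~= 2^3.7.
 */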
3081
3082
3083
3084 /**
3085 * Extract the exponent of an IEEE-754 floating point value.
3086 *
3087 * Optionally apply an integer bias.
3088 *
3089 * Result is an integer value with
3090 *
3091 * ifloor(log2(x)) + bias
3092 */
3093 LLVMValueRef
3094 lp_build_extract_exponent(struct lp_build_context *bld,
3095 LLVMValueRef x,
3096 int bias)
3097 {
3098 LLVMBuilderRef builder = bld->gallivm->builder;
3099 const struct lp_type type = bld->type;
3100 unsigned mantissa = lp_mantissa(type);
3101 LLVMValueRef res;
3102
3103 assert(type.floating);
3104
3105 assert(lp_check_value(bld->type, x));
3106
3107 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3108
3109 res = LLVMBuildLShr(builder, x,
3110 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3111 res = LLVMBuildAnd(builder, res,
3112 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3113 res = LLVMBuildSub(builder, res,
3114 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3115
3116 return res;
3117 }
3118
3119
3120 /**
3121 * Extract the mantissa of a floating point value.
3122 *
3123 * Result is a floating point value with
3124 *
3125 * x / 2**floor(log2(x))
3126 */
3127 LLVMValueRef
3128 lp_build_extract_mantissa(struct lp_build_context *bld,
3129 LLVMValueRef x)
3130 {
3131 LLVMBuilderRef builder = bld->gallivm->builder;
3132 const struct lp_type type = bld->type;
3133 unsigned mantissa = lp_mantissa(type);
3134 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3135 (1ULL << mantissa) - 1);
3136 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3137 LLVMValueRef res;
3138
3139 assert(lp_check_value(bld->type, x));
3140
3141 assert(type.floating);
3142
3143 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3144
3145 /* res = x / 2**ipart */
3146 res = LLVMBuildAnd(builder, x, mantmask, "");
3147 res = LLVMBuildOr(builder, res, one, "");
3148 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3149
3150 return res;
3151 }
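/*
 * Example for the two helpers above: x = 6.0 is stored as 1.5 * 2^2, so
 * lp_build_extract_exponent(bld, x, 0) yields 2 and
 * lp_build_extract_mantissa(bld, x) yields 1.5; for normalized inputs the
 * mantissa lies in [1, 2) and x == mantissa * 2^exponent.
 */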
3152
3153
3154
3155 /**
3156 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3157 * These coefficients can be generated with
3158 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3159 */
3160 const double lp_build_log2_polynomial[] = {
3161 #if LOG_POLY_DEGREE == 5
3162 2.88539008148777786488L,
3163 0.961796878841293367824L,
3164 0.577058946784739859012L,
3165 0.412914355135828735411L,
3166 0.308591899232910175289L,
3167 0.352376952300281371868L,
3168 #elif LOG_POLY_DEGREE == 4
3169 2.88539009343309178325L,
3170 0.961791550404184197881L,
3171 0.577440339438736392009L,
3172 0.403343858251329912514L,
3173 0.406718052498846252698L,
3174 #elif LOG_POLY_DEGREE == 3
3175 2.88538959748872753838L,
3176 0.961932915889597772928L,
3177 0.571118517972136195241L,
3178 0.493997535084709500285L,
3179 #else
3180 #error
3181 #endif
3182 };
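/*
 * With y = (m - 1)/(m + 1) the identity log2(m) = log2((1 + y)/(1 - y))
 * = (2/ln 2) * atanh(y) is what this polynomial approximates as y * P(y^2);
 * consistently, the leading coefficient above is ~2.88539 ~= 2/ln 2.
 */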
3183
3184 /**
3185 * See http://www.devmaster.net/forums/showthread.php?p=43580
3186 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3187 * http://www.nezumi.demon.co.uk/consult/logx.htm
3188 *
3189 * If handle_edge_cases is true the function will perform computations
3190 * to match the required D3D10+ behavior for each of the edge cases.
3191 * That means that if input is:
3192 * - less than zero (down to and including -inf), then NaN will be returned
3193 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3194 * - +infinity, then +infinity will be returned
3195 * - NaN, then NaN will be returned
3196 *
3197 * Those checks are fairly expensive, so if you don't need them make sure
3198 * handle_edge_cases is false.
3199 */
3200 void
3201 lp_build_log2_approx(struct lp_build_context *bld,
3202 LLVMValueRef x,
3203 LLVMValueRef *p_exp,
3204 LLVMValueRef *p_floor_log2,
3205 LLVMValueRef *p_log2,
3206 boolean handle_edge_cases)
3207 {
3208 LLVMBuilderRef builder = bld->gallivm->builder;
3209 const struct lp_type type = bld->type;
3210 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3211 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3212
3213 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3214 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3215 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3216
3217 LLVMValueRef i = NULL;
3218 LLVMValueRef y = NULL;
3219 LLVMValueRef z = NULL;
3220 LLVMValueRef exp = NULL;
3221 LLVMValueRef mant = NULL;
3222 LLVMValueRef logexp = NULL;
3223 LLVMValueRef logmant = NULL;
3224 LLVMValueRef res = NULL;
3225
3226 assert(lp_check_value(bld->type, x));
3227
3228 if(p_exp || p_floor_log2 || p_log2) {
3229 /* TODO: optimize the constant case */
3230 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3231 LLVMIsConstant(x)) {
3232 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3233 __FUNCTION__);
3234 }
3235
3236 assert(type.floating && type.width == 32);
3237
3238 /*
3239 * We don't explicitly handle denormalized numbers. They will yield a
3240 * result in the neighbourhood of -127, which appears to be
3241 * adequate.
3242 */
3243
3244 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3245
3246 /* exp = (float) exponent(x) */
3247 exp = LLVMBuildAnd(builder, i, expmask, "");
3248 }
3249
3250 if(p_floor_log2 || p_log2) {
3251 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3252 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3253 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3254 }
3255
3256 if(p_log2) {
3257 /* mant = 1 + (float) mantissa(x) */
3258 mant = LLVMBuildAnd(builder, i, mantmask, "");
3259 mant = LLVMBuildOr(builder, mant, one, "");
3260 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3261
3262 /* y = (mant - 1) / (mant + 1) */
3263 y = lp_build_div(bld,
3264 lp_build_sub(bld, mant, bld->one),
3265 lp_build_add(bld, mant, bld->one)
3266 );
3267
3268 /* z = y^2 */
3269 z = lp_build_mul(bld, y, y);
3270
3271 /* compute P(z) */
3272 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3273 Elements(lp_build_log2_polynomial));
3274
3275 /* logmant = y * P(z) */
3276 logmant = lp_build_mul(bld, y, logmant);
3277
3278 res = lp_build_add(bld, logmant, logexp);
3279
3280 if (type.floating && handle_edge_cases) {
3281 LLVMValueRef negmask, infmask, zmask;
3282 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3283 lp_build_const_vec(bld->gallivm, type, 0.0f));
3284 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3285 lp_build_const_vec(bld->gallivm, type, 0.0f));
3286 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3287 lp_build_const_vec(bld->gallivm, type, INFINITY));
3288
3289 /* If x is equal to inf make sure we return inf */
3290 res = lp_build_select(bld, infmask,
3291 lp_build_const_vec(bld->gallivm, type, INFINITY),
3292 res);
3293 /* If x is equal to 0, return -inf */
3294 res = lp_build_select(bld, zmask,
3295 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3296 res);
3297 /* If x is nan or less than 0, return nan */
3298 res = lp_build_select(bld, negmask,
3299 lp_build_const_vec(bld->gallivm, type, NAN),
3300 res);
3301 }
3302 }
3303
3304 if(p_exp) {
3305 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3306 *p_exp = exp;
3307 }
3308
3309 if(p_floor_log2)
3310 *p_floor_log2 = logexp;
3311
3312 if(p_log2)
3313 *p_log2 = res;
3314 }
3315
3316
3317 /*
3318 * log2 implementation which doesn't have special code to
3319 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3320 * the results for those cases are undefined.
3321 */
3322 LLVMValueRef
3323 lp_build_log2(struct lp_build_context *bld,
3324 LLVMValueRef x)
3325 {
3326 LLVMValueRef res;
3327 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3328 return res;
3329 }
3330
3331 /*
3332 * Version of log2 which handles all edge cases.
3333 * Look at documentation of lp_build_log2_approx for
3334 * description of the behavior for each of the edge cases.
3335 */
3336 LLVMValueRef
3337 lp_build_log2_safe(struct lp_build_context *bld,
3338 LLVMValueRef x)
3339 {
3340 LLVMValueRef res;
3341 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3342 return res;
3343 }
3344
3345
3346 /**
3347 * Faster (and less accurate) log2.
3348 *
3349 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3350 *
3351 * Piece-wise linear approximation, with exact results when x is a
3352 * power of two.
3353 *
3354 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3355 */
3356 LLVMValueRef
3357 lp_build_fast_log2(struct lp_build_context *bld,
3358 LLVMValueRef x)
3359 {
3360 LLVMBuilderRef builder = bld->gallivm->builder;
3361 LLVMValueRef ipart;
3362 LLVMValueRef fpart;
3363
3364 assert(lp_check_value(bld->type, x));
3365
3366 assert(bld->type.floating);
3367
3368 /* ipart = floor(log2(x)) - 1 */
3369 ipart = lp_build_extract_exponent(bld, x, -1);
3370 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3371
3372 /* fpart = x / 2**ipart */
3373 fpart = lp_build_extract_mantissa(bld, x);
3374
3375 /* ipart + fpart */
3376 return LLVMBuildFAdd(builder, ipart, fpart, "");
3377 }
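/*
 * Example: x = 6.0 gives ipart = floor(log2(6)) - 1 = 1 and fpart = 6/4 = 1.5,
 * so the result is 2.5 versus the true log2(6) ~= 2.585; for powers of two
 * (e.g. x = 8.0 -> 2 + 1.0 = 3.0) the result is exact.
 */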
3378
3379
3380 /**
3381 * Fast implementation of iround(log2(x)).
3382 *
3383 * Not an approximation -- it should give accurate results all the time.
3384 */
3385 LLVMValueRef
3386 lp_build_ilog2(struct lp_build_context *bld,
3387 LLVMValueRef x)
3388 {
3389 LLVMBuilderRef builder = bld->gallivm->builder;
3390 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3391 LLVMValueRef ipart;
3392
3393 assert(bld->type.floating);
3394
3395 assert(lp_check_value(bld->type, x));
3396
3397 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3398 x = LLVMBuildFMul(builder, x, sqrt2, "");
3399
3400 /* ipart = floor(log2(x) + 0.5) */
3401 ipart = lp_build_extract_exponent(bld, x, 0);
3402
3403 return ipart;
3404 }
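/*
 * Multiplying by sqrt(2) shifts log2(x) up by 0.5, so flooring the exponent
 * of the product yields round-to-nearest of log2(x). E.g. x = 5
 * (log2 ~= 2.32): 5 * 1.414 ~= 7.07 -> exponent 2; x = 6 (log2 ~= 2.58):
 * 6 * 1.414 ~= 8.49 -> exponent 3.
 */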
3405
3406 LLVMValueRef
3407 lp_build_mod(struct lp_build_context *bld,
3408 LLVMValueRef x,
3409 LLVMValueRef y)
3410 {
3411 LLVMBuilderRef builder = bld->gallivm->builder;
3412 LLVMValueRef res;
3413 const struct lp_type type = bld->type;
3414
3415 assert(lp_check_value(type, x));
3416 assert(lp_check_value(type, y));
3417
3418 if (type.floating)
3419 res = LLVMBuildFRem(builder, x, y, "");
3420 else if (type.sign)
3421 res = LLVMBuildSRem(builder, x, y, "");
3422 else
3423 res = LLVMBuildURem(builder, x, y, "");
3424 return res;
3425 }
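/*
 * Note: LLVM's FRem/SRem follow C semantics, i.e. the remainder takes the
 * sign of the dividend x; this is not equivalent to GLSL mod() for negative x.
 */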
3426
3427
3428 /*
3429 * For floating inputs it creates and returns a mask
3430 * which is all 1's for channels which are NaN.
3431 * Channels inside x which are not NaN will be 0.
3432 */
3433 LLVMValueRef
3434 lp_build_isnan(struct lp_build_context *bld,
3435 LLVMValueRef x)
3436 {
3437 LLVMValueRef mask;
3438 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3439
3440 assert(bld->type.floating);
3441 assert(lp_check_value(bld->type, x));
3442
3443 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3444 "isnotnan");
3445 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3446 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3447 return mask;
3448 }
3449
3450 /* Returns all 1's for floating point numbers that are
3451 * finite, and returns all zeros for -inf,
3452 * inf and NaNs */
3453 LLVMValueRef
3454 lp_build_isfinite(struct lp_build_context *bld,
3455 LLVMValueRef x)
3456 {
3457 LLVMBuilderRef builder = bld->gallivm->builder;
3458 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3459 struct lp_type int_type = lp_int_type(bld->type);
3460 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3461 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3462 0x7f800000);
3463
3464 if (!bld->type.floating) {
3465 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3466 }
3467 assert(bld->type.floating);
3468 assert(lp_check_value(bld->type, x));
3469 assert(bld->type.width == 32);
3470
3471 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3472 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3473 intx, infornan32);
3474 }
3475
3476 /*
3477 * Returns true if the number is nan or inf and false otherwise.
3478 * The input has to be a floating point vector.
3479 */
3480 LLVMValueRef
3481 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3482 const struct lp_type type,
3483 LLVMValueRef x)
3484 {
3485 LLVMBuilderRef builder = gallivm->builder;
3486 struct lp_type int_type = lp_int_type(type);
3487 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3488 0x7f800000);
3489 LLVMValueRef ret;
3490
3491 assert(type.floating);
3492
3493 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3494 ret = LLVMBuildAnd(builder, ret, const0, "");
3495 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3496 ret, const0);
3497
3498 return ret;
3499 }
3500
3501
3502 LLVMValueRef
3503 lp_build_fpstate_get(struct gallivm_state *gallivm)
3504 {
3505 if (util_cpu_caps.has_sse) {
3506 LLVMBuilderRef builder = gallivm->builder;
3507 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3508 gallivm,
3509 LLVMInt32TypeInContext(gallivm->context),
3510 "mxcsr_ptr");
3511 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3512 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3513 lp_build_intrinsic(builder,
3514 "llvm.x86.sse.stmxcsr",
3515 LLVMVoidTypeInContext(gallivm->context),
3516 &mxcsr_ptr8, 1);
3517 return mxcsr_ptr;
3518 }
3519 return 0;
3520 }
3521
3522 void
3523 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3524 boolean zero)
3525 {
3526 if (util_cpu_caps.has_sse) {
3527 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3528 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3529
3530 LLVMBuilderRef builder = gallivm->builder;
3531 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3532 LLVMValueRef mxcsr =
3533 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3534
3535 if (util_cpu_caps.has_daz) {
3536 /* Enable denormals-are-zero mode */
3537 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3538 }
3539 if (zero) {
3540 mxcsr = LLVMBuildOr(builder, mxcsr,
3541 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3542 } else {
3543 mxcsr = LLVMBuildAnd(builder, mxcsr,
3544 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3545 }
3546
3547 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3548 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3549 }
3550 }
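/*
 * Typical usage of the fpstate helpers (a sketch, not tied to any particular
 * caller): save the current MXCSR, force flush-to-zero around a block of
 * generated code, then restore it:
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit FP code that benefits from FTZ/DAZ ...
 *    lp_build_fpstate_set(gallivm, saved);
 */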
3551
3552 void
3553 lp_build_fpstate_set(struct gallivm_state *gallivm,
3554 LLVMValueRef mxcsr_ptr)
3555 {
3556 if (util_cpu_caps.has_sse) {
3557 LLVMBuilderRef builder = gallivm->builder;
3558 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3559 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3560 lp_build_intrinsic(builder,
3561 "llvm.x86.sse.ldmxcsr",
3562 LLVMVoidTypeInContext(gallivm->context),
3563 &mxcsr_ptr, 1);
3564 }
3565 }