gallivm: Use llvm.fabs.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86 * No checks for special case values (a or b equal to 0 or 1) are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if (intrinsic) {
198 /* We need to handle NaNs for floating point numbers. If one of the
199 * inputs is a NaN the other should be returned (required by both D3D10+
200 * and OpenCL).
201 * The SSE intrinsics return the second operand when either input is a NaN,
202 * so we need special code to handle those cases.
203 */
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
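/*
 * A minimal scalar sketch (illustration only, kept out of the build) of the
 * NaN fixup described in lp_build_min_simple above: the SSE min returns the
 * second operand when either input is a NaN, so GALLIVM_NAN_RETURN_OTHER is
 * obtained by selecting 'a' whenever 'b' is a NaN.
 */
#if 0
static inline float
ref_min_return_other(float a, float b)
{
   float min = (a < b) ? a : b;   /* same operand choice as the SSE min */
   if (b != b)                    /* b is NaN: return the other operand */
      return a;
   return min;                    /* if a is NaN, min is already b */
}
#endif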
264
265 /**
266 * Generate max(a, b)
267 * No checks for special case values (a or b equal to 0 or 1) are done.
268 * NaN's are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323 if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if (intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, max;
385 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, max);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, max);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
476
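/*
 * Illustration only (not built): for unsigned normalized integers the value
 * 1.0 is represented by all bits set, which is why lp_build_comp above can
 * use a bitwise not.  Scalar 8-bit example:
 */
#if 0
static inline uint8_t
ref_comp_unorm8(uint8_t a)
{
   /* 255 represents 1.0, and 255 - a is exactly ~a for 8-bit values */
   return (uint8_t)(255 - a);
}
#endif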
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if (intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 if(type.norm && !type.floating && !type.fixed) {
526 if (type.sign) {
527 uint64_t sign = (uint64_t)1 << (type.width - 1);
528 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
529 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
532 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
534 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
535 } else {
536 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
537 }
538 }
539
540 if(LLVMIsConstant(a) && LLVMIsConstant(b))
541 if (type.floating)
542 res = LLVMConstFAdd(a, b);
543 else
544 res = LLVMConstAdd(a, b);
545 else
546 if (type.floating)
547 res = LLVMBuildFAdd(builder, a, b, "");
548 else
549 res = LLVMBuildAdd(builder, a, b, "");
550
551 /* clamp to ceiling of 1.0 */
552 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
553 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
554
555 /* XXX clamp to floor of -1 or 0??? */
556
557 return res;
558 }
559
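/*
 * Illustration only (not built): scalar reference for the signed saturating
 * add emulation in lp_build_add above, shown for 8-bit values.  'a' is
 * clamped so that the plain add can no longer overflow.
 */
#if 0
static inline int8_t
ref_padds_b(int8_t a, int8_t b)
{
   if (b > 0) {
      if (a > (int8_t)(127 - b))    /* a_clamp_max: keep a + b <= 127 */
         a = (int8_t)(127 - b);
   } else {
      if (a < (int8_t)(-128 - b))   /* a_clamp_min: keep a + b >= -128 */
         a = (int8_t)(-128 - b);
   }
   return (int8_t)(a + b);
}
#endif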
560
561 /** Return the scalar sum of the elements of a.
562 * Should avoid this operation whenever possible.
563 */
564 LLVMValueRef
565 lp_build_horizontal_add(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 LLVMBuilderRef builder = bld->gallivm->builder;
569 const struct lp_type type = bld->type;
570 LLVMValueRef index, res;
571 unsigned i, length;
572 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
573 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
574 LLVMValueRef vecres, elem2;
575
576 assert(lp_check_value(type, a));
577
578 if (type.length == 1) {
579 return a;
580 }
581
582 assert(!bld->type.norm);
583
584 /*
585 * for byte vectors we can do much better with psadbw.
586 * Using repeated shuffle/adds here. Note with multiple vectors
587 * this can be done more efficiently as outlined in the intel
588 * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
590 * sizes.
591 */
592
593 vecres = a;
594 length = type.length / 2;
595 while (length > 1) {
596 LLVMValueRef vec1, vec2;
597 for (i = 0; i < length; i++) {
598 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
599 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
600 }
601 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
602 LLVMConstVector(shuffles1, length), "");
603 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
604 LLVMConstVector(shuffles2, length), "");
605 if (type.floating) {
606 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
607 }
608 else {
609 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
610 }
611 length = length >> 1;
612 }
613
614 /* always have vector of size 2 here */
615 assert(length == 1);
616
617 index = lp_build_const_int32(bld->gallivm, 0);
618 res = LLVMBuildExtractElement(builder, vecres, index, "");
619 index = lp_build_const_int32(bld->gallivm, 1);
620 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
621
622 if (type.floating)
623 res = LLVMBuildFAdd(builder, res, elem2, "");
624 else
625 res = LLVMBuildAdd(builder, res, elem2, "");
626
627 return res;
628 }
629
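/*
 * Illustration only (not built): the loop in lp_build_horizontal_add above
 * halves the vector each iteration by adding the upper half onto the lower
 * half.  Scalar reference for a 4-wide vector:
 */
#if 0
static inline float
ref_horizontal_add4(const float v[4])
{
   float t0 = v[0] + v[2];   /* first step: lower half + upper half */
   float t1 = v[1] + v[3];
   return t0 + t1;           /* final step: add the two remaining elements */
}
#endif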
630 /**
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632 * This uses the technique as outlined in Intel Optimization Manual.
633 */
634 static LLVMValueRef
635 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
636 LLVMValueRef src[4])
637 {
638 struct gallivm_state *gallivm = bld->gallivm;
639 LLVMBuilderRef builder = gallivm->builder;
640 LLVMValueRef shuffles[4];
641 LLVMValueRef tmp[4];
642 LLVMValueRef sumtmp[2], shuftmp[2];
643
644 /* lower half of regs */
645 shuffles[0] = lp_build_const_int32(gallivm, 0);
646 shuffles[1] = lp_build_const_int32(gallivm, 1);
647 shuffles[2] = lp_build_const_int32(gallivm, 4);
648 shuffles[3] = lp_build_const_int32(gallivm, 5);
649 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
650 LLVMConstVector(shuffles, 4), "");
651 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
652 LLVMConstVector(shuffles, 4), "");
653
654 /* upper half of regs */
655 shuffles[0] = lp_build_const_int32(gallivm, 2);
656 shuffles[1] = lp_build_const_int32(gallivm, 3);
657 shuffles[2] = lp_build_const_int32(gallivm, 6);
658 shuffles[3] = lp_build_const_int32(gallivm, 7);
659 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
660 LLVMConstVector(shuffles, 4), "");
661 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
662 LLVMConstVector(shuffles, 4), "");
663
664 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
665 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 0);
668 shuffles[1] = lp_build_const_int32(gallivm, 2);
669 shuffles[2] = lp_build_const_int32(gallivm, 4);
670 shuffles[3] = lp_build_const_int32(gallivm, 6);
671 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 shuffles[0] = lp_build_const_int32(gallivm, 1);
675 shuffles[1] = lp_build_const_int32(gallivm, 3);
676 shuffles[2] = lp_build_const_int32(gallivm, 5);
677 shuffles[3] = lp_build_const_int32(gallivm, 7);
678 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
679 LLVMConstVector(shuffles, 4), "");
680
681 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
682 }
683
684
685 /*
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
689 * output order.
690 *
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
697 */
698 LLVMValueRef
699 lp_build_hadd_partial4(struct lp_build_context *bld,
700 LLVMValueRef vectors[],
701 unsigned num_vecs)
702 {
703 struct gallivm_state *gallivm = bld->gallivm;
704 LLVMBuilderRef builder = gallivm->builder;
705 LLVMValueRef ret_vec;
706 LLVMValueRef tmp[4];
707 const char *intrinsic = NULL;
708
709 assert(num_vecs >= 2 && num_vecs <= 4);
710 assert(bld->type.floating);
711
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
715 */
716
717 tmp[0] = vectors[0];
718 tmp[1] = vectors[1];
719
720 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
721 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
722
723 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
724 bld->type.length == 4) {
725 intrinsic = "llvm.x86.sse3.hadd.ps";
726 }
727 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
728 bld->type.length == 8) {
729 intrinsic = "llvm.x86.avx.hadd.ps.256";
730 }
731 if (intrinsic) {
732 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 if (num_vecs > 2) {
736 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[2], tmp[3]);
739 }
740 else {
741 tmp[1] = tmp[0];
742 }
743 return lp_build_intrinsic_binary(builder, intrinsic,
744 lp_build_vec_type(gallivm, bld->type),
745 tmp[0], tmp[1]);
746 }
747
748 if (bld->type.length == 4) {
749 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
750 }
751 else {
752 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
753 unsigned j;
754 unsigned num_iter = bld->type.length / 4;
755 struct lp_type parttype = bld->type;
756 parttype.length = 4;
757 for (j = 0; j < num_iter; j++) {
758 LLVMValueRef partsrc[4];
759 unsigned i;
760 for (i = 0; i < 4; i++) {
761 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
762 }
763 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
764 }
765 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
766 }
767 return ret_vec;
768 }
769
770 /**
771 * Generate a - b
772 */
773 LLVMValueRef
774 lp_build_sub(struct lp_build_context *bld,
775 LLVMValueRef a,
776 LLVMValueRef b)
777 {
778 LLVMBuilderRef builder = bld->gallivm->builder;
779 const struct lp_type type = bld->type;
780 LLVMValueRef res;
781
782 assert(lp_check_value(type, a));
783 assert(lp_check_value(type, b));
784
785 if(b == bld->zero)
786 return a;
787 if(a == bld->undef || b == bld->undef)
788 return bld->undef;
789 if(a == b)
790 return bld->zero;
791
792 if(bld->type.norm) {
793 const char *intrinsic = NULL;
794
795 if(b == bld->one)
796 return bld->zero;
797
798 if (type.width * type.length == 128 &&
799 !type.floating && !type.fixed) {
800 if (util_cpu_caps.has_sse2) {
801 if(type.width == 8)
802 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
803 if(type.width == 16)
804 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps.has_altivec) {
806 if(type.width == 8)
807 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
808 if(type.width == 16)
809 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
810 }
811 }
812
813 if (intrinsic)
814 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
815 }
816
817 if(type.norm && !type.floating && !type.fixed) {
818 if (type.sign) {
819 uint64_t sign = (uint64_t)1 << (type.width - 1);
820 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
821 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
825 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
826 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
827 } else {
828 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
829 }
830 }
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b))
833 if (type.floating)
834 res = LLVMConstFSub(a, b);
835 else
836 res = LLVMConstSub(a, b);
837 else
838 if (type.floating)
839 res = LLVMBuildFSub(builder, a, b, "");
840 else
841 res = LLVMBuildSub(builder, a, b, "");
842
843 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
844 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
845
846 return res;
847 }
848
849
850
851 /**
852 * Normalized multiplication.
853 *
854 * There are several approaches for (using 8-bit normalized multiplication as
855 * an example):
856 *
857 * - alpha plus one
858 *
859 * makes the following approximation to the division (Sree)
860 *
861 * a*b/255 ~= (a*(b + 1)) / 256
862 *
863 * which is the fastest method that satisfies the following OpenGL criteria of
864 *
865 * 0*0 = 0 and 255*255 = 255
866 *
867 * - geometric series
868 *
869 * takes the geometric series approximation to the division
870 *
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
872 *
873 * in this case just the first two terms to fit in 16bit arithmetic
874 *
875 * t/255 ~= (t + (t >> 8)) >> 8
876 *
877 * note that just by itself it doesn't satisfy the OpenGL criteria, as
878 * 255*255 yields 254, so the special case b = 255 must be accounted for, or
879 * roundoff must be used.
880 *
881 * - geometric series plus rounding
882 *
883 * when using the geometric series division, instead of truncating the result,
884 * use roundoff in the approximation (Jim Blinn)
885 *
886 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
887 *
888 * achieving exact results.
889 *
890 *
891 *
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
896 */
897 static LLVMValueRef
898 lp_build_mul_norm(struct gallivm_state *gallivm,
899 struct lp_type wide_type,
900 LLVMValueRef a, LLVMValueRef b)
901 {
902 LLVMBuilderRef builder = gallivm->builder;
903 struct lp_build_context bld;
904 unsigned n;
905 LLVMValueRef half;
906 LLVMValueRef ab;
907
908 assert(!wide_type.floating);
909 assert(lp_check_value(wide_type, a));
910 assert(lp_check_value(wide_type, b));
911
912 lp_build_context_init(&bld, gallivm, wide_type);
913
914 n = wide_type.width / 2;
915 if (wide_type.sign) {
916 --n;
917 }
918
919 /*
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
922 */
923
924 /*
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
926 */
927
928 ab = LLVMBuildMul(builder, a, b, "");
929 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
930
931 /*
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
933 */
934
935 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
936 if (wide_type.sign) {
937 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
938 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
939 half = lp_build_select(&bld, sign, minus_half, half);
940 }
941 ab = LLVMBuildAdd(builder, ab, half, "");
942
943 /* Final division */
944 ab = lp_build_shr_imm(&bld, ab, n);
945
946 return ab;
947 }
948
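/*
 * Illustration only (not built): scalar reference for the unsigned 8-bit
 * case of lp_build_mul_norm above, i.e. the rounded geometric series
 * a*b/255 ~= (a*b + (a*b >> 8) + 0x80) >> 8.
 */
#if 0
static inline uint8_t
ref_mul_norm_u8(uint8_t a, uint8_t b)
{
   unsigned ab = (unsigned)a * b;   /* 16-bit intermediate product */
   ab = ab + (ab >> 8);             /* second term of the geometric series */
   ab = ab + 0x80;                  /* roundoff */
   return (uint8_t)(ab >> 8);       /* final division by 2^8 */
}
#endif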
949 /**
950 * Generate a * b
951 */
952 LLVMValueRef
953 lp_build_mul(struct lp_build_context *bld,
954 LLVMValueRef a,
955 LLVMValueRef b)
956 {
957 LLVMBuilderRef builder = bld->gallivm->builder;
958 const struct lp_type type = bld->type;
959 LLVMValueRef shift;
960 LLVMValueRef res;
961
962 assert(lp_check_value(type, a));
963 assert(lp_check_value(type, b));
964
965 if(a == bld->zero)
966 return bld->zero;
967 if(a == bld->one)
968 return b;
969 if(b == bld->zero)
970 return bld->zero;
971 if(b == bld->one)
972 return a;
973 if(a == bld->undef || b == bld->undef)
974 return bld->undef;
975
976 if (!type.floating && !type.fixed && type.norm) {
977 struct lp_type wide_type = lp_wider_type(type);
978 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
979
980 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
981 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
982
983 /* PMULLW, PSRLW, PADDW */
984 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
985 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
986
987 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
988
989 return ab;
990 }
991
992 if(type.fixed)
993 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
994 else
995 shift = NULL;
996
997 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
998 if (type.floating)
999 res = LLVMConstFMul(a, b);
1000 else
1001 res = LLVMConstMul(a, b);
1002 if(shift) {
1003 if(type.sign)
1004 res = LLVMConstAShr(res, shift);
1005 else
1006 res = LLVMConstLShr(res, shift);
1007 }
1008 }
1009 else {
1010 if (type.floating)
1011 res = LLVMBuildFMul(builder, a, b, "");
1012 else
1013 res = LLVMBuildMul(builder, a, b, "");
1014 if(shift) {
1015 if(type.sign)
1016 res = LLVMBuildAShr(builder, res, shift, "");
1017 else
1018 res = LLVMBuildLShr(builder, res, shift, "");
1019 }
1020 }
1021
1022 return res;
1023 }
1024
1025
1026 /**
1027 * Small vector x scale multiplication optimization.
1028 */
1029 LLVMValueRef
1030 lp_build_mul_imm(struct lp_build_context *bld,
1031 LLVMValueRef a,
1032 int b)
1033 {
1034 LLVMBuilderRef builder = bld->gallivm->builder;
1035 LLVMValueRef factor;
1036
1037 assert(lp_check_value(bld->type, a));
1038
1039 if(b == 0)
1040 return bld->zero;
1041
1042 if(b == 1)
1043 return a;
1044
1045 if(b == -1)
1046 return lp_build_negate(bld, a);
1047
1048 if(b == 2 && bld->type.floating)
1049 return lp_build_add(bld, a, a);
1050
1051 if(util_is_power_of_two(b)) {
1052 unsigned shift = ffs(b) - 1;
1053
1054 if(bld->type.floating) {
1055 #if 0
1056 /*
1057 * Power of two multiplication by directly manipulating the exponent.
1058 *
1059 * XXX: This might not be always faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1061 * for Inf and NaN.
1062 */
1063 unsigned mantissa = lp_mantissa(bld->type);
1064 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1065 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1066 a = LLVMBuildAdd(builder, a, factor, "");
1067 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1068 return a;
1069 #endif
1070 }
1071 else {
1072 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1073 return LLVMBuildShl(builder, a, factor, "");
1074 }
1075 }
1076
1077 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1078 return lp_build_mul(bld, a, factor);
1079 }
1080
1081
1082 /**
1083 * Generate a / b
1084 */
1085 LLVMValueRef
1086 lp_build_div(struct lp_build_context *bld,
1087 LLVMValueRef a,
1088 LLVMValueRef b)
1089 {
1090 LLVMBuilderRef builder = bld->gallivm->builder;
1091 const struct lp_type type = bld->type;
1092
1093 assert(lp_check_value(type, a));
1094 assert(lp_check_value(type, b));
1095
1096 if(a == bld->zero)
1097 return bld->zero;
1098 if(a == bld->one && type.floating)
1099 return lp_build_rcp(bld, b);
1100 if(b == bld->zero)
1101 return bld->undef;
1102 if(b == bld->one)
1103 return a;
1104 if(a == bld->undef || b == bld->undef)
1105 return bld->undef;
1106
1107 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1108 if (type.floating)
1109 return LLVMConstFDiv(a, b);
1110 else if (type.sign)
1111 return LLVMConstSDiv(a, b);
1112 else
1113 return LLVMConstUDiv(a, b);
1114 }
1115
1116 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1117 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1118 type.floating)
1119 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1120
1121 if (type.floating)
1122 return LLVMBuildFDiv(builder, a, b, "");
1123 else if (type.sign)
1124 return LLVMBuildSDiv(builder, a, b, "");
1125 else
1126 return LLVMBuildUDiv(builder, a, b, "");
1127 }
1128
1129
1130 /**
1131 * Linear interpolation helper.
1132 *
1133 * @param flags LP_BLD_LERP_x flags. With LP_BLD_LERP_WIDE_NORMALIZED the
1134 * values are normalized integers, encoded twice as wide as the nominal type.
1135 *
1136 * @sa http://www.stereopsis.com/doubleblend.html
1137 */
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context *bld,
1140 LLVMValueRef x,
1141 LLVMValueRef v0,
1142 LLVMValueRef v1,
1143 unsigned flags)
1144 {
1145 unsigned half_width = bld->type.width/2;
1146 LLVMBuilderRef builder = bld->gallivm->builder;
1147 LLVMValueRef delta;
1148 LLVMValueRef res;
1149
1150 assert(lp_check_value(bld->type, x));
1151 assert(lp_check_value(bld->type, v0));
1152 assert(lp_check_value(bld->type, v1));
1153
1154 delta = lp_build_sub(bld, v1, v0);
1155
1156 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1157 if (!bld->type.sign) {
1158 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1159 /*
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161 * most significant bit to the least significant bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1163 */
1164
1165 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1166 }
1167
1168 /* (x * delta) >> n */
1169 res = lp_build_mul(bld, x, delta);
1170 res = lp_build_shr_imm(bld, res, half_width);
1171 } else {
1172 /*
1173 * The rescaling trick above doesn't work for signed numbers, so
1174 * use the 2**n - 1 division approximation in lp_build_mul_norm
1175 * instead.
1176 */
1177 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1178 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1179 }
1180 } else {
1181 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1182 res = lp_build_mul(bld, x, delta);
1183 }
1184
1185 res = lp_build_add(bld, v0, res);
1186
1187 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1188 bld->type.fixed) {
1189 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1190 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1191 * but it will be wrong for true fixed point use cases. Basically we need
1192 * a more powerful lp_type, capable of further distinguishing the values
1193 * interpretation from the value storage. */
1194 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1195 }
1196
1197 return res;
1198 }
1199
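/*
 * Illustration only (not built): scalar reference for the unsigned
 * LP_BLD_LERP_WIDE_NORMALIZED path in lp_build_lerp_simple above, with
 * 8-bit values lerped in 16-bit (or wider) intermediates.
 */
#if 0
static inline uint8_t
ref_lerp_unorm8(uint8_t x, uint8_t v0, uint8_t v1)
{
   int delta = v1 - v0;
   int xs = x + (x >> 7);                        /* rescale [0,255] to [0,256] */
   return (uint8_t)(v0 + ((xs * delta) >> 8));   /* v0 + x*(v1 - v0) */
}
#endif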
1200
1201 /**
1202 * Linear interpolation.
1203 */
1204 LLVMValueRef
1205 lp_build_lerp(struct lp_build_context *bld,
1206 LLVMValueRef x,
1207 LLVMValueRef v0,
1208 LLVMValueRef v1,
1209 unsigned flags)
1210 {
1211 const struct lp_type type = bld->type;
1212 LLVMValueRef res;
1213
1214 assert(lp_check_value(type, x));
1215 assert(lp_check_value(type, v0));
1216 assert(lp_check_value(type, v1));
1217
1218 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1219
1220 if (type.norm) {
1221 struct lp_type wide_type;
1222 struct lp_build_context wide_bld;
1223 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1224
1225 assert(type.length >= 2);
1226
1227 /*
1228 * Create a wider integer type, enough to hold the
1229 * intermediate result of the multiplication.
1230 */
1231 memset(&wide_type, 0, sizeof wide_type);
1232 wide_type.sign = type.sign;
1233 wide_type.width = type.width*2;
1234 wide_type.length = type.length/2;
1235
1236 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1237
1238 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1239 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1240 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1241
1242 /*
1243 * Lerp both halves.
1244 */
1245
1246 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1247
1248 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1249 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1250
1251 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1252 } else {
1253 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1254 }
1255
1256 return res;
1257 }
1258
1259
1260 /**
1261 * Bilinear interpolation.
1262 *
1263 * Value indices are in v_{yx}.
1264 */
1265 LLVMValueRef
1266 lp_build_lerp_2d(struct lp_build_context *bld,
1267 LLVMValueRef x,
1268 LLVMValueRef y,
1269 LLVMValueRef v00,
1270 LLVMValueRef v01,
1271 LLVMValueRef v10,
1272 LLVMValueRef v11,
1273 unsigned flags)
1274 {
1275 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1276 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1277 return lp_build_lerp(bld, y, v0, v1, flags);
1278 }
1279
1280
1281 LLVMValueRef
1282 lp_build_lerp_3d(struct lp_build_context *bld,
1283 LLVMValueRef x,
1284 LLVMValueRef y,
1285 LLVMValueRef z,
1286 LLVMValueRef v000,
1287 LLVMValueRef v001,
1288 LLVMValueRef v010,
1289 LLVMValueRef v011,
1290 LLVMValueRef v100,
1291 LLVMValueRef v101,
1292 LLVMValueRef v110,
1293 LLVMValueRef v111,
1294 unsigned flags)
1295 {
1296 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1297 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1298 return lp_build_lerp(bld, z, v0, v1, flags);
1299 }
1300
1301
1302 /**
1303 * Generate min(a, b)
1304 * Do checks for special cases but not for nans.
1305 */
1306 LLVMValueRef
1307 lp_build_min(struct lp_build_context *bld,
1308 LLVMValueRef a,
1309 LLVMValueRef b)
1310 {
1311 assert(lp_check_value(bld->type, a));
1312 assert(lp_check_value(bld->type, b));
1313
1314 if(a == bld->undef || b == bld->undef)
1315 return bld->undef;
1316
1317 if(a == b)
1318 return a;
1319
1320 if (bld->type.norm) {
1321 if (!bld->type.sign) {
1322 if (a == bld->zero || b == bld->zero) {
1323 return bld->zero;
1324 }
1325 }
1326 if(a == bld->one)
1327 return b;
1328 if(b == bld->one)
1329 return a;
1330 }
1331
1332 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1333 }
1334
1335
1336 /**
1337 * Generate min(a, b)
1338 * NaN's are handled according to the behavior specified by the
1339 * nan_behavior argument.
1340 */
1341 LLVMValueRef
1342 lp_build_min_ext(struct lp_build_context *bld,
1343 LLVMValueRef a,
1344 LLVMValueRef b,
1345 enum gallivm_nan_behavior nan_behavior)
1346 {
1347 assert(lp_check_value(bld->type, a));
1348 assert(lp_check_value(bld->type, b));
1349
1350 if(a == bld->undef || b == bld->undef)
1351 return bld->undef;
1352
1353 if(a == b)
1354 return a;
1355
1356 if (bld->type.norm) {
1357 if (!bld->type.sign) {
1358 if (a == bld->zero || b == bld->zero) {
1359 return bld->zero;
1360 }
1361 }
1362 if(a == bld->one)
1363 return b;
1364 if(b == bld->one)
1365 return a;
1366 }
1367
1368 return lp_build_min_simple(bld, a, b, nan_behavior);
1369 }
1370
1371 /**
1372 * Generate max(a, b)
1373 * Do checks for special cases, but NaN behavior is undefined.
1374 */
1375 LLVMValueRef
1376 lp_build_max(struct lp_build_context *bld,
1377 LLVMValueRef a,
1378 LLVMValueRef b)
1379 {
1380 assert(lp_check_value(bld->type, a));
1381 assert(lp_check_value(bld->type, b));
1382
1383 if(a == bld->undef || b == bld->undef)
1384 return bld->undef;
1385
1386 if(a == b)
1387 return a;
1388
1389 if(bld->type.norm) {
1390 if(a == bld->one || b == bld->one)
1391 return bld->one;
1392 if (!bld->type.sign) {
1393 if (a == bld->zero) {
1394 return b;
1395 }
1396 if (b == bld->zero) {
1397 return a;
1398 }
1399 }
1400 }
1401
1402 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1403 }
1404
1405
1406 /**
1407 * Generate max(a, b)
1408 * Checks for special cases.
1409 * NaN's are handled according to the behavior specified by the
1410 * nan_behavior argument.
1411 */
1412 LLVMValueRef
1413 lp_build_max_ext(struct lp_build_context *bld,
1414 LLVMValueRef a,
1415 LLVMValueRef b,
1416 enum gallivm_nan_behavior nan_behavior)
1417 {
1418 assert(lp_check_value(bld->type, a));
1419 assert(lp_check_value(bld->type, b));
1420
1421 if(a == bld->undef || b == bld->undef)
1422 return bld->undef;
1423
1424 if(a == b)
1425 return a;
1426
1427 if(bld->type.norm) {
1428 if(a == bld->one || b == bld->one)
1429 return bld->one;
1430 if (!bld->type.sign) {
1431 if (a == bld->zero) {
1432 return b;
1433 }
1434 if (b == bld->zero) {
1435 return a;
1436 }
1437 }
1438 }
1439
1440 return lp_build_max_simple(bld, a, b, nan_behavior);
1441 }
1442
1443 /**
1444 * Generate clamp(a, min, max)
1445 * NaN behavior (for any of a, min, max) is undefined.
1446 * Do checks for special cases.
1447 */
1448 LLVMValueRef
1449 lp_build_clamp(struct lp_build_context *bld,
1450 LLVMValueRef a,
1451 LLVMValueRef min,
1452 LLVMValueRef max)
1453 {
1454 assert(lp_check_value(bld->type, a));
1455 assert(lp_check_value(bld->type, min));
1456 assert(lp_check_value(bld->type, max));
1457
1458 a = lp_build_min(bld, a, max);
1459 a = lp_build_max(bld, a, min);
1460 return a;
1461 }
1462
1463
1464 /**
1465 * Generate clamp(a, 0, 1)
1466 * A NaN will get converted to zero.
1467 */
1468 LLVMValueRef
1469 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1470 LLVMValueRef a)
1471 {
1472 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1473 a = lp_build_min(bld, a, bld->one);
1474 return a;
1475 }
1476
1477
1478 /**
1479 * Generate abs(a)
1480 */
1481 LLVMValueRef
1482 lp_build_abs(struct lp_build_context *bld,
1483 LLVMValueRef a)
1484 {
1485 LLVMBuilderRef builder = bld->gallivm->builder;
1486 const struct lp_type type = bld->type;
1487 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1488
1489 assert(lp_check_value(type, a));
1490
1491 if(!type.sign)
1492 return a;
1493
1494 if(type.floating) {
1495 char intrinsic[32];
1496 util_snprintf(intrinsic, sizeof intrinsic, "llvm.fabs.v%uf%u", type.length, type.width);
1497 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1498 }
1499
1500 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1501 switch(type.width) {
1502 case 8:
1503 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1504 case 16:
1505 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1506 case 32:
1507 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1508 }
1509 }
1510 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1511 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1512 (type.width == 8 || type.width == 16 || type.width == 32)) {
1513 debug_printf("%s: inefficient code, should split vectors manually\n",
1514 __FUNCTION__);
1515 }
1516
1517 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1518 }
1519
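/*
 * Illustration only (not built): what the llvm.fabs intrinsic used in
 * lp_build_abs above computes, expressed as a scalar bit operation -- the
 * sign bit is simply cleared.
 */
#if 0
static inline float
ref_fabs(float a)
{
   union { float f; uint32_t i; } u;
   u.f = a;
   u.i &= 0x7fffffffu;   /* clear the sign bit */
   return u.f;
}
#endif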
1520
1521 LLVMValueRef
1522 lp_build_negate(struct lp_build_context *bld,
1523 LLVMValueRef a)
1524 {
1525 LLVMBuilderRef builder = bld->gallivm->builder;
1526
1527 assert(lp_check_value(bld->type, a));
1528
1529 if (bld->type.floating)
1530 a = LLVMBuildFNeg(builder, a, "");
1531 else
1532 a = LLVMBuildNeg(builder, a, "");
1533
1534 return a;
1535 }
1536
1537
1538 /** Return -1, 0 or +1 depending on the sign of a */
1539 LLVMValueRef
1540 lp_build_sgn(struct lp_build_context *bld,
1541 LLVMValueRef a)
1542 {
1543 LLVMBuilderRef builder = bld->gallivm->builder;
1544 const struct lp_type type = bld->type;
1545 LLVMValueRef cond;
1546 LLVMValueRef res;
1547
1548 assert(lp_check_value(type, a));
1549
1550 /* Handle non-zero case */
1551 if(!type.sign) {
1552 /* if not zero then sign must be positive */
1553 res = bld->one;
1554 }
1555 else if(type.floating) {
1556 LLVMTypeRef vec_type;
1557 LLVMTypeRef int_type;
1558 LLVMValueRef mask;
1559 LLVMValueRef sign;
1560 LLVMValueRef one;
1561 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1562
1563 int_type = lp_build_int_vec_type(bld->gallivm, type);
1564 vec_type = lp_build_vec_type(bld->gallivm, type);
1565 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1566
1567 /* Take the sign bit and add it to 1 constant */
1568 sign = LLVMBuildBitCast(builder, a, int_type, "");
1569 sign = LLVMBuildAnd(builder, sign, mask, "");
1570 one = LLVMConstBitCast(bld->one, int_type);
1571 res = LLVMBuildOr(builder, sign, one, "");
1572 res = LLVMBuildBitCast(builder, res, vec_type, "");
1573 }
1574 else
1575 {
1576 /* signed int/norm/fixed point */
1577 /* could use psign with sse3 and appropriate vectors here */
1578 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1579 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1580 res = lp_build_select(bld, cond, bld->one, minus_one);
1581 }
1582
1583 /* Handle zero */
1584 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1585 res = lp_build_select(bld, cond, bld->zero, res);
1586
1587 return res;
1588 }
1589
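/*
 * Illustration only (not built): scalar reference for the floating-point
 * path of lp_build_sgn above -- OR the sign bit of 'a' onto the constant
 * 1.0 to get +/-1.0, then force 0.0 for a == 0.
 */
#if 0
static inline float
ref_sgn(float a)
{
   union { float f; uint32_t i; } u, one;
   if (a == 0.0f)
      return 0.0f;
   u.f = a;
   one.f = 1.0f;
   one.i |= u.i & 0x80000000u;   /* copy the sign bit onto 1.0 */
   return one.f;
}
#endif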
1590
1591 /**
1592 * Set the sign of float vector 'a' according to 'sign'.
1593 * If sign==0, return abs(a).
1594 * If sign==1, return -abs(a);
1595 * Other values for sign produce undefined results.
1596 */
1597 LLVMValueRef
1598 lp_build_set_sign(struct lp_build_context *bld,
1599 LLVMValueRef a, LLVMValueRef sign)
1600 {
1601 LLVMBuilderRef builder = bld->gallivm->builder;
1602 const struct lp_type type = bld->type;
1603 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1604 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1605 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1606 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1607 ~((unsigned long long) 1 << (type.width - 1)));
1608 LLVMValueRef val, res;
1609
1610 assert(type.floating);
1611 assert(lp_check_value(type, a));
1612
1613 /* val = reinterpret_cast<int>(a) */
1614 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1615 /* val = val & mask */
1616 val = LLVMBuildAnd(builder, val, mask, "");
1617 /* sign = sign << shift */
1618 sign = LLVMBuildShl(builder, sign, shift, "");
1619 /* res = val | sign */
1620 res = LLVMBuildOr(builder, val, sign, "");
1621 /* res = reinterpret_cast<float>(res) */
1622 res = LLVMBuildBitCast(builder, res, vec_type, "");
1623
1624 return res;
1625 }
1626
1627
1628 /**
1629 * Convert vector of (or scalar) int to vector of (or scalar) float.
1630 */
1631 LLVMValueRef
1632 lp_build_int_to_float(struct lp_build_context *bld,
1633 LLVMValueRef a)
1634 {
1635 LLVMBuilderRef builder = bld->gallivm->builder;
1636 const struct lp_type type = bld->type;
1637 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1638
1639 assert(type.floating);
1640
1641 return LLVMBuildSIToFP(builder, a, vec_type, "");
1642 }
1643
1644 static boolean
1645 arch_rounding_available(const struct lp_type type)
1646 {
1647 if ((util_cpu_caps.has_sse4_1 &&
1648 (type.length == 1 || type.width*type.length == 128)) ||
1649 (util_cpu_caps.has_avx && type.width*type.length == 256))
1650 return TRUE;
1651 else if ((util_cpu_caps.has_altivec &&
1652 (type.width == 32 && type.length == 4)))
1653 return TRUE;
1654
1655 return FALSE;
1656 }
1657
1658 enum lp_build_round_mode
1659 {
1660 LP_BUILD_ROUND_NEAREST = 0,
1661 LP_BUILD_ROUND_FLOOR = 1,
1662 LP_BUILD_ROUND_CEIL = 2,
1663 LP_BUILD_ROUND_TRUNCATE = 3
1664 };
1665
1666 /**
1667 * Helper for SSE4.1's ROUNDxx instructions.
1668 *
1669 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1670 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1671 */
1672 static inline LLVMValueRef
1673 lp_build_nearest_sse41(struct lp_build_context *bld,
1674 LLVMValueRef a)
1675 {
1676 LLVMBuilderRef builder = bld->gallivm->builder;
1677 const struct lp_type type = bld->type;
1678 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1679 LLVMValueRef mode = LLVMConstNull(i32t);
1680 const char *intrinsic;
1681 LLVMValueRef res;
1682
1683 assert(type.floating);
1684
1685 assert(lp_check_value(type, a));
1686 assert(util_cpu_caps.has_sse4_1);
1687
1688 if (type.length == 1) {
1689 LLVMTypeRef vec_type;
1690 LLVMValueRef undef;
1691 LLVMValueRef args[3];
1692 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1693
1694 switch(type.width) {
1695 case 32:
1696 intrinsic = "llvm.x86.sse41.round.ss";
1697 break;
1698 case 64:
1699 intrinsic = "llvm.x86.sse41.round.sd";
1700 break;
1701 default:
1702 assert(0);
1703 return bld->undef;
1704 }
1705
1706 vec_type = LLVMVectorType(bld->elem_type, 4);
1707
1708 undef = LLVMGetUndef(vec_type);
1709
1710 args[0] = undef;
1711 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1712 args[2] = mode;
1713
1714 res = lp_build_intrinsic(builder, intrinsic,
1715 vec_type, args, Elements(args), 0);
1716
1717 res = LLVMBuildExtractElement(builder, res, index0, "");
1718 }
1719 else {
1720 if (type.width * type.length == 128) {
1721 switch(type.width) {
1722 case 32:
1723 intrinsic = "llvm.x86.sse41.round.ps";
1724 break;
1725 case 64:
1726 intrinsic = "llvm.x86.sse41.round.pd";
1727 break;
1728 default:
1729 assert(0);
1730 return bld->undef;
1731 }
1732 }
1733 else {
1734 assert(type.width * type.length == 256);
1735 assert(util_cpu_caps.has_avx);
1736
1737 switch(type.width) {
1738 case 32:
1739 intrinsic = "llvm.x86.avx.round.ps.256";
1740 break;
1741 case 64:
1742 intrinsic = "llvm.x86.avx.round.pd.256";
1743 break;
1744 default:
1745 assert(0);
1746 return bld->undef;
1747 }
1748 }
1749
1750 res = lp_build_intrinsic_binary(builder, intrinsic,
1751 bld->vec_type, a,
1752 mode);
1753 }
1754
1755 return res;
1756 }
1757
1758
1759 static inline LLVMValueRef
1760 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1761 LLVMValueRef a)
1762 {
1763 LLVMBuilderRef builder = bld->gallivm->builder;
1764 const struct lp_type type = bld->type;
1765 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1766 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1767 const char *intrinsic;
1768 LLVMValueRef res;
1769
1770 assert(type.floating);
1771 /* using the double precision conversions is a bit more complicated */
1772 assert(type.width == 32);
1773
1774 assert(lp_check_value(type, a));
1775 assert(util_cpu_caps.has_sse2);
1776
1777 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1778 if (type.length == 1) {
1779 LLVMTypeRef vec_type;
1780 LLVMValueRef undef;
1781 LLVMValueRef arg;
1782 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1783
1784 vec_type = LLVMVectorType(bld->elem_type, 4);
1785
1786 intrinsic = "llvm.x86.sse.cvtss2si";
1787
1788 undef = LLVMGetUndef(vec_type);
1789
1790 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1791
1792 res = lp_build_intrinsic_unary(builder, intrinsic,
1793 ret_type, arg);
1794 }
1795 else {
1796 if (type.width* type.length == 128) {
1797 intrinsic = "llvm.x86.sse2.cvtps2dq";
1798 }
1799 else {
1800 assert(type.width*type.length == 256);
1801 assert(util_cpu_caps.has_avx);
1802
1803 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1804 }
1805 res = lp_build_intrinsic_unary(builder, intrinsic,
1806 ret_type, a);
1807 }
1808
1809 return res;
1810 }
1811
1812
1813 /*
1814 */
1815 static inline LLVMValueRef
1816 lp_build_round_altivec(struct lp_build_context *bld,
1817 LLVMValueRef a,
1818 enum lp_build_round_mode mode)
1819 {
1820 LLVMBuilderRef builder = bld->gallivm->builder;
1821 const struct lp_type type = bld->type;
1822 const char *intrinsic = NULL;
1823
1824 assert(type.floating);
1825
1826 assert(lp_check_value(type, a));
1827 assert(util_cpu_caps.has_altivec);
1828
1829 (void)type;
1830
1831 switch (mode) {
1832 case LP_BUILD_ROUND_NEAREST:
1833 intrinsic = "llvm.ppc.altivec.vrfin";
1834 break;
1835 case LP_BUILD_ROUND_FLOOR:
1836 intrinsic = "llvm.ppc.altivec.vrfim";
1837 break;
1838 case LP_BUILD_ROUND_CEIL:
1839 intrinsic = "llvm.ppc.altivec.vrfip";
1840 break;
1841 case LP_BUILD_ROUND_TRUNCATE:
1842 intrinsic = "llvm.ppc.altivec.vrfiz";
1843 break;
1844 }
1845
1846 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1847 }
1848
1849 static inline LLVMValueRef
1850 lp_build_round_arch(struct lp_build_context *bld,
1851 LLVMValueRef a,
1852 enum lp_build_round_mode mode)
1853 {
1854 if (util_cpu_caps.has_sse4_1) {
1855 LLVMBuilderRef builder = bld->gallivm->builder;
1856 const struct lp_type type = bld->type;
1857 const char *intrinsic_root;
1858 char intrinsic[32];
1859
1860 assert(type.floating);
1861 assert(lp_check_value(type, a));
1862 (void)type;
1863
1864 switch (mode) {
1865 case LP_BUILD_ROUND_NEAREST:
1866 if (HAVE_LLVM >= 0x0304) {
1867 intrinsic_root = "llvm.round";
1868 } else {
1869 return lp_build_nearest_sse41(bld, a);
1870 }
1871 break;
1872 case LP_BUILD_ROUND_FLOOR:
1873 intrinsic_root = "llvm.floor";
1874 break;
1875 case LP_BUILD_ROUND_CEIL:
1876 intrinsic_root = "llvm.ceil";
1877 break;
1878 case LP_BUILD_ROUND_TRUNCATE:
1879 intrinsic_root = "llvm.trunc";
1880 break;
1881 }
1882
1883 util_snprintf(intrinsic, sizeof intrinsic, "%s.v%uf%u",
1884 intrinsic_root, type.length, type.width);
1885
1886 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1887 }
1888 else /* (util_cpu_caps.has_altivec) */
1889 return lp_build_round_altivec(bld, a, mode);
1890 }
1891
1892 /**
1893 * Return the integer part of a float (vector) value (== round toward zero).
1894 * The returned value is a float (vector).
1895 * Ex: trunc(-1.5) = -1.0
1896 */
1897 LLVMValueRef
1898 lp_build_trunc(struct lp_build_context *bld,
1899 LLVMValueRef a)
1900 {
1901 LLVMBuilderRef builder = bld->gallivm->builder;
1902 const struct lp_type type = bld->type;
1903
1904 assert(type.floating);
1905 assert(lp_check_value(type, a));
1906
1907 if (arch_rounding_available(type)) {
1908 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1909 }
1910 else {
1911 const struct lp_type type = bld->type;
1912 struct lp_type inttype;
1913 struct lp_build_context intbld;
1914 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1915 LLVMValueRef trunc, res, anosign, mask;
1916 LLVMTypeRef int_vec_type = bld->int_vec_type;
1917 LLVMTypeRef vec_type = bld->vec_type;
1918
1919 assert(type.width == 32); /* might want to handle doubles at some point */
1920
1921 inttype = type;
1922 inttype.floating = 0;
1923 lp_build_context_init(&intbld, bld->gallivm, inttype);
1924
1925 /* round by truncation */
1926 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1927 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1928
1929 /* mask out sign bit */
1930 anosign = lp_build_abs(bld, a);
1931 /*
1932 * mask out all values if anosign > 2^24
1933 * This should work both for large ints (where all rounding is a no-op
1934 * because such floats are always exact) and for special cases like
1935 * NaNs, Infs (taking advantage of the fact they use the max exponent).
1936 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1937 */
1938 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1939 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1940 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1941 return lp_build_select(bld, mask, a, res);
1942 }
1943 }
1944
1945
1946 /**
1947 * Return float (vector) rounded to nearest integer (vector). The returned
1948 * value is a float (vector).
1949 * Ex: round(0.9) = 1.0
1950 * Ex: round(-1.5) = -2.0
1951 */
1952 LLVMValueRef
1953 lp_build_round(struct lp_build_context *bld,
1954 LLVMValueRef a)
1955 {
1956 LLVMBuilderRef builder = bld->gallivm->builder;
1957 const struct lp_type type = bld->type;
1958
1959 assert(type.floating);
1960 assert(lp_check_value(type, a));
1961
1962 if (arch_rounding_available(type)) {
1963 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1964 }
1965 else {
1966 const struct lp_type type = bld->type;
1967 struct lp_type inttype;
1968 struct lp_build_context intbld;
1969 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1970 LLVMValueRef res, anosign, mask;
1971 LLVMTypeRef int_vec_type = bld->int_vec_type;
1972 LLVMTypeRef vec_type = bld->vec_type;
1973
1974 assert(type.width == 32); /* might want to handle doubles at some point */
1975
1976 inttype = type;
1977 inttype.floating = 0;
1978 lp_build_context_init(&intbld, bld->gallivm, inttype);
1979
1980 res = lp_build_iround(bld, a);
1981 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1982
1983 /* mask out sign bit */
1984 anosign = lp_build_abs(bld, a);
1985 /*
1986 * mask out all values if anosign > 2^24
1987 * This should work both for large ints (where all rounding is a no-op
1988 * because such floats are always exact) and for special cases like
1989 * NaNs, Infs (taking advantage of the fact they use the max exponent).
1990 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1991 */
1992 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1993 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1994 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1995 return lp_build_select(bld, mask, a, res);
1996 }
1997 }
1998
1999
2000 /**
2001 * Return floor of float (vector), result is a float (vector)
2002 * Ex: floor(1.1) = 1.0
2003 * Ex: floor(-1.1) = -2.0
2004 */
2005 LLVMValueRef
2006 lp_build_floor(struct lp_build_context *bld,
2007 LLVMValueRef a)
2008 {
2009 LLVMBuilderRef builder = bld->gallivm->builder;
2010 const struct lp_type type = bld->type;
2011
2012 assert(type.floating);
2013 assert(lp_check_value(type, a));
2014
2015 if (arch_rounding_available(type)) {
2016 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2017 }
2018 else {
2019 const struct lp_type type = bld->type;
2020 struct lp_type inttype;
2021 struct lp_build_context intbld;
2022 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2023 LLVMValueRef trunc, res, anosign, mask;
2024 LLVMTypeRef int_vec_type = bld->int_vec_type;
2025 LLVMTypeRef vec_type = bld->vec_type;
2026
2027 if (type.width != 32) {
2028 char intrinsic[32];
2029 util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
2030 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2031 }
2032
2033 assert(type.width == 32); /* might want to handle doubles at some point */
2034
2035 inttype = type;
2036 inttype.floating = 0;
2037 lp_build_context_init(&intbld, bld->gallivm, inttype);
2038
2039 /* round by truncation */
2040 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2041 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2042
2043 if (type.sign) {
2044 LLVMValueRef tmp;
2045
2046 /*
2047 * fix values if rounding is wrong (for non-special cases)
2048 * - this is the case if trunc > a
2049 */
2050 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2051 /* tmp = trunc > a ? 1.0 : 0.0 */
2052 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2053 tmp = lp_build_and(&intbld, mask, tmp);
2054 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2055 res = lp_build_sub(bld, res, tmp);
2056 }
2057
2058 /* mask out sign bit */
2059 anosign = lp_build_abs(bld, a);
2060 /*
2061 * mask out all values if anosign > 2^24
2062 * This should work both for large ints (where all rounding is a no-op
2063 * because such floats are always exact) and for special cases like
2064 * NaNs, Infs (taking advantage of the fact they use the max exponent).
2065 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2066 */
2067 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2068 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2069 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2070 return lp_build_select(bld, mask, a, res);
2071 }
2072 }
2073
2074
2075 /**
2076 * Return ceiling of float (vector), returning float (vector).
2077 * Ex: ceil( 1.1) = 2.0
2078 * Ex: ceil(-1.1) = -1.0
2079 */
2080 LLVMValueRef
2081 lp_build_ceil(struct lp_build_context *bld,
2082 LLVMValueRef a)
2083 {
2084 LLVMBuilderRef builder = bld->gallivm->builder;
2085 const struct lp_type type = bld->type;
2086
2087 assert(type.floating);
2088 assert(lp_check_value(type, a));
2089
2090 if (arch_rounding_available(type)) {
2091 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2092 }
2093 else {
2094 const struct lp_type type = bld->type;
2095 struct lp_type inttype;
2096 struct lp_build_context intbld;
2097 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2098 LLVMValueRef trunc, res, anosign, mask, tmp;
2099 LLVMTypeRef int_vec_type = bld->int_vec_type;
2100 LLVMTypeRef vec_type = bld->vec_type;
2101
2102 if (type.width != 32) {
2103 char intrinsic[32];
2104 util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
2105 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2106 }
2107
2108 assert(type.width == 32); /* might want to handle doubles at some point */
2109
2110 inttype = type;
2111 inttype.floating = 0;
2112 lp_build_context_init(&intbld, bld->gallivm, inttype);
2113
2114 /* round by truncation */
2115 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2116 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2117
2118 /*
2119 * fix values if rounding is wrong (for non-special cases)
2120 * - this is the case if trunc < a
2121 */
2122 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2123 /* tmp = trunc < a ? 1.0 : 0.0 */
2124 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2125 tmp = lp_build_and(&intbld, mask, tmp);
2126 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2127 res = lp_build_add(bld, trunc, tmp);
2128
2129 /* mask out sign bit */
2130 anosign = lp_build_abs(bld, a);
2131 /*
2132 * mask out all values if anosign > 2^24
2133 * This should work both for large ints (where all rounding is a no-op
2134 * because such floats are always exact) and for special cases like
2135 * NaNs, Infs (taking advantage of the fact they use the max exponent).
2136 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2137 */
2138 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2139 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2140 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2141 return lp_build_select(bld, mask, a, res);
2142 }
2143 }
2144
2145
2146 /**
2147 * Return fractional part of 'a' computed as a - floor(a)
2148 * Typically used in texture coord arithmetic.
2149 */
2150 LLVMValueRef
2151 lp_build_fract(struct lp_build_context *bld,
2152 LLVMValueRef a)
2153 {
2154 assert(bld->type.floating);
2155 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2156 }
2157
2158
2159 /**
2160 * Prevent returning a fractional part of 1.0 for very small negative values of
2161 * 'a' by clamping against 0.99999(9).
2162 */
2163 static inline LLVMValueRef
2164 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2165 {
2166 LLVMValueRef max;
2167
2168 /* this is the largest number smaller than 1.0 representable as float */
2169 max = lp_build_const_vec(bld->gallivm, bld->type,
2170 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2171 return lp_build_min(bld, fract, max);
2172 }
2173
2174
2175 /**
2176 * Same as lp_build_fract, but guarantees that the result is always smaller
2177 * than one.
2178 */
2179 LLVMValueRef
2180 lp_build_fract_safe(struct lp_build_context *bld,
2181 LLVMValueRef a)
2182 {
2183 return clamp_fract(bld, lp_build_fract(bld, a));
2184 }
2185
2186
2187 /**
2188 * Return the integer part of a float (vector) value (== round toward zero).
2189 * The returned value is an integer (vector).
2190 * Ex: itrunc(-1.5) = -1
2191 */
2192 LLVMValueRef
2193 lp_build_itrunc(struct lp_build_context *bld,
2194 LLVMValueRef a)
2195 {
2196 LLVMBuilderRef builder = bld->gallivm->builder;
2197 const struct lp_type type = bld->type;
2198 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2199
2200 assert(type.floating);
2201 assert(lp_check_value(type, a));
2202
2203 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2204 }
2205
2206
2207 /**
2208 * Return float (vector) rounded to nearest integer (vector). The returned
2209 * value is an integer (vector).
2210 * Ex: iround(0.9) = 1
2211 * Ex: iround(-1.5) = -2
2212 */
2213 LLVMValueRef
2214 lp_build_iround(struct lp_build_context *bld,
2215 LLVMValueRef a)
2216 {
2217 LLVMBuilderRef builder = bld->gallivm->builder;
2218 const struct lp_type type = bld->type;
2219 LLVMTypeRef int_vec_type = bld->int_vec_type;
2220 LLVMValueRef res;
2221
2222 assert(type.floating);
2223
2224 assert(lp_check_value(type, a));
2225
2226 if ((util_cpu_caps.has_sse2 &&
2227 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2228 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2229 return lp_build_iround_nearest_sse2(bld, a);
2230 }
2231 if (arch_rounding_available(type)) {
2232 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2233 }
2234 else {
2235 LLVMValueRef half;
2236
2237 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2238
2239 if (type.sign) {
2240 LLVMTypeRef vec_type = bld->vec_type;
2241 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2242 (unsigned long long)1 << (type.width - 1));
2243 LLVMValueRef sign;
2244
2245 /* get sign bit */
2246 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2247 sign = LLVMBuildAnd(builder, sign, mask, "");
2248
2249 /* sign * 0.5 */
2250 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2251 half = LLVMBuildOr(builder, sign, half, "");
2252 half = LLVMBuildBitCast(builder, half, vec_type, "");
2253 }
2254
2255 res = LLVMBuildFAdd(builder, a, half, "");
2256 }
2257
2258 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2259
2260 return res;
2261 }
2262
2263
2264 /**
2265 * Return floor of float (vector), result is an int (vector)
2266 * Ex: ifloor(1.1) = 1
2267 * Ex: ifloor(-1.1) = -2
2268 */
2269 LLVMValueRef
2270 lp_build_ifloor(struct lp_build_context *bld,
2271 LLVMValueRef a)
2272 {
2273 LLVMBuilderRef builder = bld->gallivm->builder;
2274 const struct lp_type type = bld->type;
2275 LLVMTypeRef int_vec_type = bld->int_vec_type;
2276 LLVMValueRef res;
2277
2278 assert(type.floating);
2279 assert(lp_check_value(type, a));
2280
2281 res = a;
2282 if (type.sign) {
2283 if (arch_rounding_available(type)) {
2284 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2285 }
2286 else {
2287 struct lp_type inttype;
2288 struct lp_build_context intbld;
2289 LLVMValueRef trunc, itrunc, mask;
2290
2291 assert(type.floating);
2292 assert(lp_check_value(type, a));
2293
2294 inttype = type;
2295 inttype.floating = 0;
2296 lp_build_context_init(&intbld, bld->gallivm, inttype);
2297
2298 /* round by truncation */
2299 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2300 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2301
2302 /*
2303 * fix values if rounding is wrong (for non-special cases)
2304 * - this is the case if trunc > a
2305 * The results of doing this with NaNs, very large values etc.
2306 * are undefined but this seems to be the case anyway.
2307 */
2308 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2309 /* cheapie minus one with mask since the mask is minus one / zero */
2310 return lp_build_add(&intbld, itrunc, mask);
2311 }
2312 }
2313
2314 /* round to nearest (toward zero) */
2315 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2316
2317 return res;
2318 }
2319
2320
2321 /**
2322 * Return ceiling of float (vector), returning int (vector).
2323 * Ex: iceil( 1.1) = 2
2324 * Ex: iceil(-1.1) = -1
2325 */
2326 LLVMValueRef
2327 lp_build_iceil(struct lp_build_context *bld,
2328 LLVMValueRef a)
2329 {
2330 LLVMBuilderRef builder = bld->gallivm->builder;
2331 const struct lp_type type = bld->type;
2332 LLVMTypeRef int_vec_type = bld->int_vec_type;
2333 LLVMValueRef res;
2334
2335 assert(type.floating);
2336 assert(lp_check_value(type, a));
2337
2338 if (arch_rounding_available(type)) {
2339 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2340 }
2341 else {
2342 struct lp_type inttype;
2343 struct lp_build_context intbld;
2344 LLVMValueRef trunc, itrunc, mask;
2345
2346 assert(type.floating);
2347 assert(lp_check_value(type, a));
2348
2349 inttype = type;
2350 inttype.floating = 0;
2351 lp_build_context_init(&intbld, bld->gallivm, inttype);
2352
2353 /* round by truncation */
2354 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2355 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2356
2357 /*
2358 * fix values if rounding is wrong (for non-special cases)
2359 * - this is the case if trunc < a
2360 * The results of doing this with NaNs, very large values etc.
2361 * are undefined but this seems to be the case anyway.
2362 */
2363 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2364 /* cheapie plus one with mask since the mask is minus one / zero */
2365 return lp_build_sub(&intbld, itrunc, mask);
2366 }
2367
2368 /* round to nearest (toward zero) */
2369 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2370
2371 return res;
2372 }
2373
2374
2375 /**
2376 * Combined ifloor() & fract().
2377 *
2378 * Preferred to calling the functions separately, as it will ensure that the
2379 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2380 */
2381 void
2382 lp_build_ifloor_fract(struct lp_build_context *bld,
2383 LLVMValueRef a,
2384 LLVMValueRef *out_ipart,
2385 LLVMValueRef *out_fpart)
2386 {
2387 LLVMBuilderRef builder = bld->gallivm->builder;
2388 const struct lp_type type = bld->type;
2389 LLVMValueRef ipart;
2390
2391 assert(type.floating);
2392 assert(lp_check_value(type, a));
2393
2394 if (arch_rounding_available(type)) {
2395 /*
2396 * floor() is easier.
2397 */
2398
2399 ipart = lp_build_floor(bld, a);
2400 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2401 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2402 }
2403 else {
2404 /*
2405 * ifloor() is easier.
2406 */
2407
2408 *out_ipart = lp_build_ifloor(bld, a);
2409 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2410 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2411 }
2412 }
2413
2414
2415 /**
2416 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2417 * always smaller than one.
2418 */
2419 void
2420 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2421 LLVMValueRef a,
2422 LLVMValueRef *out_ipart,
2423 LLVMValueRef *out_fpart)
2424 {
2425 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2426 *out_fpart = clamp_fract(bld, *out_fpart);
2427 }
2428
2429
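/**
 * Generate sqrt(a) via the llvm.sqrt.* intrinsic.
 */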
2430 LLVMValueRef
2431 lp_build_sqrt(struct lp_build_context *bld,
2432 LLVMValueRef a)
2433 {
2434 LLVMBuilderRef builder = bld->gallivm->builder;
2435 const struct lp_type type = bld->type;
2436 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2437 char intrinsic[32];
2438
2439 assert(lp_check_value(type, a));
2440
2441 /* TODO: optimize the constant case */
2442
2443 assert(type.floating);
2444 if (type.length == 1) {
2445 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2446 }
2447 else {
2448 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2449 }
2450
2451 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2452 }
2453
2454
2455 /**
2456 * Do one Newton-Raphson step to improve reciprocal precision:
2457 *
2458 * x_{i+1} = x_i * (2 - a * x_i)
2459 *
2460 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2461 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2462 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2463 * halo. It would be necessary to clamp the argument to prevent this.
2464 *
2465 * See also:
2466 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2467 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2468 */
2469 static inline LLVMValueRef
2470 lp_build_rcp_refine(struct lp_build_context *bld,
2471 LLVMValueRef a,
2472 LLVMValueRef rcp_a)
2473 {
2474 LLVMBuilderRef builder = bld->gallivm->builder;
2475 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2476 LLVMValueRef res;
2477
2478 res = LLVMBuildFMul(builder, a, rcp_a, "");
2479 res = LLVMBuildFSub(builder, two, res, "");
2480 res = LLVMBuildFMul(builder, rcp_a, res, "");
2481
2482 return res;
2483 }
2484
2485
2486 LLVMValueRef
2487 lp_build_rcp(struct lp_build_context *bld,
2488 LLVMValueRef a)
2489 {
2490 LLVMBuilderRef builder = bld->gallivm->builder;
2491 const struct lp_type type = bld->type;
2492
2493 assert(lp_check_value(type, a));
2494
2495 if(a == bld->zero)
2496 return bld->undef;
2497 if(a == bld->one)
2498 return bld->one;
2499 if(a == bld->undef)
2500 return bld->undef;
2501
2502 assert(type.floating);
2503
2504 if(LLVMIsConstant(a))
2505 return LLVMConstFDiv(bld->one, a);
2506
2507 /*
2508 * We don't use RCPPS because:
2509 * - it only has 10 bits of precision
2510 * - it doesn't even get the reciprocal of 1.0 exactly
2511 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2512 * - for recent processors the benefit over DIVPS is marginal and case
2513 * dependent
2514 *
2515 * We could still use it on certain processors if benchmarks show that the
2516 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2517 * particular uses that require fewer workarounds.
2518 */
2519
2520 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2521 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2522 const unsigned num_iterations = 0;
2523 LLVMValueRef res;
2524 unsigned i;
2525 const char *intrinsic = NULL;
2526
2527 if (type.length == 4) {
2528 intrinsic = "llvm.x86.sse.rcp.ps";
2529 }
2530 else {
2531 intrinsic = "llvm.x86.avx.rcp.ps.256";
2532 }
2533
2534 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2535
2536 for (i = 0; i < num_iterations; ++i) {
2537 res = lp_build_rcp_refine(bld, a, res);
2538 }
2539
2540 return res;
2541 }
2542
2543 return LLVMBuildFDiv(builder, bld->one, a, "");
2544 }
2545
2546
2547 /**
2548 * Do one Newton-Raphson step to improve rsqrt precision:
2549 *
2550 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2551 *
2552 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2553 */
2554 static inline LLVMValueRef
2555 lp_build_rsqrt_refine(struct lp_build_context *bld,
2556 LLVMValueRef a,
2557 LLVMValueRef rsqrt_a)
2558 {
2559 LLVMBuilderRef builder = bld->gallivm->builder;
2560 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2561 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2562 LLVMValueRef res;
2563
2564 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2565 res = LLVMBuildFMul(builder, a, res, "");
2566 res = LLVMBuildFSub(builder, three, res, "");
2567 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2568 res = LLVMBuildFMul(builder, half, res, "");
2569
2570 return res;
2571 }
2572
2573
2574 /**
2575 * Generate 1/sqrt(a).
2576 * Result is undefined for values < 0, infinity for +0.
2577 */
2578 LLVMValueRef
2579 lp_build_rsqrt(struct lp_build_context *bld,
2580 LLVMValueRef a)
2581 {
2582 const struct lp_type type = bld->type;
2583
2584 assert(lp_check_value(type, a));
2585
2586 assert(type.floating);
2587
2588 /*
2589 * This should be faster but all denormals will end up as infinity.
2590 */
2591 if (0 && lp_build_fast_rsqrt_available(type)) {
2592 const unsigned num_iterations = 1;
2593 LLVMValueRef res;
2594 unsigned i;
2595
2596 /* rsqrt(1.0) != 1.0 here */
2597 res = lp_build_fast_rsqrt(bld, a);
2598
2599 if (num_iterations) {
2600 /*
2601 * Newton-Raphson will result in NaN instead of infinity for zero,
2602 * and NaN instead of zero for infinity.
2603 * Also, need to ensure rsqrt(1.0) == 1.0.
2604 * All numbers smaller than FLT_MIN will result in +infinity
2605 * (rsqrtps treats all denormals as zero).
2606 */
2607 LLVMValueRef cmp;
2608 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2609 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2610
2611 for (i = 0; i < num_iterations; ++i) {
2612 res = lp_build_rsqrt_refine(bld, a, res);
2613 }
2614 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2615 res = lp_build_select(bld, cmp, inf, res);
2616 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2617 res = lp_build_select(bld, cmp, bld->zero, res);
2618 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2619 res = lp_build_select(bld, cmp, bld->one, res);
2620 }
2621
2622 return res;
2623 }
2624
2625 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2626 }
2627
2628 /**
2629 * Returns whether a fast (inaccurate) rsqrt instruction is available.
2630 * Callers may want to avoid rsqrt_fast when it isn't available: e.g. x^0.5
2631 * can be computed as rsqrt_fast(x) * x, but without a native rsqrt that
2632 * would expand to sqrt/div/mul, so it is obviously better to just call
2633 * sqrt, skipping both the div and the mul.
2634 */
2635 boolean
2636 lp_build_fast_rsqrt_available(struct lp_type type)
2637 {
2638 assert(type.floating);
2639
2640 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2641 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2642 return true;
2643 }
2644 return false;
2645 }
2646
2647
2648 /**
2649 * Generate 1/sqrt(a).
2650 * Result is undefined for values < 0, infinity for +0.
2651 * Precision is limited, only ~10 bits guaranteed
2652 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2653 */
2654 LLVMValueRef
2655 lp_build_fast_rsqrt(struct lp_build_context *bld,
2656 LLVMValueRef a)
2657 {
2658 LLVMBuilderRef builder = bld->gallivm->builder;
2659 const struct lp_type type = bld->type;
2660
2661 assert(lp_check_value(type, a));
2662
2663 if (lp_build_fast_rsqrt_available(type)) {
2664 const char *intrinsic = NULL;
2665
2666 if (type.length == 4) {
2667 intrinsic = "llvm.x86.sse.rsqrt.ps";
2668 }
2669 else {
2670 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2671 }
2672 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2673 }
2674 else {
2675 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2676 }
2677 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2678 }
2679
2680
2681 /**
2682 * Generate sin(a) or cos(a) using polynomial approximation.
2683 * TODO: it might be worth recognizing sin and cos with the same source
2684 * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2685 * much cheaper than calculating (nearly) everything twice.
2686 * Not sure it's common enough to be worth bothering with, however; the
2687 * scs opcode could also benefit from calculating both.
2688 */
2689 static LLVMValueRef
2690 lp_build_sin_or_cos(struct lp_build_context *bld,
2691 LLVMValueRef a,
2692 boolean cos)
2693 {
2694 struct gallivm_state *gallivm = bld->gallivm;
2695 LLVMBuilderRef b = gallivm->builder;
2696 struct lp_type int_type = lp_int_type(bld->type);
2697
2698 /*
2699 * take the absolute value,
2700 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2701 */
2702
2703 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2704 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2705
2706 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2707 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2708
2709 /*
2710 * scale by 4/Pi
2711 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2712 */
2713
2714 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2715 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2716
2717 /*
2718 * store the integer part of y in mm0
2719 * emm2 = _mm_cvttps_epi32(y);
2720 */
2721
2722 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2723
2724 /*
2725 * j=(j+1) & (~1) (see the cephes sources)
2726 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2727 */
2728
2729 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2730 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2731 /*
2732 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2733 */
2734 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2735 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2736
2737 /*
2738 * y = _mm_cvtepi32_ps(emm2);
2739 */
2740 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2741
2742 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2743 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2744 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2745 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2746
2747 /*
2748 * Argument used for poly selection and sign bit determination
2749 * is different for sin vs. cos.
2750 */
2751 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2752 emm2_and;
2753
2754 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2755 LLVMBuildNot(b, emm2_2, ""), ""),
2756 const_29, "sign_bit") :
2757 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2758 LLVMBuildShl(b, emm2_add,
2759 const_29, ""), ""),
2760 sign_mask, "sign_bit");
2761
2762 /*
2763 * get the polynomial selection mask
2764 * there is one polynomial for 0 <= x <= Pi/4
2765 * and another one for Pi/4 < x <= Pi/2
2766 * Both branches will be computed.
2767 *
2768 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2769 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2770 */
2771
2772 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2773 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2774 int_type, PIPE_FUNC_EQUAL,
2775 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2776
2777 /*
2778 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2779 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2780 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2781 */
2782 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2783 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2784 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2785
2786 /*
2787 * The magic pass: "Extended precision modular arithmetic"
2788 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2789 * xmm1 = _mm_mul_ps(y, xmm1);
2790 * xmm2 = _mm_mul_ps(y, xmm2);
2791 * xmm3 = _mm_mul_ps(y, xmm3);
2792 */
2793 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2794 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2795 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2796
2797 /*
2798 * x = _mm_add_ps(x, xmm1);
2799 * x = _mm_add_ps(x, xmm2);
2800 * x = _mm_add_ps(x, xmm3);
2801 */
2802
2803 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2804 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2805 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2806
2807 /*
2808 * Evaluate the first polynomial (0 <= x <= Pi/4)
2809 *
2810 * z = _mm_mul_ps(x,x);
2811 */
2812 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2813
2814 /*
2815 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2816 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2817 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2818 */
2819 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2820 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2821 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2822
2823 /*
2824 * y = *(v4sf*)_ps_coscof_p0;
2825 * y = _mm_mul_ps(y, z);
2826 */
2827 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2828 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2829 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2830 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2831 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2832 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2833
2834
2835 /*
2836 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2837 * y = _mm_sub_ps(y, tmp);
2838 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2839 */
2840 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2841 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2842 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2843 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2844 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2845
2846 /*
2847 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2848 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2849 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2850 */
2851 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2852 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2853 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2854
2855 /*
2856 * Evaluate the second polynomial (Pi/4 <= x <= 0)
2857 *
2858 * y2 = *(v4sf*)_ps_sincof_p0;
2859 * y2 = _mm_mul_ps(y2, z);
2860 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2861 * y2 = _mm_mul_ps(y2, z);
2862 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2863 * y2 = _mm_mul_ps(y2, z);
2864 * y2 = _mm_mul_ps(y2, x);
2865 * y2 = _mm_add_ps(y2, x);
2866 */
2867
2868 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2869 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2870 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2871 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2872 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2873 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2874 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2875
2876 /*
2877 * select the correct result from the two polynomials
2878 * xmm3 = poly_mask;
2879 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2880 * y = _mm_andnot_ps(xmm3, y);
2881 * y = _mm_or_ps(y,y2);
2882 */
2883 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2884 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2885 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2886 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2887 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2888 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2889
2890 /*
2891 * update the sign
2892 * y = _mm_xor_ps(y, sign_bit);
2893 */
2894 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2895 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2896
2897 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2898
2899 /* clamp output to be within [-1, 1] */
2900 y_result = lp_build_clamp(bld, y_result,
2901 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2902 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2903 /* If a is -inf, inf or NaN then return NaN */
2904 y_result = lp_build_select(bld, isfinite, y_result,
2905 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2906 return y_result;
2907 }
2908
2909
2910 /**
2911 * Generate sin(a)
2912 */
2913 LLVMValueRef
2914 lp_build_sin(struct lp_build_context *bld,
2915 LLVMValueRef a)
2916 {
2917 return lp_build_sin_or_cos(bld, a, FALSE);
2918 }
2919
2920
2921 /**
2922 * Generate cos(a)
2923 */
2924 LLVMValueRef
2925 lp_build_cos(struct lp_build_context *bld,
2926 LLVMValueRef a)
2927 {
2928 return lp_build_sin_or_cos(bld, a, TRUE);
2929 }
2930
2931
2932 /**
2933 * Generate pow(x, y)
2934 */
2935 LLVMValueRef
2936 lp_build_pow(struct lp_build_context *bld,
2937 LLVMValueRef x,
2938 LLVMValueRef y)
2939 {
2940 /* TODO: optimize the constant case */
2941 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2942 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2943 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2944 __FUNCTION__);
2945 }
2946
2947 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2948 }
2949
2950
2951 /**
2952 * Generate exp(x)
2953 */
2954 LLVMValueRef
2955 lp_build_exp(struct lp_build_context *bld,
2956 LLVMValueRef x)
2957 {
2958 /* log2(e) = 1/log(2) */
2959 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2960 1.4426950408889634);
2961
2962 assert(lp_check_value(bld->type, x));
2963
2964 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2965 }
2966
2967
2968 /**
2969 * Generate log(x)
2970 * Behavior is undefined with infs, 0s and nans
2971 */
2972 LLVMValueRef
2973 lp_build_log(struct lp_build_context *bld,
2974 LLVMValueRef x)
2975 {
2976 /* log(2) */
2977 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2978 0.69314718055994529);
2979
2980 assert(lp_check_value(bld->type, x));
2981
2982 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2983 }
2984
2985 /**
2986 * Generate log(x) that handles edge cases (infs, 0s and nans)
2987 */
2988 LLVMValueRef
2989 lp_build_log_safe(struct lp_build_context *bld,
2990 LLVMValueRef x)
2991 {
2992 /* log(2) */
2993 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2994 0.69314718055994529);
2995
2996 assert(lp_check_value(bld->type, x));
2997
2998 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2999 }
3000
3001
3002 /**
3003 * Generate polynomial.
3004 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3005 */
3006 LLVMValueRef
3007 lp_build_polynomial(struct lp_build_context *bld,
3008 LLVMValueRef x,
3009 const double *coeffs,
3010 unsigned num_coeffs)
3011 {
3012 const struct lp_type type = bld->type;
3013 LLVMValueRef even = NULL, odd = NULL;
3014 LLVMValueRef x2;
3015 unsigned i;
3016
3017 assert(lp_check_value(bld->type, x));
3018
3019 /* TODO: optimize the constant case */
3020 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3021 LLVMIsConstant(x)) {
3022 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3023 __FUNCTION__);
3024 }
3025
3026 /*
3027 * Calculate odd and even terms separately to decrease data dependency
3028 * Ex:
3029 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3030 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3031 */
3032 x2 = lp_build_mul(bld, x, x);
3033
3034 for (i = num_coeffs; i--; ) {
3035 LLVMValueRef coeff;
3036
3037 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3038
3039 if (i % 2 == 0) {
3040 if (even)
3041 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
3042 else
3043 even = coeff;
3044 } else {
3045 if (odd)
3046 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
3047 else
3048 odd = coeff;
3049 }
3050 }
3051
3052 if (odd)
3053 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
3054 else if (even)
3055 return even;
3056 else
3057 return bld->undef;
3058 }
3059
3060
3061 /**
3062 * Minimax polynomial fit of 2**x, in range [0, 1[
3063 */
3064 const double lp_build_exp2_polynomial[] = {
3065 #if EXP_POLY_DEGREE == 5
3066 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3067 0.693153073200168932794,
3068 0.240153617044375388211,
3069 0.0558263180532956664775,
3070 0.00898934009049466391101,
3071 0.00187757667519147912699
3072 #elif EXP_POLY_DEGREE == 4
3073 1.00000259337069434683,
3074 0.693003834469974940458,
3075 0.24144275689150793076,
3076 0.0520114606103070150235,
3077 0.0135341679161270268764
3078 #elif EXP_POLY_DEGREE == 3
3079 0.999925218562710312959,
3080 0.695833540494823811697,
3081 0.226067155427249155588,
3082 0.0780245226406372992967
3083 #elif EXP_POLY_DEGREE == 2
3084 1.00172476321474503578,
3085 0.657636275736077639316,
3086 0.33718943461968720704
3087 #else
3088 #error
3089 #endif
3090 };
3091
3092
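/**
 * Generate 2^x.
 * Clamps x to [-126.99999, 128] (preserving NaN), splits it into integer and
 * fractional parts, constructs 2^ipart by shifting the biased exponent into
 * place, and evaluates the minimax polynomial above on the fractional part.
 */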
3093 LLVMValueRef
3094 lp_build_exp2(struct lp_build_context *bld,
3095 LLVMValueRef x)
3096 {
3097 LLVMBuilderRef builder = bld->gallivm->builder;
3098 const struct lp_type type = bld->type;
3099 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3100 LLVMValueRef ipart = NULL;
3101 LLVMValueRef fpart = NULL;
3102 LLVMValueRef expipart = NULL;
3103 LLVMValueRef expfpart = NULL;
3104 LLVMValueRef res = NULL;
3105
3106 assert(lp_check_value(bld->type, x));
3107
3108 /* TODO: optimize the constant case */
3109 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3110 LLVMIsConstant(x)) {
3111 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3112 __FUNCTION__);
3113 }
3114
3115 assert(type.floating && type.width == 32);
3116
3117 /* We want to preserve NaN and make sure that for exp2, if x > 128 the
3118 * result is INF, and if it's smaller than -126.9 the result is 0. */
3119 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3120 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3121 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3122 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3123
3124 /* ipart = floor(x) */
3125 /* fpart = x - ipart */
3126 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3127
3128 /* expipart = (float) (1 << ipart) */
3129 expipart = LLVMBuildAdd(builder, ipart,
3130 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3131 expipart = LLVMBuildShl(builder, expipart,
3132 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3133 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3134
3135 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3136 Elements(lp_build_exp2_polynomial));
3137
3138 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3139
3140 return res;
3141 }
3142
3143
3144
3145 /**
3146 * Extract the exponent of an IEEE-754 floating point value.
3147 *
3148 * Optionally apply an integer bias.
3149 *
3150 * Result is an integer value with
3151 *
3152 * ifloor(log2(x)) + bias
3153 */
3154 LLVMValueRef
3155 lp_build_extract_exponent(struct lp_build_context *bld,
3156 LLVMValueRef x,
3157 int bias)
3158 {
3159 LLVMBuilderRef builder = bld->gallivm->builder;
3160 const struct lp_type type = bld->type;
3161 unsigned mantissa = lp_mantissa(type);
3162 LLVMValueRef res;
3163
3164 assert(type.floating);
3165
3166 assert(lp_check_value(bld->type, x));
3167
3168 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3169
3170 res = LLVMBuildLShr(builder, x,
3171 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3172 res = LLVMBuildAnd(builder, res,
3173 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3174 res = LLVMBuildSub(builder, res,
3175 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3176
3177 return res;
3178 }
3179
3180
3181 /**
3182 * Extract the mantissa of an IEEE-754 floating point value.
3183 *
3184 * Result is a floating point value with
3185 *
3186 * x / 2**floor(log2(x))
3187 */
3188 LLVMValueRef
3189 lp_build_extract_mantissa(struct lp_build_context *bld,
3190 LLVMValueRef x)
3191 {
3192 LLVMBuilderRef builder = bld->gallivm->builder;
3193 const struct lp_type type = bld->type;
3194 unsigned mantissa = lp_mantissa(type);
3195 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3196 (1ULL << mantissa) - 1);
3197 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3198 LLVMValueRef res;
3199
3200 assert(lp_check_value(bld->type, x));
3201
3202 assert(type.floating);
3203
3204 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3205
3206 /* res = x / 2**ipart */
3207 res = LLVMBuildAnd(builder, x, mantmask, "");
3208 res = LLVMBuildOr(builder, res, one, "");
3209 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3210
3211 return res;
3212 }
3213
3214
3215
3216 /**
3217 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3218 * These coefficients can be generated with
3219 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3220 */
3221 const double lp_build_log2_polynomial[] = {
3222 #if LOG_POLY_DEGREE == 5
3223 2.88539008148777786488L,
3224 0.961796878841293367824L,
3225 0.577058946784739859012L,
3226 0.412914355135828735411L,
3227 0.308591899232910175289L,
3228 0.352376952300281371868L,
3229 #elif LOG_POLY_DEGREE == 4
3230 2.88539009343309178325L,
3231 0.961791550404184197881L,
3232 0.577440339438736392009L,
3233 0.403343858251329912514L,
3234 0.406718052498846252698L,
3235 #elif LOG_POLY_DEGREE == 3
3236 2.88538959748872753838L,
3237 0.961932915889597772928L,
3238 0.571118517972136195241L,
3239 0.493997535084709500285L,
3240 #else
3241 #error
3242 #endif
3243 };
3244
3245 /**
3246 * See http://www.devmaster.net/forums/showthread.php?p=43580
3247 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3248 * http://www.nezumi.demon.co.uk/consult/logx.htm
3249 *
3250 * If handle_edge_cases is true the function will perform computations
3251 * to match the required D3D10+ behavior for each of the edge cases.
3252 * That means that if input is:
3253 * - less than zero (down to and including -inf) then NaN will be returned
3254 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3255 * - +infinity, then +infinity will be returned
3256 * - NaN, then NaN will be returned
3257 *
3258 * Those checks are fairly expensive so if you don't need them make sure
3259 * handle_edge_cases is false.
3260 */
3261 void
3262 lp_build_log2_approx(struct lp_build_context *bld,
3263 LLVMValueRef x,
3264 LLVMValueRef *p_exp,
3265 LLVMValueRef *p_floor_log2,
3266 LLVMValueRef *p_log2,
3267 boolean handle_edge_cases)
3268 {
3269 LLVMBuilderRef builder = bld->gallivm->builder;
3270 const struct lp_type type = bld->type;
3271 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3272 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3273
3274 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3275 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3276 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3277
3278 LLVMValueRef i = NULL;
3279 LLVMValueRef y = NULL;
3280 LLVMValueRef z = NULL;
3281 LLVMValueRef exp = NULL;
3282 LLVMValueRef mant = NULL;
3283 LLVMValueRef logexp = NULL;
3284 LLVMValueRef logmant = NULL;
3285 LLVMValueRef res = NULL;
3286
3287 assert(lp_check_value(bld->type, x));
3288
3289 if(p_exp || p_floor_log2 || p_log2) {
3290 /* TODO: optimize the constant case */
3291 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3292 LLVMIsConstant(x)) {
3293 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3294 __FUNCTION__);
3295 }
3296
3297 assert(type.floating && type.width == 32);
3298
3299 /*
3300 * We don't explicitly handle denormalized numbers. They will yield a
3301 * result in the neighbourhood of -127, which appears to be
3302 * adequate.
3303 */
3304
3305 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3306
3307 /* exp = (float) exponent(x) */
3308 exp = LLVMBuildAnd(builder, i, expmask, "");
3309 }
3310
3311 if(p_floor_log2 || p_log2) {
3312 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3313 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3314 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3315 }
3316
3317 if (p_log2) {
3318 /* mant = 1 + (float) mantissa(x) */
3319 mant = LLVMBuildAnd(builder, i, mantmask, "");
3320 mant = LLVMBuildOr(builder, mant, one, "");
3321 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3322
3323 /* y = (mant - 1) / (mant + 1) */
3324 y = lp_build_div(bld,
3325 lp_build_sub(bld, mant, bld->one),
3326 lp_build_add(bld, mant, bld->one)
3327 );
3328
3329 /* z = y^2 */
3330 z = lp_build_mul(bld, y, y);
3331
3332 /* compute P(z) */
3333 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3334 Elements(lp_build_log2_polynomial));
3335
3336 /* logmant = y * P(z) */
3337 logmant = lp_build_mul(bld, y, logmant);
3338
3339 res = lp_build_add(bld, logmant, logexp);
3340
3341 if (type.floating && handle_edge_cases) {
3342 LLVMValueRef negmask, infmask, zmask;
3343 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3344 lp_build_const_vec(bld->gallivm, type, 0.0f));
3345 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3346 lp_build_const_vec(bld->gallivm, type, 0.0f));
3347 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3348 lp_build_const_vec(bld->gallivm, type, INFINITY));
3349
3350 /* If x is equal to inf make sure we return inf */
3351 res = lp_build_select(bld, infmask,
3352 lp_build_const_vec(bld->gallivm, type, INFINITY),
3353 res);
3354 /* If x is equal to 0, return -inf */
3355 res = lp_build_select(bld, zmask,
3356 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3357 res);
3358 /* If x is NaN or less than 0, return NaN */
3359 res = lp_build_select(bld, negmask,
3360 lp_build_const_vec(bld->gallivm, type, NAN),
3361 res);
3362 }
3363 }
3364
3365 if (p_exp) {
3366 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3367 *p_exp = exp;
3368 }
3369
3370 if (p_floor_log2)
3371 *p_floor_log2 = logexp;
3372
3373 if (p_log2)
3374 *p_log2 = res;
3375 }
3376
3377
3378 /*
3379 * log2 implementation which doesn't have special code to
3380 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3381 * the results for those cases are undefined.
3382 */
3383 LLVMValueRef
3384 lp_build_log2(struct lp_build_context *bld,
3385 LLVMValueRef x)
3386 {
3387 LLVMValueRef res;
3388 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3389 return res;
3390 }
3391
3392 /*
3393 * Version of log2 which handles all edge cases.
3394 * Look at documentation of lp_build_log2_approx for
3395 * description of the behavior for each of the edge cases.
3396 */
3397 LLVMValueRef
3398 lp_build_log2_safe(struct lp_build_context *bld,
3399 LLVMValueRef x)
3400 {
3401 LLVMValueRef res;
3402 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3403 return res;
3404 }
3405
3406
3407 /**
3408 * Faster (and less accurate) log2.
3409 *
3410 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3411 *
3412 * Piece-wise linear approximation, with exact results when x is a
3413 * power of two.
3414 *
3415 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3416 */
3417 LLVMValueRef
3418 lp_build_fast_log2(struct lp_build_context *bld,
3419 LLVMValueRef x)
3420 {
3421 LLVMBuilderRef builder = bld->gallivm->builder;
3422 LLVMValueRef ipart;
3423 LLVMValueRef fpart;
3424
3425 assert(lp_check_value(bld->type, x));
3426
3427 assert(bld->type.floating);
3428
3429 /* ipart = floor(log2(x)) - 1 */
3430 ipart = lp_build_extract_exponent(bld, x, -1);
3431 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3432
3433 /* fpart = x / 2**ipart */
3434 fpart = lp_build_extract_mantissa(bld, x);
3435
3436 /* ipart + fpart */
3437 return LLVMBuildFAdd(builder, ipart, fpart, "");
3438 }
3439
3440
3441 /**
3442 * Fast implementation of iround(log2(x)).
3443 *
3444 * Not an approximation -- it should give accurate results all the time.
3445 */
3446 LLVMValueRef
3447 lp_build_ilog2(struct lp_build_context *bld,
3448 LLVMValueRef x)
3449 {
3450 LLVMBuilderRef builder = bld->gallivm->builder;
3451 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3452 LLVMValueRef ipart;
3453
3454 assert(bld->type.floating);
3455
3456 assert(lp_check_value(bld->type, x));
3457
3458 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3459 x = LLVMBuildFMul(builder, x, sqrt2, "");
3460
3461 /* ipart = floor(log2(x) + 0.5) */
3462 ipart = lp_build_extract_exponent(bld, x, 0);
3463
3464 return ipart;
3465 }
3466
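/**
 * Generate x mod y, using FRem for floating point types and SRem/URem for
 * signed/unsigned integer types respectively.
 */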
3467 LLVMValueRef
3468 lp_build_mod(struct lp_build_context *bld,
3469 LLVMValueRef x,
3470 LLVMValueRef y)
3471 {
3472 LLVMBuilderRef builder = bld->gallivm->builder;
3473 LLVMValueRef res;
3474 const struct lp_type type = bld->type;
3475
3476 assert(lp_check_value(type, x));
3477 assert(lp_check_value(type, y));
3478
3479 if (type.floating)
3480 res = LLVMBuildFRem(builder, x, y, "");
3481 else if (type.sign)
3482 res = LLVMBuildSRem(builder, x, y, "");
3483 else
3484 res = LLVMBuildURem(builder, x, y, "");
3485 return res;
3486 }
3487
3488
3489 /*
3490 * For floating inputs it creates and returns a mask
3491 * which is all 1's for channels which are NaN.
3492 * Channels inside x which are not NaN will be 0.
3493 */
3494 LLVMValueRef
3495 lp_build_isnan(struct lp_build_context *bld,
3496 LLVMValueRef x)
3497 {
3498 LLVMValueRef mask;
3499 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3500
3501 assert(bld->type.floating);
3502 assert(lp_check_value(bld->type, x));
3503
3504 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3505 "isnotnan");
3506 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3507 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3508 return mask;
3509 }
3510
3511 /* Returns all 1's for floating point numbers that are
3512 * finite, and returns all zeros for -inf,
3513 * inf and NaNs */
3514 LLVMValueRef
3515 lp_build_isfinite(struct lp_build_context *bld,
3516 LLVMValueRef x)
3517 {
3518 LLVMBuilderRef builder = bld->gallivm->builder;
3519 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3520 struct lp_type int_type = lp_int_type(bld->type);
3521 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3522 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3523 0x7f800000);
3524
3525 if (!bld->type.floating) {
3526 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3527 }
3528 assert(bld->type.floating);
3529 assert(lp_check_value(bld->type, x));
3530 assert(bld->type.width == 32);
3531
3532 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3533 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3534 intx, infornan32);
3535 }
3536
3537 /*
3538 * Returns true if the number is nan or inf and false otherwise.
3539 * The input has to be a floating point vector.
3540 */
3541 LLVMValueRef
3542 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3543 const struct lp_type type,
3544 LLVMValueRef x)
3545 {
3546 LLVMBuilderRef builder = gallivm->builder;
3547 struct lp_type int_type = lp_int_type(type);
3548 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3549 0x7f800000);
3550 LLVMValueRef ret;
3551
3552 assert(type.floating);
3553
3554 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3555 ret = LLVMBuildAnd(builder, ret, const0, "");
3556 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3557 ret, const0);
3558
3559 return ret;
3560 }
3561
3562
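/**
 * Save the current x86 FP state (MXCSR) to memory with stmxcsr and return a
 * pointer to it, or 0 when SSE is not available.
 */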
3563 LLVMValueRef
3564 lp_build_fpstate_get(struct gallivm_state *gallivm)
3565 {
3566 if (util_cpu_caps.has_sse) {
3567 LLVMBuilderRef builder = gallivm->builder;
3568 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3569 gallivm,
3570 LLVMInt32TypeInContext(gallivm->context),
3571 "mxcsr_ptr");
3572 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3573 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3574 lp_build_intrinsic(builder,
3575 "llvm.x86.sse.stmxcsr",
3576 LLVMVoidTypeInContext(gallivm->context),
3577 &mxcsr_ptr8, 1, 0);
3578 return mxcsr_ptr;
3579 }
3580 return 0;
3581 }
3582
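/**
 * Enable or disable flush-to-zero (and denormals-are-zero where the CPU
 * supports it) by updating the saved MXCSR value and reloading it.
 */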
3583 void
3584 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3585 boolean zero)
3586 {
3587 if (util_cpu_caps.has_sse) {
3588 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3589 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3590
3591 LLVMBuilderRef builder = gallivm->builder;
3592 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3593 LLVMValueRef mxcsr =
3594 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3595
3596 if (util_cpu_caps.has_daz) {
3597 /* Enable denormals are zero mode */
3598 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3599 }
3600 if (zero) {
3601 mxcsr = LLVMBuildOr(builder, mxcsr,
3602 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3603 } else {
3604 mxcsr = LLVMBuildAnd(builder, mxcsr,
3605 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3606 }
3607
3608 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3609 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3610 }
3611 }
3612
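/**
 * Restore a previously saved x86 FP state (MXCSR) with ldmxcsr.
 */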
3613 void
3614 lp_build_fpstate_set(struct gallivm_state *gallivm,
3615 LLVMValueRef mxcsr_ptr)
3616 {
3617 if (util_cpu_caps.has_sse) {
3618 LLVMBuilderRef builder = gallivm->builder;
3619 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3620 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3621 lp_build_intrinsic(builder,
3622 "llvm.x86.sse.ldmxcsr",
3623 LLVMVoidTypeInContext(gallivm->context),
3624 &mxcsr_ptr, 1, 0);
3625 }
3626 }