gallivm: add LLVMAttribute parameter to lp_build_intrinsic
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86 * No checks for the special-case values a or b = 1 or 0 are done.
87 * NaNs are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if(intrinsic) {
198 /* We need to handle NaNs for floating point numbers. If one of the
199 * inputs is NaN the other should be returned (required by both D3D10+
200 * and OpenCL).
201 * The SSE intrinsics return the second operand in case of NaN by
202 * default, so we need special code to handle those cases.
203 */
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
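
/*
 * Illustrative scalar model (hypothetical helper, not part of the build) of
 * why the NaN fixup above is needed: SSE minps/maxps behave like
 * (a < b) ? a : b and (a > b) ? a : b, so whenever either input is NaN the
 * comparison is false and the second operand 'b' is returned.
 */
#if 0
static float
sse_min_model(float a, float b)
{
   return (a < b) ? a : b;   /* NaN in either input yields b */
}
#endif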
263
264
265 /**
266 * Generate max(a, b)
267 * No checks for the special-case values a or b = 1 or 0 are done.
268 * NaNs are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323 if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if(intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, max;
385 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, max);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, max);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
476
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if(intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 if(type.norm && !type.floating && !type.fixed) {
526 if (type.sign) {
527 uint64_t sign = (uint64_t)1 << (type.width - 1);
528 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
529 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
532 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
534 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
535 } else {
536 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
537 }
538 }
539
540 if(LLVMIsConstant(a) && LLVMIsConstant(b))
541 if (type.floating)
542 res = LLVMConstFAdd(a, b);
543 else
544 res = LLVMConstAdd(a, b);
545 else
546 if (type.floating)
547 res = LLVMBuildFAdd(builder, a, b, "");
548 else
549 res = LLVMBuildAdd(builder, a, b, "");
550
551 /* clamp to ceiling of 1.0 */
552 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
553 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
554
555 /* XXX clamp to floor of -1 or 0??? */
556
557 return res;
558 }
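
/*
 * Illustrative scalar model of the signed saturating-add clamping above
 * (hypothetical helper, not part of the build), shown for 8-bit values held
 * in a wider integer: clamp 'a' so that a + b cannot overflow, then add.
 */
#if 0
static int
add_sat_s8_ref(int a, int b)
{
   if (b > 0 && a > 127 - b)          /* a_clamp_max for positive b */
      a = 127 - b;
   else if (b < 0 && a < -128 - b)    /* a_clamp_min for negative b */
      a = -128 - b;
   return a + b;                      /* result stays within [-128, 127] */
}
#endif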
559
560
561 /** Return the scalar sum of the elements of a.
562 * Should avoid this operation whenever possible.
563 */
564 LLVMValueRef
565 lp_build_horizontal_add(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 LLVMBuilderRef builder = bld->gallivm->builder;
569 const struct lp_type type = bld->type;
570 LLVMValueRef index, res;
571 unsigned i, length;
572 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
573 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
574 LLVMValueRef vecres, elem2;
575
576 assert(lp_check_value(type, a));
577
578 if (type.length == 1) {
579 return a;
580 }
581
582 assert(!bld->type.norm);
583
584 /*
585 * for byte vectors we could do much better with psadbw.
586 * Using repeated shuffle/adds here. Note with multiple vectors
587 * this can be done more efficiently as outlined in the intel
588 * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
590 * sizes.
591 */
592
593 vecres = a;
594 length = type.length / 2;
595 while (length > 1) {
596 LLVMValueRef vec1, vec2;
597 for (i = 0; i < length; i++) {
598 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
599 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
600 }
601 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
602 LLVMConstVector(shuffles1, length), "");
603 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
604 LLVMConstVector(shuffles2, length), "");
605 if (type.floating) {
606 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
607 }
608 else {
609 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
610 }
611 length = length >> 1;
612 }
613
614 /* always have vector of size 2 here */
615 assert(length == 1);
616
617 index = lp_build_const_int32(bld->gallivm, 0);
618 res = LLVMBuildExtractElement(builder, vecres, index, "");
619 index = lp_build_const_int32(bld->gallivm, 1);
620 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
621
622 if (type.floating)
623 res = LLVMBuildFAdd(builder, res, elem2, "");
624 else
625 res = LLVMBuildAdd(builder, res, elem2, "");
626
627 return res;
628 }
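
/*
 * Illustrative scalar model of the reduction above (hypothetical helper, not
 * part of the build): repeatedly fold the upper half onto the lower half
 * until two elements remain, then add those, e.g. 8 -> 4 -> 2 -> scalar.
 */
#if 0
static float
horizontal_add_ref(float v[], unsigned length)   /* length: power of two, >= 2 */
{
   unsigned half, i;
   for (half = length / 2; half > 1; half /= 2) {
      for (i = 0; i < half; i++)
         v[i] += v[i + half];         /* mirrors one shuffle + add step */
   }
   return v[0] + v[1];                /* final extract + scalar add */
}
#endif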
629
630 /**
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632 * This uses the technique as outlined in Intel Optimization Manual.
633 */
634 static LLVMValueRef
635 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
636 LLVMValueRef src[4])
637 {
638 struct gallivm_state *gallivm = bld->gallivm;
639 LLVMBuilderRef builder = gallivm->builder;
640 LLVMValueRef shuffles[4];
641 LLVMValueRef tmp[4];
642 LLVMValueRef sumtmp[2], shuftmp[2];
643
644 /* lower half of regs */
645 shuffles[0] = lp_build_const_int32(gallivm, 0);
646 shuffles[1] = lp_build_const_int32(gallivm, 1);
647 shuffles[2] = lp_build_const_int32(gallivm, 4);
648 shuffles[3] = lp_build_const_int32(gallivm, 5);
649 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
650 LLVMConstVector(shuffles, 4), "");
651 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
652 LLVMConstVector(shuffles, 4), "");
653
654 /* upper half of regs */
655 shuffles[0] = lp_build_const_int32(gallivm, 2);
656 shuffles[1] = lp_build_const_int32(gallivm, 3);
657 shuffles[2] = lp_build_const_int32(gallivm, 6);
658 shuffles[3] = lp_build_const_int32(gallivm, 7);
659 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
660 LLVMConstVector(shuffles, 4), "");
661 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
662 LLVMConstVector(shuffles, 4), "");
663
664 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
665 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 0);
668 shuffles[1] = lp_build_const_int32(gallivm, 2);
669 shuffles[2] = lp_build_const_int32(gallivm, 4);
670 shuffles[3] = lp_build_const_int32(gallivm, 6);
671 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 shuffles[0] = lp_build_const_int32(gallivm, 1);
675 shuffles[1] = lp_build_const_int32(gallivm, 3);
676 shuffles[2] = lp_build_const_int32(gallivm, 5);
677 shuffles[3] = lp_build_const_int32(gallivm, 7);
678 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
679 LLVMConstVector(shuffles, 4), "");
680
681 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
682 }
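
/*
 * Illustrative scalar reference for the shuffled sums computed above
 * (hypothetical helper, not part of the build): result[i] is the sum of the
 * four elements of src[i].
 */
#if 0
static void
horizontal_add4x4f_ref(const float src[4][4], float result[4])
{
   unsigned i;
   for (i = 0; i < 4; i++)
      result[i] = src[i][0] + src[i][1] + src[i][2] + src[i][3];
}
#endif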
683
684
685 /*
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
689 * output order.
690 *
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
697 */
698 LLVMValueRef
699 lp_build_hadd_partial4(struct lp_build_context *bld,
700 LLVMValueRef vectors[],
701 unsigned num_vecs)
702 {
703 struct gallivm_state *gallivm = bld->gallivm;
704 LLVMBuilderRef builder = gallivm->builder;
705 LLVMValueRef ret_vec;
706 LLVMValueRef tmp[4];
707 const char *intrinsic = NULL;
708
709 assert(num_vecs >= 2 && num_vecs <= 4);
710 assert(bld->type.floating);
711
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
715 */
716
717 tmp[0] = vectors[0];
718 tmp[1] = vectors[1];
719
720 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
721 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
722
723 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
724 bld->type.length == 4) {
725 intrinsic = "llvm.x86.sse3.hadd.ps";
726 }
727 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
728 bld->type.length == 8) {
729 intrinsic = "llvm.x86.avx.hadd.ps.256";
730 }
731 if (intrinsic) {
732 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 if (num_vecs > 2) {
736 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[2], tmp[3]);
739 }
740 else {
741 tmp[1] = tmp[0];
742 }
743 return lp_build_intrinsic_binary(builder, intrinsic,
744 lp_build_vec_type(gallivm, bld->type),
745 tmp[0], tmp[1]);
746 }
747
748 if (bld->type.length == 4) {
749 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
750 }
751 else {
752 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
753 unsigned j;
754 unsigned num_iter = bld->type.length / 4;
755 struct lp_type parttype = bld->type;
756 parttype.length = 4;
757 for (j = 0; j < num_iter; j++) {
758 LLVMValueRef partsrc[4];
759 unsigned i;
760 for (i = 0; i < 4; i++) {
761 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
762 }
763 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
764 }
765 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
766 }
767 return ret_vec;
768 }
769
770 /**
771 * Generate a - b
772 */
773 LLVMValueRef
774 lp_build_sub(struct lp_build_context *bld,
775 LLVMValueRef a,
776 LLVMValueRef b)
777 {
778 LLVMBuilderRef builder = bld->gallivm->builder;
779 const struct lp_type type = bld->type;
780 LLVMValueRef res;
781
782 assert(lp_check_value(type, a));
783 assert(lp_check_value(type, b));
784
785 if(b == bld->zero)
786 return a;
787 if(a == bld->undef || b == bld->undef)
788 return bld->undef;
789 if(a == b)
790 return bld->zero;
791
792 if(bld->type.norm) {
793 const char *intrinsic = NULL;
794
795 if(b == bld->one)
796 return bld->zero;
797
798 if (type.width * type.length == 128 &&
799 !type.floating && !type.fixed) {
800 if (util_cpu_caps.has_sse2) {
801 if(type.width == 8)
802 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
803 if(type.width == 16)
804 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps.has_altivec) {
806 if(type.width == 8)
807 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
808 if(type.width == 16)
809 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
810 }
811 }
812
813 if(intrinsic)
814 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
815 }
816
817 if(type.norm && !type.floating && !type.fixed) {
818 if (type.sign) {
819 uint64_t sign = (uint64_t)1 << (type.width - 1);
820 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
821 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
825 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
826 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
827 } else {
828 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
829 }
830 }
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b))
833 if (type.floating)
834 res = LLVMConstFSub(a, b);
835 else
836 res = LLVMConstSub(a, b);
837 else
838 if (type.floating)
839 res = LLVMBuildFSub(builder, a, b, "");
840 else
841 res = LLVMBuildSub(builder, a, b, "");
842
843 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
844 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
845
846 return res;
847 }
848
849
850
851 /**
852 * Normalized multiplication.
853 *
854 * There are several approaches for (using 8-bit normalized multiplication as
855 * an example):
856 *
857 * - alpha plus one
858 *
859 * makes the following approximation to the division (Sree)
860 *
861 * a*b/255 ~= (a*(b + 1)) >> 8
862 *
863 * which is the fastest method that satisfies the following OpenGL criteria:
864 *
865 * 0*0 = 0 and 255*255 = 255
866 *
867 * - geometric series
868 *
869 * takes the geometric series approximation to the division
870 *
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
872 *
873 * in this case just the first two terms to fit in 16bit arithmetic
874 *
875 * t/255 ~= (t + (t >> 8)) >> 8
876 *
877 * note that just by itself it doesn't satisfy the OpenGL criteria, as
878 * 255*255 yields 254, so the special case b = 255 must be accounted for
879 * or roundoff must be used.
880 *
881 * - geometric series plus rounding
882 *
883 * when using a geometric series division instead of truncating the result,
884 * use roundoff in the approximation (Jim Blinn)
885 *
886 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
887 *
888 * achieving exact results.
889 *
890 *
891 *
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
896 */
897 static LLVMValueRef
898 lp_build_mul_norm(struct gallivm_state *gallivm,
899 struct lp_type wide_type,
900 LLVMValueRef a, LLVMValueRef b)
901 {
902 LLVMBuilderRef builder = gallivm->builder;
903 struct lp_build_context bld;
904 unsigned n;
905 LLVMValueRef half;
906 LLVMValueRef ab;
907
908 assert(!wide_type.floating);
909 assert(lp_check_value(wide_type, a));
910 assert(lp_check_value(wide_type, b));
911
912 lp_build_context_init(&bld, gallivm, wide_type);
913
914 n = wide_type.width / 2;
915 if (wide_type.sign) {
916 --n;
917 }
918
919 /*
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
922 */
923
924 /*
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
926 */
927
928 ab = LLVMBuildMul(builder, a, b, "");
929 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
930
931 /*
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
933 */
934
935 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
936 if (wide_type.sign) {
937 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
938 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
939 half = lp_build_select(&bld, sign, minus_half, half);
940 }
941 ab = LLVMBuildAdd(builder, ab, half, "");
942
943 /* Final division */
944 ab = lp_build_shr_imm(&bld, ab, n);
945
946 return ab;
947 }
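
/*
 * Illustrative scalar reference for the rounded 8-bit case described in the
 * comment above lp_build_mul_norm (Jim Blinn's rounding variant); hypothetical
 * helper, not part of the build.
 */
#if 0
static unsigned char
mul_unorm8_ref(unsigned char a, unsigned char b)
{
   unsigned t = (unsigned)a * b + 0x80;            /* product plus 0.5 rounding bias */
   return (unsigned char)((t + (t >> 8)) >> 8);    /* exact a*b/255 for all inputs */
}
#endif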
948
949 /**
950 * Generate a * b
951 */
952 LLVMValueRef
953 lp_build_mul(struct lp_build_context *bld,
954 LLVMValueRef a,
955 LLVMValueRef b)
956 {
957 LLVMBuilderRef builder = bld->gallivm->builder;
958 const struct lp_type type = bld->type;
959 LLVMValueRef shift;
960 LLVMValueRef res;
961
962 assert(lp_check_value(type, a));
963 assert(lp_check_value(type, b));
964
965 if(a == bld->zero)
966 return bld->zero;
967 if(a == bld->one)
968 return b;
969 if(b == bld->zero)
970 return bld->zero;
971 if(b == bld->one)
972 return a;
973 if(a == bld->undef || b == bld->undef)
974 return bld->undef;
975
976 if (!type.floating && !type.fixed && type.norm) {
977 struct lp_type wide_type = lp_wider_type(type);
978 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
979
980 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
981 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
982
983 /* PMULLW, PSRLW, PADDW */
984 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
985 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
986
987 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
988
989 return ab;
990 }
991
992 if(type.fixed)
993 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
994 else
995 shift = NULL;
996
997 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
998 if (type.floating)
999 res = LLVMConstFMul(a, b);
1000 else
1001 res = LLVMConstMul(a, b);
1002 if(shift) {
1003 if(type.sign)
1004 res = LLVMConstAShr(res, shift);
1005 else
1006 res = LLVMConstLShr(res, shift);
1007 }
1008 }
1009 else {
1010 if (type.floating)
1011 res = LLVMBuildFMul(builder, a, b, "");
1012 else
1013 res = LLVMBuildMul(builder, a, b, "");
1014 if(shift) {
1015 if(type.sign)
1016 res = LLVMBuildAShr(builder, res, shift, "");
1017 else
1018 res = LLVMBuildLShr(builder, res, shift, "");
1019 }
1020 }
1021
1022 return res;
1023 }
1024
1025
1026 /**
1027 * Small vector x scale multiplication optimization.
1028 */
1029 LLVMValueRef
1030 lp_build_mul_imm(struct lp_build_context *bld,
1031 LLVMValueRef a,
1032 int b)
1033 {
1034 LLVMBuilderRef builder = bld->gallivm->builder;
1035 LLVMValueRef factor;
1036
1037 assert(lp_check_value(bld->type, a));
1038
1039 if(b == 0)
1040 return bld->zero;
1041
1042 if(b == 1)
1043 return a;
1044
1045 if(b == -1)
1046 return lp_build_negate(bld, a);
1047
1048 if(b == 2 && bld->type.floating)
1049 return lp_build_add(bld, a, a);
1050
1051 if(util_is_power_of_two(b)) {
1052 unsigned shift = ffs(b) - 1;
1053
1054 if(bld->type.floating) {
1055 #if 0
1056 /*
1057 * Power of two multiplication by directly manipulating the exponent.
1058 *
1059 * XXX: This might not be always faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1061 * for Inf and NaN.
1062 */
1063 unsigned mantissa = lp_mantissa(bld->type);
1064 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1065 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1066 a = LLVMBuildAdd(builder, a, factor, "");
1067 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1068 return a;
1069 #endif
1070 }
1071 else {
1072 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1073 return LLVMBuildShl(builder, a, factor, "");
1074 }
1075 }
1076
1077 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1078 return lp_build_mul(bld, a, factor);
1079 }
1080
1081
1082 /**
1083 * Generate a / b
1084 */
1085 LLVMValueRef
1086 lp_build_div(struct lp_build_context *bld,
1087 LLVMValueRef a,
1088 LLVMValueRef b)
1089 {
1090 LLVMBuilderRef builder = bld->gallivm->builder;
1091 const struct lp_type type = bld->type;
1092
1093 assert(lp_check_value(type, a));
1094 assert(lp_check_value(type, b));
1095
1096 if(a == bld->zero)
1097 return bld->zero;
1098 if(a == bld->one && type.floating)
1099 return lp_build_rcp(bld, b);
1100 if(b == bld->zero)
1101 return bld->undef;
1102 if(b == bld->one)
1103 return a;
1104 if(a == bld->undef || b == bld->undef)
1105 return bld->undef;
1106
1107 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1108 if (type.floating)
1109 return LLVMConstFDiv(a, b);
1110 else if (type.sign)
1111 return LLVMConstSDiv(a, b);
1112 else
1113 return LLVMConstUDiv(a, b);
1114 }
1115
1116 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1117 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1118 type.floating)
1119 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1120
1121 if (type.floating)
1122 return LLVMBuildFDiv(builder, a, b, "");
1123 else if (type.sign)
1124 return LLVMBuildSDiv(builder, a, b, "");
1125 else
1126 return LLVMBuildUDiv(builder, a, b, "");
1127 }
1128
1129
1130 /**
1131 * Linear interpolation helper.
1132 *
1133 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1134 * interpolating normalized values encoded in integers twice as wide.
1135 *
1136 * @sa http://www.stereopsis.com/doubleblend.html
1137 */
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context *bld,
1140 LLVMValueRef x,
1141 LLVMValueRef v0,
1142 LLVMValueRef v1,
1143 unsigned flags)
1144 {
1145 unsigned half_width = bld->type.width/2;
1146 LLVMBuilderRef builder = bld->gallivm->builder;
1147 LLVMValueRef delta;
1148 LLVMValueRef res;
1149
1150 assert(lp_check_value(bld->type, x));
1151 assert(lp_check_value(bld->type, v0));
1152 assert(lp_check_value(bld->type, v1));
1153
1154 delta = lp_build_sub(bld, v1, v0);
1155
1156 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1157 if (!bld->type.sign) {
1158 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1159 /*
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161 * most-significant-bit to the lowest-significant-bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1163 */
1164
1165 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1166 }
1167
1168 /* (x * delta) >> n */
1169 res = lp_build_mul(bld, x, delta);
1170 res = lp_build_shr_imm(bld, res, half_width);
1171 } else {
1172 /*
1173 * The rescaling trick above doesn't work for signed numbers, so
1174 * use the 2**n - 1 division approximation in lp_build_mul_norm
1175 * instead.
1176 */
1177 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1178 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1179 }
1180 } else {
1181 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1182 res = lp_build_mul(bld, x, delta);
1183 }
1184
1185 res = lp_build_add(bld, v0, res);
1186
1187 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1188 bld->type.fixed) {
1189 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1190 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1191 * but it will be wrong for true fixed point use cases. Basically we need
1192 * a more powerful lp_type, capable of further distinguishing the values
1193 * interpretation from the value storage. */
1194 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1195 }
1196
1197 return res;
1198 }
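
/*
 * Illustrative scalar model of the unsigned LP_BLD_LERP_WIDE_NORMALIZED path
 * above (hypothetical helper, not part of the build): 8-bit values and weight
 * held in 16-bit lanes, with the weight rescaled from [0, 255] to [0, 256] so
 * the division becomes a shift; the final mask discards wrap-around bits, as
 * in the code above.
 */
#if 0
static unsigned
lerp_unorm8_ref(unsigned x, unsigned v0, unsigned v1)   /* all in [0, 255] */
{
   unsigned short delta = (unsigned short)(v1 - v0);    /* wraps like a 16-bit lane */
   unsigned prod;
   x = x + (x >> 7);                                    /* [0, 255] -> [0, 256] */
   prod = (unsigned short)(x * delta);                  /* 16-bit lane multiply */
   return (v0 + (prod >> 8)) & 0xff;                    /* mask off wrap-around bits */
}
#endif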
1199
1200
1201 /**
1202 * Linear interpolation.
1203 */
1204 LLVMValueRef
1205 lp_build_lerp(struct lp_build_context *bld,
1206 LLVMValueRef x,
1207 LLVMValueRef v0,
1208 LLVMValueRef v1,
1209 unsigned flags)
1210 {
1211 const struct lp_type type = bld->type;
1212 LLVMValueRef res;
1213
1214 assert(lp_check_value(type, x));
1215 assert(lp_check_value(type, v0));
1216 assert(lp_check_value(type, v1));
1217
1218 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1219
1220 if (type.norm) {
1221 struct lp_type wide_type;
1222 struct lp_build_context wide_bld;
1223 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1224
1225 assert(type.length >= 2);
1226
1227 /*
1228 * Create a wider integer type, enough to hold the
1229 * intermediate result of the multiplication.
1230 */
1231 memset(&wide_type, 0, sizeof wide_type);
1232 wide_type.sign = type.sign;
1233 wide_type.width = type.width*2;
1234 wide_type.length = type.length/2;
1235
1236 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1237
1238 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1239 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1240 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1241
1242 /*
1243 * Lerp both halves.
1244 */
1245
1246 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1247
1248 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1249 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1250
1251 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1252 } else {
1253 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1254 }
1255
1256 return res;
1257 }
1258
1259
1260 /**
1261 * Bilinear interpolation.
1262 *
1263 * Value indices are in v_{yx}.
1264 */
1265 LLVMValueRef
1266 lp_build_lerp_2d(struct lp_build_context *bld,
1267 LLVMValueRef x,
1268 LLVMValueRef y,
1269 LLVMValueRef v00,
1270 LLVMValueRef v01,
1271 LLVMValueRef v10,
1272 LLVMValueRef v11,
1273 unsigned flags)
1274 {
1275 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1276 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1277 return lp_build_lerp(bld, y, v0, v1, flags);
1278 }
1279
1280
1281 LLVMValueRef
1282 lp_build_lerp_3d(struct lp_build_context *bld,
1283 LLVMValueRef x,
1284 LLVMValueRef y,
1285 LLVMValueRef z,
1286 LLVMValueRef v000,
1287 LLVMValueRef v001,
1288 LLVMValueRef v010,
1289 LLVMValueRef v011,
1290 LLVMValueRef v100,
1291 LLVMValueRef v101,
1292 LLVMValueRef v110,
1293 LLVMValueRef v111,
1294 unsigned flags)
1295 {
1296 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1297 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1298 return lp_build_lerp(bld, z, v0, v1, flags);
1299 }
1300
1301
1302 /**
1303 * Generate min(a, b)
1304 * Do checks for special cases, but not for NaNs.
1305 */
1306 LLVMValueRef
1307 lp_build_min(struct lp_build_context *bld,
1308 LLVMValueRef a,
1309 LLVMValueRef b)
1310 {
1311 assert(lp_check_value(bld->type, a));
1312 assert(lp_check_value(bld->type, b));
1313
1314 if(a == bld->undef || b == bld->undef)
1315 return bld->undef;
1316
1317 if(a == b)
1318 return a;
1319
1320 if (bld->type.norm) {
1321 if (!bld->type.sign) {
1322 if (a == bld->zero || b == bld->zero) {
1323 return bld->zero;
1324 }
1325 }
1326 if(a == bld->one)
1327 return b;
1328 if(b == bld->one)
1329 return a;
1330 }
1331
1332 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1333 }
1334
1335
1336 /**
1337 * Generate min(a, b)
1338 * NaNs are handled according to the behavior specified by the
1339 * nan_behavior argument.
1340 */
1341 LLVMValueRef
1342 lp_build_min_ext(struct lp_build_context *bld,
1343 LLVMValueRef a,
1344 LLVMValueRef b,
1345 enum gallivm_nan_behavior nan_behavior)
1346 {
1347 assert(lp_check_value(bld->type, a));
1348 assert(lp_check_value(bld->type, b));
1349
1350 if(a == bld->undef || b == bld->undef)
1351 return bld->undef;
1352
1353 if(a == b)
1354 return a;
1355
1356 if (bld->type.norm) {
1357 if (!bld->type.sign) {
1358 if (a == bld->zero || b == bld->zero) {
1359 return bld->zero;
1360 }
1361 }
1362 if(a == bld->one)
1363 return b;
1364 if(b == bld->one)
1365 return a;
1366 }
1367
1368 return lp_build_min_simple(bld, a, b, nan_behavior);
1369 }
1370
1371 /**
1372 * Generate max(a, b)
1373 * Do checks for special cases, but NaN behavior is undefined.
1374 */
1375 LLVMValueRef
1376 lp_build_max(struct lp_build_context *bld,
1377 LLVMValueRef a,
1378 LLVMValueRef b)
1379 {
1380 assert(lp_check_value(bld->type, a));
1381 assert(lp_check_value(bld->type, b));
1382
1383 if(a == bld->undef || b == bld->undef)
1384 return bld->undef;
1385
1386 if(a == b)
1387 return a;
1388
1389 if(bld->type.norm) {
1390 if(a == bld->one || b == bld->one)
1391 return bld->one;
1392 if (!bld->type.sign) {
1393 if (a == bld->zero) {
1394 return b;
1395 }
1396 if (b == bld->zero) {
1397 return a;
1398 }
1399 }
1400 }
1401
1402 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1403 }
1404
1405
1406 /**
1407 * Generate max(a, b)
1408 * Checks for special cases.
1409 * NaNs are handled according to the behavior specified by the
1410 * nan_behavior argument.
1411 */
1412 LLVMValueRef
1413 lp_build_max_ext(struct lp_build_context *bld,
1414 LLVMValueRef a,
1415 LLVMValueRef b,
1416 enum gallivm_nan_behavior nan_behavior)
1417 {
1418 assert(lp_check_value(bld->type, a));
1419 assert(lp_check_value(bld->type, b));
1420
1421 if(a == bld->undef || b == bld->undef)
1422 return bld->undef;
1423
1424 if(a == b)
1425 return a;
1426
1427 if(bld->type.norm) {
1428 if(a == bld->one || b == bld->one)
1429 return bld->one;
1430 if (!bld->type.sign) {
1431 if (a == bld->zero) {
1432 return b;
1433 }
1434 if (b == bld->zero) {
1435 return a;
1436 }
1437 }
1438 }
1439
1440 return lp_build_max_simple(bld, a, b, nan_behavior);
1441 }
1442
1443 /**
1444 * Generate clamp(a, min, max)
1445 * NaN behavior (for any of a, min, max) is undefined.
1446 * Do checks for special cases.
1447 */
1448 LLVMValueRef
1449 lp_build_clamp(struct lp_build_context *bld,
1450 LLVMValueRef a,
1451 LLVMValueRef min,
1452 LLVMValueRef max)
1453 {
1454 assert(lp_check_value(bld->type, a));
1455 assert(lp_check_value(bld->type, min));
1456 assert(lp_check_value(bld->type, max));
1457
1458 a = lp_build_min(bld, a, max);
1459 a = lp_build_max(bld, a, min);
1460 return a;
1461 }
1462
1463
1464 /**
1465 * Generate clamp(a, 0, 1)
1466 * A NaN will get converted to zero.
1467 */
1468 LLVMValueRef
1469 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1470 LLVMValueRef a)
1471 {
1472 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1473 a = lp_build_min(bld, a, bld->one);
1474 return a;
1475 }
1476
1477
1478 /**
1479 * Generate abs(a)
1480 */
1481 LLVMValueRef
1482 lp_build_abs(struct lp_build_context *bld,
1483 LLVMValueRef a)
1484 {
1485 LLVMBuilderRef builder = bld->gallivm->builder;
1486 const struct lp_type type = bld->type;
1487 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1488
1489 assert(lp_check_value(type, a));
1490
1491 if(!type.sign)
1492 return a;
1493
1494 if(type.floating) {
1495 /* Mask out the sign bit */
1496 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1497 unsigned long long absMask = ~(1ULL << (type.width - 1));
1498 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1499 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1500 a = LLVMBuildAnd(builder, a, mask, "");
1501 a = LLVMBuildBitCast(builder, a, vec_type, "");
1502 return a;
1503 }
1504
1505 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1506 switch(type.width) {
1507 case 8:
1508 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1509 case 16:
1510 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1511 case 32:
1512 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1513 }
1514 }
1515 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1516 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1517 (type.width == 8 || type.width == 16 || type.width == 32)) {
1518 debug_printf("%s: inefficient code, should split vectors manually\n",
1519 __FUNCTION__);
1520 }
1521
1522 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1523 }
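
/*
 * Illustrative scalar model of the float path above (hypothetical helper, not
 * part of the build): clear the sign bit through an integer view of the
 * value.  Assumes 32-bit float and 32-bit unsigned.
 */
#if 0
static float
abs_bits_ref(float a)
{
   union { float f; unsigned u; } v;
   v.f = a;
   v.u &= 0x7fffffffu;    /* ~(1 << 31): mask out the sign bit */
   return v.f;
}
#endif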
1524
1525
1526 LLVMValueRef
1527 lp_build_negate(struct lp_build_context *bld,
1528 LLVMValueRef a)
1529 {
1530 LLVMBuilderRef builder = bld->gallivm->builder;
1531
1532 assert(lp_check_value(bld->type, a));
1533
1534 if (bld->type.floating)
1535 a = LLVMBuildFNeg(builder, a, "");
1536 else
1537 a = LLVMBuildNeg(builder, a, "");
1538
1539 return a;
1540 }
1541
1542
1543 /** Return -1, 0 or +1 depending on the sign of a */
1544 LLVMValueRef
1545 lp_build_sgn(struct lp_build_context *bld,
1546 LLVMValueRef a)
1547 {
1548 LLVMBuilderRef builder = bld->gallivm->builder;
1549 const struct lp_type type = bld->type;
1550 LLVMValueRef cond;
1551 LLVMValueRef res;
1552
1553 assert(lp_check_value(type, a));
1554
1555 /* Handle non-zero case */
1556 if(!type.sign) {
1557 /* if not zero then sign must be positive */
1558 res = bld->one;
1559 }
1560 else if(type.floating) {
1561 LLVMTypeRef vec_type;
1562 LLVMTypeRef int_type;
1563 LLVMValueRef mask;
1564 LLVMValueRef sign;
1565 LLVMValueRef one;
1566 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1567
1568 int_type = lp_build_int_vec_type(bld->gallivm, type);
1569 vec_type = lp_build_vec_type(bld->gallivm, type);
1570 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1571
1572 /* Take the sign bit and add it to 1 constant */
1573 sign = LLVMBuildBitCast(builder, a, int_type, "");
1574 sign = LLVMBuildAnd(builder, sign, mask, "");
1575 one = LLVMConstBitCast(bld->one, int_type);
1576 res = LLVMBuildOr(builder, sign, one, "");
1577 res = LLVMBuildBitCast(builder, res, vec_type, "");
1578 }
1579 else
1580 {
1581 /* signed int/norm/fixed point */
1582 /* could use psign with sse3 and appropriate vectors here */
1583 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1584 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1585 res = lp_build_select(bld, cond, bld->one, minus_one);
1586 }
1587
1588 /* Handle zero */
1589 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1590 res = lp_build_select(bld, cond, bld->zero, res);
1591
1592 return res;
1593 }
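
/*
 * Illustrative scalar model of the float path above (hypothetical helper, not
 * part of the build): transfer the sign bit of 'a' onto the constant 1.0,
 * then force the a == 0 case to zero.  Assumes 32-bit float and unsigned.
 */
#if 0
static float
sgn_ref(float a)
{
   union { float f; unsigned u; } one, v;
   one.f = 1.0f;
   v.f = a;
   one.u |= v.u & 0x80000000u;         /* copy the sign bit onto 1.0 */
   return (a == 0.0f) ? 0.0f : one.f;  /* zero case handled last */
}
#endif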
1594
1595
1596 /**
1597 * Set the sign of float vector 'a' according to 'sign'.
1598 * If sign==0, return abs(a).
1599 * If sign==1, return -abs(a).
1600 * Other values for sign produce undefined results.
1601 */
1602 LLVMValueRef
1603 lp_build_set_sign(struct lp_build_context *bld,
1604 LLVMValueRef a, LLVMValueRef sign)
1605 {
1606 LLVMBuilderRef builder = bld->gallivm->builder;
1607 const struct lp_type type = bld->type;
1608 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1609 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1610 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1611 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1612 ~((unsigned long long) 1 << (type.width - 1)));
1613 LLVMValueRef val, res;
1614
1615 assert(type.floating);
1616 assert(lp_check_value(type, a));
1617
1618 /* val = reinterpret_cast<int>(a) */
1619 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1620 /* val = val & mask */
1621 val = LLVMBuildAnd(builder, val, mask, "");
1622 /* sign = sign << shift */
1623 sign = LLVMBuildShl(builder, sign, shift, "");
1624 /* res = val | sign */
1625 res = LLVMBuildOr(builder, val, sign, "");
1626 /* res = reinterpret_cast<float>(res) */
1627 res = LLVMBuildBitCast(builder, res, vec_type, "");
1628
1629 return res;
1630 }
1631
1632
1633 /**
1634 * Convert vector of (or scalar) int to vector of (or scalar) float.
1635 */
1636 LLVMValueRef
1637 lp_build_int_to_float(struct lp_build_context *bld,
1638 LLVMValueRef a)
1639 {
1640 LLVMBuilderRef builder = bld->gallivm->builder;
1641 const struct lp_type type = bld->type;
1642 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1643
1644 assert(type.floating);
1645
1646 return LLVMBuildSIToFP(builder, a, vec_type, "");
1647 }
1648
1649 static boolean
1650 arch_rounding_available(const struct lp_type type)
1651 {
1652 if ((util_cpu_caps.has_sse4_1 &&
1653 (type.length == 1 || type.width*type.length == 128)) ||
1654 (util_cpu_caps.has_avx && type.width*type.length == 256))
1655 return TRUE;
1656 else if ((util_cpu_caps.has_altivec &&
1657 (type.width == 32 && type.length == 4)))
1658 return TRUE;
1659
1660 return FALSE;
1661 }
1662
1663 enum lp_build_round_mode
1664 {
1665 LP_BUILD_ROUND_NEAREST = 0,
1666 LP_BUILD_ROUND_FLOOR = 1,
1667 LP_BUILD_ROUND_CEIL = 2,
1668 LP_BUILD_ROUND_TRUNCATE = 3
1669 };
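
/*
 * These values are passed straight through as the rounding immediate of the
 * SSE4.1/AVX round intrinsics below (rounding control: 0 = nearest even,
 * 1 = floor, 2 = ceil, 3 = truncate); the AltiVec path maps them to
 * vrfin/vrfim/vrfip/vrfiz instead.
 */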
1670
1671 /**
1672 * Helper for SSE4.1's ROUNDxx instructions.
1673 *
1674 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1675 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1676 */
1677 static inline LLVMValueRef
1678 lp_build_round_sse41(struct lp_build_context *bld,
1679 LLVMValueRef a,
1680 enum lp_build_round_mode mode)
1681 {
1682 LLVMBuilderRef builder = bld->gallivm->builder;
1683 const struct lp_type type = bld->type;
1684 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1685 const char *intrinsic;
1686 LLVMValueRef res;
1687
1688 assert(type.floating);
1689
1690 assert(lp_check_value(type, a));
1691 assert(util_cpu_caps.has_sse4_1);
1692
1693 if (type.length == 1) {
1694 LLVMTypeRef vec_type;
1695 LLVMValueRef undef;
1696 LLVMValueRef args[3];
1697 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1698
1699 switch(type.width) {
1700 case 32:
1701 intrinsic = "llvm.x86.sse41.round.ss";
1702 break;
1703 case 64:
1704 intrinsic = "llvm.x86.sse41.round.sd";
1705 break;
1706 default:
1707 assert(0);
1708 return bld->undef;
1709 }
1710
1711 vec_type = LLVMVectorType(bld->elem_type, 4);
1712
1713 undef = LLVMGetUndef(vec_type);
1714
1715 args[0] = undef;
1716 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1717 args[2] = LLVMConstInt(i32t, mode, 0);
1718
1719 res = lp_build_intrinsic(builder, intrinsic,
1720 vec_type, args, Elements(args), 0);
1721
1722 res = LLVMBuildExtractElement(builder, res, index0, "");
1723 }
1724 else {
1725 if (type.width * type.length == 128) {
1726 switch(type.width) {
1727 case 32:
1728 intrinsic = "llvm.x86.sse41.round.ps";
1729 break;
1730 case 64:
1731 intrinsic = "llvm.x86.sse41.round.pd";
1732 break;
1733 default:
1734 assert(0);
1735 return bld->undef;
1736 }
1737 }
1738 else {
1739 assert(type.width * type.length == 256);
1740 assert(util_cpu_caps.has_avx);
1741
1742 switch(type.width) {
1743 case 32:
1744 intrinsic = "llvm.x86.avx.round.ps.256";
1745 break;
1746 case 64:
1747 intrinsic = "llvm.x86.avx.round.pd.256";
1748 break;
1749 default:
1750 assert(0);
1751 return bld->undef;
1752 }
1753 }
1754
1755 res = lp_build_intrinsic_binary(builder, intrinsic,
1756 bld->vec_type, a,
1757 LLVMConstInt(i32t, mode, 0));
1758 }
1759
1760 return res;
1761 }
1762
1763
1764 static inline LLVMValueRef
1765 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1766 LLVMValueRef a)
1767 {
1768 LLVMBuilderRef builder = bld->gallivm->builder;
1769 const struct lp_type type = bld->type;
1770 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1771 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1772 const char *intrinsic;
1773 LLVMValueRef res;
1774
1775 assert(type.floating);
1776 /* using the double precision conversions is a bit more complicated */
1777 assert(type.width == 32);
1778
1779 assert(lp_check_value(type, a));
1780 assert(util_cpu_caps.has_sse2);
1781
1782 /* This relies on the MXCSR rounding mode, which should always be nearest. */
1783 if (type.length == 1) {
1784 LLVMTypeRef vec_type;
1785 LLVMValueRef undef;
1786 LLVMValueRef arg;
1787 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1788
1789 vec_type = LLVMVectorType(bld->elem_type, 4);
1790
1791 intrinsic = "llvm.x86.sse.cvtss2si";
1792
1793 undef = LLVMGetUndef(vec_type);
1794
1795 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1796
1797 res = lp_build_intrinsic_unary(builder, intrinsic,
1798 ret_type, arg);
1799 }
1800 else {
1801 if (type.width* type.length == 128) {
1802 intrinsic = "llvm.x86.sse2.cvtps2dq";
1803 }
1804 else {
1805 assert(type.width*type.length == 256);
1806 assert(util_cpu_caps.has_avx);
1807
1808 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1809 }
1810 res = lp_build_intrinsic_unary(builder, intrinsic,
1811 ret_type, a);
1812 }
1813
1814 return res;
1815 }
1816
1817
1818 /*
1819 */
1820 static inline LLVMValueRef
1821 lp_build_round_altivec(struct lp_build_context *bld,
1822 LLVMValueRef a,
1823 enum lp_build_round_mode mode)
1824 {
1825 LLVMBuilderRef builder = bld->gallivm->builder;
1826 const struct lp_type type = bld->type;
1827 const char *intrinsic = NULL;
1828
1829 assert(type.floating);
1830
1831 assert(lp_check_value(type, a));
1832 assert(util_cpu_caps.has_altivec);
1833
1834 (void)type;
1835
1836 switch (mode) {
1837 case LP_BUILD_ROUND_NEAREST:
1838 intrinsic = "llvm.ppc.altivec.vrfin";
1839 break;
1840 case LP_BUILD_ROUND_FLOOR:
1841 intrinsic = "llvm.ppc.altivec.vrfim";
1842 break;
1843 case LP_BUILD_ROUND_CEIL:
1844 intrinsic = "llvm.ppc.altivec.vrfip";
1845 break;
1846 case LP_BUILD_ROUND_TRUNCATE:
1847 intrinsic = "llvm.ppc.altivec.vrfiz";
1848 break;
1849 }
1850
1851 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1852 }
1853
1854 static inline LLVMValueRef
1855 lp_build_round_arch(struct lp_build_context *bld,
1856 LLVMValueRef a,
1857 enum lp_build_round_mode mode)
1858 {
1859 if (util_cpu_caps.has_sse4_1)
1860 return lp_build_round_sse41(bld, a, mode);
1861 else /* (util_cpu_caps.has_altivec) */
1862 return lp_build_round_altivec(bld, a, mode);
1863 }
1864
1865 /**
1866 * Return the integer part of a float (vector) value (== round toward zero).
1867 * The returned value is a float (vector).
1868 * Ex: trunc(-1.5) = -1.0
1869 */
1870 LLVMValueRef
1871 lp_build_trunc(struct lp_build_context *bld,
1872 LLVMValueRef a)
1873 {
1874 LLVMBuilderRef builder = bld->gallivm->builder;
1875 const struct lp_type type = bld->type;
1876
1877 assert(type.floating);
1878 assert(lp_check_value(type, a));
1879
1880 if (arch_rounding_available(type)) {
1881 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1882 }
1883 else {
1884 const struct lp_type type = bld->type;
1885 struct lp_type inttype;
1886 struct lp_build_context intbld;
1887 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1888 LLVMValueRef trunc, res, anosign, mask;
1889 LLVMTypeRef int_vec_type = bld->int_vec_type;
1890 LLVMTypeRef vec_type = bld->vec_type;
1891
1892 assert(type.width == 32); /* might want to handle doubles at some point */
1893
1894 inttype = type;
1895 inttype.floating = 0;
1896 lp_build_context_init(&intbld, bld->gallivm, inttype);
1897
1898 /* round by truncation */
1899 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1900 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1901
1902 /* mask out sign bit */
1903 anosign = lp_build_abs(bld, a);
1904 /*
1905 * mask out all values if anosign > 2^24
1906        * This should work both for large ints (all rounding is a no-op for them
1907        * because such floats are always exact) as well as special cases like
1908        * NaNs, Infs (taking advantage of the fact that they use max exponent).
1909        * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1910 */
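/*
 * Illustrative note (numbers assumed from IEEE-754 single precision):
 * at 2^23 the spacing between consecutive floats reaches 1.0, so every
 * float with magnitude >= 2^23 is already an integer and truncation is a
 * no-op for it; e.g. 16777216.0 (2^24) has a ULP of 2.0.  The compare
 * against 2^24 therefore only serves to route huge values, NaNs and Infs
 * (whose FPToSI conversion would be undefined) around the trunc path.
 */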
1911 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1912 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1913 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1914 return lp_build_select(bld, mask, a, res);
1915 }
1916 }
1917
1918
1919 /**
1920 * Return float (vector) rounded to nearest integer (vector). The returned
1921 * value is a float (vector).
1922 * Ex: round(0.9) = 1.0
1923 * Ex: round(-1.5) = -2.0
1924 */
1925 LLVMValueRef
1926 lp_build_round(struct lp_build_context *bld,
1927 LLVMValueRef a)
1928 {
1929 LLVMBuilderRef builder = bld->gallivm->builder;
1930 const struct lp_type type = bld->type;
1931
1932 assert(type.floating);
1933 assert(lp_check_value(type, a));
1934
1935 if (arch_rounding_available(type)) {
1936 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1937 }
1938 else {
1939 const struct lp_type type = bld->type;
1940 struct lp_type inttype;
1941 struct lp_build_context intbld;
1942 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1943 LLVMValueRef res, anosign, mask;
1944 LLVMTypeRef int_vec_type = bld->int_vec_type;
1945 LLVMTypeRef vec_type = bld->vec_type;
1946
1947 assert(type.width == 32); /* might want to handle doubles at some point */
1948
1949 inttype = type;
1950 inttype.floating = 0;
1951 lp_build_context_init(&intbld, bld->gallivm, inttype);
1952
1953 res = lp_build_iround(bld, a);
1954 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1955
1956 /* mask out sign bit */
1957 anosign = lp_build_abs(bld, a);
1958 /*
1959 * mask out all values if anosign > 2^24
1960        * This should work both for large ints (all rounding is a no-op for them
1961        * because such floats are always exact) as well as special cases like
1962        * NaNs, Infs (taking advantage of the fact that they use max exponent).
1963        * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1964 */
1965 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1966 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1967 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1968 return lp_build_select(bld, mask, a, res);
1969 }
1970 }
1971
1972
1973 /**
1974 * Return floor of float (vector), result is a float (vector)
1975 * Ex: floor(1.1) = 1.0
1976 * Ex: floor(-1.1) = -2.0
1977 */
1978 LLVMValueRef
1979 lp_build_floor(struct lp_build_context *bld,
1980 LLVMValueRef a)
1981 {
1982 LLVMBuilderRef builder = bld->gallivm->builder;
1983 const struct lp_type type = bld->type;
1984
1985 assert(type.floating);
1986 assert(lp_check_value(type, a));
1987
1988 if (arch_rounding_available(type)) {
1989 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1990 }
1991 else {
1992 const struct lp_type type = bld->type;
1993 struct lp_type inttype;
1994 struct lp_build_context intbld;
1995 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1996 LLVMValueRef trunc, res, anosign, mask;
1997 LLVMTypeRef int_vec_type = bld->int_vec_type;
1998 LLVMTypeRef vec_type = bld->vec_type;
1999
2000 if (type.width != 32) {
2001 char intrinsic[32];
2002 util_snprintf(intrinsic, sizeof intrinsic, "llvm.floor.v%uf%u", type.length, type.width);
2003 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2004 }
2005
2006 assert(type.width == 32); /* might want to handle doubles at some point */
2007
2008 inttype = type;
2009 inttype.floating = 0;
2010 lp_build_context_init(&intbld, bld->gallivm, inttype);
2011
2012 /* round by truncation */
2013 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2014 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2015
2016 if (type.sign) {
2017 LLVMValueRef tmp;
2018
2019 /*
2020 * fix values if rounding is wrong (for non-special cases)
2021 * - this is the case if trunc > a
2022 */
2023 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2024 /* tmp = trunc > a ? 1.0 : 0.0 */
2025 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2026 tmp = lp_build_and(&intbld, mask, tmp);
2027 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2028 res = lp_build_sub(bld, res, tmp);
2029 }
2030
2031 /* mask out sign bit */
2032 anosign = lp_build_abs(bld, a);
2033 /*
2034 * mask out all values if anosign > 2^24
2035        * This should work both for large ints (all rounding is a no-op for them
2036        * because such floats are always exact) as well as special cases like
2037        * NaNs, Infs (taking advantage of the fact that they use max exponent).
2038        * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2039 */
2040 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2041 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2042 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2043 return lp_build_select(bld, mask, a, res);
2044 }
2045 }
2046
2047
2048 /**
2049 * Return ceiling of float (vector), returning float (vector).
2050 * Ex: ceil( 1.1) = 2.0
2051 * Ex: ceil(-1.1) = -1.0
2052 */
2053 LLVMValueRef
2054 lp_build_ceil(struct lp_build_context *bld,
2055 LLVMValueRef a)
2056 {
2057 LLVMBuilderRef builder = bld->gallivm->builder;
2058 const struct lp_type type = bld->type;
2059
2060 assert(type.floating);
2061 assert(lp_check_value(type, a));
2062
2063 if (arch_rounding_available(type)) {
2064 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2065 }
2066 else {
2067 const struct lp_type type = bld->type;
2068 struct lp_type inttype;
2069 struct lp_build_context intbld;
2070 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2071 LLVMValueRef trunc, res, anosign, mask, tmp;
2072 LLVMTypeRef int_vec_type = bld->int_vec_type;
2073 LLVMTypeRef vec_type = bld->vec_type;
2074
2075 if (type.width != 32) {
2076 char intrinsic[32];
2077 util_snprintf(intrinsic, sizeof intrinsic, "llvm.ceil.v%uf%u", type.length, type.width);
2078 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2079 }
2080
2081 assert(type.width == 32); /* might want to handle doubles at some point */
2082
2083 inttype = type;
2084 inttype.floating = 0;
2085 lp_build_context_init(&intbld, bld->gallivm, inttype);
2086
2087 /* round by truncation */
2088 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2089 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2090
2091 /*
2092 * fix values if rounding is wrong (for non-special cases)
2093 * - this is the case if trunc < a
2094 */
2095 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2096 /* tmp = trunc < a ? 1.0 : 0.0 */
2097 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2098 tmp = lp_build_and(&intbld, mask, tmp);
2099 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2100 res = lp_build_add(bld, trunc, tmp);
2101
2102 /* mask out sign bit */
2103 anosign = lp_build_abs(bld, a);
2104 /*
2105 * mask out all values if anosign > 2^24
2106        * This should work both for large ints (all rounding is a no-op for them
2107        * because such floats are always exact) as well as special cases like
2108        * NaNs, Infs (taking advantage of the fact that they use max exponent).
2109        * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2110 */
2111 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2112 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2113 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2114 return lp_build_select(bld, mask, a, res);
2115 }
2116 }
2117
2118
2119 /**
2120 * Return fractional part of 'a' computed as a - floor(a)
2121 * Typically used in texture coord arithmetic.
2122 */
2123 LLVMValueRef
2124 lp_build_fract(struct lp_build_context *bld,
2125 LLVMValueRef a)
2126 {
2127 assert(bld->type.floating);
2128 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2129 }
2130
2131
2132 /**
2133 * Prevent returning a fractional part of 1.0 for very small negative values of
2134 * 'a' by clamping against 0.99999(9).
2135 */
2136 static inline LLVMValueRef
2137 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2138 {
2139 LLVMValueRef max;
2140
2141 /* this is the largest number smaller than 1.0 representable as float */
2142 max = lp_build_const_vec(bld->gallivm, bld->type,
2143 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2144 return lp_build_min(bld, fract, max);
2145 }
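/*
 * Worked example for clamp_fract above, assuming 32-bit floats
 * (lp_mantissa() == 23): the clamp constant is
 * 1.0 - 1.0/2^24 = 0.99999994..., which is exactly the largest float
 * strictly below 1.0 (bit pattern 0x3f7fffff).
 */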
2146
2147
2148 /**
2149 * Same as lp_build_fract, but guarantees that the result is always smaller
2150 * than one.
2151 */
2152 LLVMValueRef
2153 lp_build_fract_safe(struct lp_build_context *bld,
2154 LLVMValueRef a)
2155 {
2156 return clamp_fract(bld, lp_build_fract(bld, a));
2157 }
2158
2159
2160 /**
2161 * Return the integer part of a float (vector) value (== round toward zero).
2162 * The returned value is an integer (vector).
2163 * Ex: itrunc(-1.5) = -1
2164 */
2165 LLVMValueRef
2166 lp_build_itrunc(struct lp_build_context *bld,
2167 LLVMValueRef a)
2168 {
2169 LLVMBuilderRef builder = bld->gallivm->builder;
2170 const struct lp_type type = bld->type;
2171 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2172
2173 assert(type.floating);
2174 assert(lp_check_value(type, a));
2175
2176 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2177 }
2178
2179
2180 /**
2181 * Return float (vector) rounded to nearest integer (vector). The returned
2182 * value is an integer (vector).
2183 * Ex: iround(0.9) = 1
2184 * Ex: iround(-1.5) = -2
2185 */
2186 LLVMValueRef
2187 lp_build_iround(struct lp_build_context *bld,
2188 LLVMValueRef a)
2189 {
2190 LLVMBuilderRef builder = bld->gallivm->builder;
2191 const struct lp_type type = bld->type;
2192 LLVMTypeRef int_vec_type = bld->int_vec_type;
2193 LLVMValueRef res;
2194
2195 assert(type.floating);
2196
2197 assert(lp_check_value(type, a));
2198
2199 if ((util_cpu_caps.has_sse2 &&
2200 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2201 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2202 return lp_build_iround_nearest_sse2(bld, a);
2203 }
2204 if (arch_rounding_available(type)) {
2205 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2206 }
2207 else {
2208 LLVMValueRef half;
2209
2210 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2211
2212 if (type.sign) {
2213 LLVMTypeRef vec_type = bld->vec_type;
2214 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2215 (unsigned long long)1 << (type.width - 1));
2216 LLVMValueRef sign;
2217
2218 /* get sign bit */
2219 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2220 sign = LLVMBuildAnd(builder, sign, mask, "");
2221
2222 /* sign * 0.5 */
2223 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2224 half = LLVMBuildOr(builder, sign, half, "");
2225 half = LLVMBuildBitCast(builder, half, vec_type, "");
2226 }
2227
2228 res = LLVMBuildFAdd(builder, a, half, "");
2229 }
2230
2231 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2232
2233 return res;
2234 }
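/*
 * Illustrative trace of the add-signed-half fallback in lp_build_iround
 * above:  a = 0.9  ->  0.9 + 0.5  =  1.4 -> fptosi ->  1
 *         a = -1.3 -> -1.3 + (-0.5) = -1.8 -> fptosi -> -1
 *         a = -1.5 -> -1.5 + (-0.5) = -2.0 -> fptosi -> -2
 * i.e. the sign bit of 'a' is OR'ed into 0.5 so that on this path halfway
 * cases round away from zero, matching the examples in the comment above.
 */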
2235
2236
2237 /**
2238 * Return floor of float (vector), result is an int (vector)
2239  * Ex: ifloor(1.1) = 1
2240  * Ex: ifloor(-1.1) = -2
2241 */
2242 LLVMValueRef
2243 lp_build_ifloor(struct lp_build_context *bld,
2244 LLVMValueRef a)
2245 {
2246 LLVMBuilderRef builder = bld->gallivm->builder;
2247 const struct lp_type type = bld->type;
2248 LLVMTypeRef int_vec_type = bld->int_vec_type;
2249 LLVMValueRef res;
2250
2251 assert(type.floating);
2252 assert(lp_check_value(type, a));
2253
2254 res = a;
2255 if (type.sign) {
2256 if (arch_rounding_available(type)) {
2257 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2258 }
2259 else {
2260 struct lp_type inttype;
2261 struct lp_build_context intbld;
2262 LLVMValueRef trunc, itrunc, mask;
2263
2264 assert(type.floating);
2265 assert(lp_check_value(type, a));
2266
2267 inttype = type;
2268 inttype.floating = 0;
2269 lp_build_context_init(&intbld, bld->gallivm, inttype);
2270
2271 /* round by truncation */
2272 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2273 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2274
2275 /*
2276 * fix values if rounding is wrong (for non-special cases)
2277 * - this is the case if trunc > a
2278          * The results of doing this with NaNs, very large values etc. are
2279          * undefined, but results for such inputs are undefined anyway.
2280 */
2281 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2282 /* cheapie minus one with mask since the mask is minus one / zero */
2283 return lp_build_add(&intbld, itrunc, mask);
2284 }
2285 }
2286
2287    /* convert to int (round toward zero) */
2288 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2289
2290 return res;
2291 }
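/*
 * Worked example for the mask trick in lp_build_ifloor above:
 * a = -1.1 -> itrunc = -1, trunc = -1.0; trunc > a, so the compare mask
 * is all ones, i.e. -1 as a signed integer, and itrunc + mask = -2.
 * a = 1.1 -> trunc = 1.0 is not > a, mask = 0, and the result stays 1.
 * (lp_build_iceil below uses the same idea with the comparison reversed
 * and a subtraction instead of an addition.)
 */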
2292
2293
2294 /**
2295 * Return ceiling of float (vector), returning int (vector).
2296 * Ex: iceil( 1.1) = 2
2297 * Ex: iceil(-1.1) = -1
2298 */
2299 LLVMValueRef
2300 lp_build_iceil(struct lp_build_context *bld,
2301 LLVMValueRef a)
2302 {
2303 LLVMBuilderRef builder = bld->gallivm->builder;
2304 const struct lp_type type = bld->type;
2305 LLVMTypeRef int_vec_type = bld->int_vec_type;
2306 LLVMValueRef res;
2307
2308 assert(type.floating);
2309 assert(lp_check_value(type, a));
2310
2311 if (arch_rounding_available(type)) {
2312 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2313 }
2314 else {
2315 struct lp_type inttype;
2316 struct lp_build_context intbld;
2317 LLVMValueRef trunc, itrunc, mask;
2318
2319 assert(type.floating);
2320 assert(lp_check_value(type, a));
2321
2322 inttype = type;
2323 inttype.floating = 0;
2324 lp_build_context_init(&intbld, bld->gallivm, inttype);
2325
2326 /* round by truncation */
2327 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2328 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2329
2330 /*
2331 * fix values if rounding is wrong (for non-special cases)
2332 * - this is the case if trunc < a
2333       * The results of doing this with NaNs, very large values etc. are
2334       * undefined, but results for such inputs are undefined anyway.
2335 */
2336 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2337 /* cheapie plus one with mask since the mask is minus one / zero */
2338 return lp_build_sub(&intbld, itrunc, mask);
2339 }
2340
2341    /* convert to int (round toward zero) */
2342 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2343
2344 return res;
2345 }
2346
2347
2348 /**
2349 * Combined ifloor() & fract().
2350 *
2351 * Preferred to calling the functions separately, as it will ensure that the
2352 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2353 */
2354 void
2355 lp_build_ifloor_fract(struct lp_build_context *bld,
2356 LLVMValueRef a,
2357 LLVMValueRef *out_ipart,
2358 LLVMValueRef *out_fpart)
2359 {
2360 LLVMBuilderRef builder = bld->gallivm->builder;
2361 const struct lp_type type = bld->type;
2362 LLVMValueRef ipart;
2363
2364 assert(type.floating);
2365 assert(lp_check_value(type, a));
2366
2367 if (arch_rounding_available(type)) {
2368 /*
2369 * floor() is easier.
2370 */
2371
2372 ipart = lp_build_floor(bld, a);
2373 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2374 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2375 }
2376 else {
2377 /*
2378 * ifloor() is easier.
2379 */
2380
2381 *out_ipart = lp_build_ifloor(bld, a);
2382 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2383 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2384 }
2385 }
2386
2387
2388 /**
2389 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2390 * always smaller than one.
2391 */
2392 void
2393 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2394 LLVMValueRef a,
2395 LLVMValueRef *out_ipart,
2396 LLVMValueRef *out_fpart)
2397 {
2398 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2399 *out_fpart = clamp_fract(bld, *out_fpart);
2400 }
2401
2402
2403 LLVMValueRef
2404 lp_build_sqrt(struct lp_build_context *bld,
2405 LLVMValueRef a)
2406 {
2407 LLVMBuilderRef builder = bld->gallivm->builder;
2408 const struct lp_type type = bld->type;
2409 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2410 char intrinsic[32];
2411
2412 assert(lp_check_value(type, a));
2413
2414 /* TODO: optimize the constant case */
2415
2416 assert(type.floating);
2417 if (type.length == 1) {
2418 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2419 }
2420 else {
2421 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2422 }
2423
2424 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2425 }
2426
2427
2428 /**
2429  * Do one Newton-Raphson step to improve reciprocal precision:
2430 *
2431 * x_{i+1} = x_i * (2 - a * x_i)
2432 *
2433 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2434  * +/-Inf, giving NaN instead. Certain applications rely on the IEEE behavior
2435  * here, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2436 * halo. It would be necessary to clamp the argument to prevent this.
2437 *
2438 * See also:
2439 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2440 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2441 */
2442 static inline LLVMValueRef
2443 lp_build_rcp_refine(struct lp_build_context *bld,
2444 LLVMValueRef a,
2445 LLVMValueRef rcp_a)
2446 {
2447 LLVMBuilderRef builder = bld->gallivm->builder;
2448 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2449 LLVMValueRef res;
2450
2451 res = LLVMBuildFMul(builder, a, rcp_a, "");
2452 res = LLVMBuildFSub(builder, two, res, "");
2453 res = LLVMBuildFMul(builder, rcp_a, res, "");
2454
2455 return res;
2456 }
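/*
 * Sketch of why the step above converges (standard Newton-Raphson algebra,
 * not specific to this code): if x_i = (1 - e)/a for some relative error e,
 * then
 *   a * x_i           = 1 - e
 *   2 - a * x_i       = 1 + e
 *   x_i * (2 - a*x_i) = (1 - e)*(1 + e)/a = (1 - e^2)/a
 * so each step squares the relative error, roughly doubling the number of
 * correct bits, except for the 0/Inf cases noted above.
 */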
2457
2458
2459 LLVMValueRef
2460 lp_build_rcp(struct lp_build_context *bld,
2461 LLVMValueRef a)
2462 {
2463 LLVMBuilderRef builder = bld->gallivm->builder;
2464 const struct lp_type type = bld->type;
2465
2466 assert(lp_check_value(type, a));
2467
2468 if(a == bld->zero)
2469 return bld->undef;
2470 if(a == bld->one)
2471 return bld->one;
2472 if(a == bld->undef)
2473 return bld->undef;
2474
2475 assert(type.floating);
2476
2477 if(LLVMIsConstant(a))
2478 return LLVMConstFDiv(bld->one, a);
2479
2480 /*
2481 * We don't use RCPPS because:
2482     * - it only has 10 bits of precision
2483     * - it doesn't even get the reciprocal of 1.0 exactly
2484     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2485     * - for recent processors the benefit over DIVPS is marginal, and case
2486     *   dependent
2487     *
2488     * We could still use it on certain processors if benchmarks show that the
2489     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2490     * particular uses that require fewer workarounds.
2491 */
2492
2493 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2494 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2495 const unsigned num_iterations = 0;
2496 LLVMValueRef res;
2497 unsigned i;
2498 const char *intrinsic = NULL;
2499
2500 if (type.length == 4) {
2501 intrinsic = "llvm.x86.sse.rcp.ps";
2502 }
2503 else {
2504 intrinsic = "llvm.x86.avx.rcp.ps.256";
2505 }
2506
2507 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2508
2509 for (i = 0; i < num_iterations; ++i) {
2510 res = lp_build_rcp_refine(bld, a, res);
2511 }
2512
2513 return res;
2514 }
2515
2516 return LLVMBuildFDiv(builder, bld->one, a, "");
2517 }
2518
2519
2520 /**
2521 * Do one Newton-Raphson step to improve rsqrt precision:
2522 *
2523 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2524 *
2525 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2526 */
2527 static inline LLVMValueRef
2528 lp_build_rsqrt_refine(struct lp_build_context *bld,
2529 LLVMValueRef a,
2530 LLVMValueRef rsqrt_a)
2531 {
2532 LLVMBuilderRef builder = bld->gallivm->builder;
2533 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2534 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2535 LLVMValueRef res;
2536
2537 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2538 res = LLVMBuildFMul(builder, a, res, "");
2539 res = LLVMBuildFSub(builder, three, res, "");
2540 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2541 res = LLVMBuildFMul(builder, half, res, "");
2542
2543 return res;
2544 }
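/*
 * Sketch of the corresponding error analysis (standard Newton-Raphson
 * algebra, not specific to this code): if x_i = (1 + e)/sqrt(a), then
 *   a * x_i * x_i       = (1 + e)^2
 *   3 - a * x_i * x_i   = 2 - 2e - e^2
 *   0.5 * x_i * (above) = (1 - 1.5*e^2 - 0.5*e^3)/sqrt(a)
 * i.e. the relative error drops from e to about -1.5*e^2 per step.
 */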
2545
2546
2547 /**
2548 * Generate 1/sqrt(a).
2549 * Result is undefined for values < 0, infinity for +0.
2550 */
2551 LLVMValueRef
2552 lp_build_rsqrt(struct lp_build_context *bld,
2553 LLVMValueRef a)
2554 {
2555 const struct lp_type type = bld->type;
2556
2557 assert(lp_check_value(type, a));
2558
2559 assert(type.floating);
2560
2561 /*
2562 * This should be faster but all denormals will end up as infinity.
2563 */
2564 if (0 && lp_build_fast_rsqrt_available(type)) {
2565 const unsigned num_iterations = 1;
2566 LLVMValueRef res;
2567 unsigned i;
2568
2569 /* rsqrt(1.0) != 1.0 here */
2570 res = lp_build_fast_rsqrt(bld, a);
2571
2572 if (num_iterations) {
2573 /*
2574 * Newton-Raphson will result in NaN instead of infinity for zero,
2575 * and NaN instead of zero for infinity.
2576 * Also, need to ensure rsqrt(1.0) == 1.0.
2577 * All numbers smaller than FLT_MIN will result in +infinity
2578 * (rsqrtps treats all denormals as zero).
2579 */
2580 LLVMValueRef cmp;
2581 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2582 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2583
2584 for (i = 0; i < num_iterations; ++i) {
2585 res = lp_build_rsqrt_refine(bld, a, res);
2586 }
2587 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2588 res = lp_build_select(bld, cmp, inf, res);
2589 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2590 res = lp_build_select(bld, cmp, bld->zero, res);
2591 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2592 res = lp_build_select(bld, cmp, bld->one, res);
2593 }
2594
2595 return res;
2596 }
2597
2598 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2599 }
2600
2601 /**
2602  * Returns true if a fast (inaccurate) rsqrt instruction is available.
2603  * (The caller may want to avoid calling rsqrt_fast if it's not available;
2604  * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2605  * unavailable it would result in sqrt/div/mul, so it is obviously
2606  * much better to just call sqrt, skipping both div and mul).
2607 */
2608 boolean
2609 lp_build_fast_rsqrt_available(struct lp_type type)
2610 {
2611 assert(type.floating);
2612
2613 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2614 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2615 return true;
2616 }
2617 return false;
2618 }
2619
2620
2621 /**
2622 * Generate 1/sqrt(a).
2623 * Result is undefined for values < 0, infinity for +0.
2624 * Precision is limited, only ~10 bits guaranteed
2625 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2626 */
2627 LLVMValueRef
2628 lp_build_fast_rsqrt(struct lp_build_context *bld,
2629 LLVMValueRef a)
2630 {
2631 LLVMBuilderRef builder = bld->gallivm->builder;
2632 const struct lp_type type = bld->type;
2633
2634 assert(lp_check_value(type, a));
2635
2636 if (lp_build_fast_rsqrt_available(type)) {
2637 const char *intrinsic = NULL;
2638
2639 if (type.length == 4) {
2640 intrinsic = "llvm.x86.sse.rsqrt.ps";
2641 }
2642 else {
2643 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2644 }
2645 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2646 }
2647 else {
2648 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2649 }
2650 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2651 }
2652
2653
2654 /**
2655 * Generate sin(a) or cos(a) using polynomial approximation.
2656  * TODO: it might be worth recognizing sin and cos using the same source
2657  * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2658  * would be way cheaper than calculating (nearly) everything twice...
2659  * Not sure it's common enough to be worth bothering with, however; the
2660  * scs opcode could also benefit from calculating both.
2661 */
2662 static LLVMValueRef
2663 lp_build_sin_or_cos(struct lp_build_context *bld,
2664 LLVMValueRef a,
2665 boolean cos)
2666 {
2667 struct gallivm_state *gallivm = bld->gallivm;
2668 LLVMBuilderRef b = gallivm->builder;
2669 struct lp_type int_type = lp_int_type(bld->type);
2670
2671 /*
2672 * take the absolute value,
2673 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2674 */
2675
2676 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2677 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2678
2679 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2680 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2681
2682 /*
2683 * scale by 4/Pi
2684 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2685 */
2686
2687 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2688 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2689
2690 /*
2691 * store the integer part of y in mm0
2692 * emm2 = _mm_cvttps_epi32(y);
2693 */
2694
2695 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2696
2697 /*
2698 * j=(j+1) & (~1) (see the cephes sources)
2699 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2700 */
2701
2702 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2703 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2704 /*
2705 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2706 */
2707 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2708 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2709
2710 /*
2711 * y = _mm_cvtepi32_ps(emm2);
2712 */
2713 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2714
2715 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2716 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2717 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2718 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2719
2720 /*
2721 * Argument used for poly selection and sign bit determination
2722 * is different for sin vs. cos.
2723 */
2724 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2725 emm2_and;
2726
2727 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2728 LLVMBuildNot(b, emm2_2, ""), ""),
2729 const_29, "sign_bit") :
2730 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2731 LLVMBuildShl(b, emm2_add,
2732 const_29, ""), ""),
2733 sign_mask, "sign_bit");
2734
2735 /*
2736     * get the polynomial selection mask
2737     * there is one polynomial for 0 <= x <= Pi/4
2738     * and another one for Pi/4 < x <= Pi/2
2739 * Both branches will be computed.
2740 *
2741 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2742 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2743 */
2744
2745 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2746 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2747 int_type, PIPE_FUNC_EQUAL,
2748 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
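/*
 * Illustrative trace (sin path, a = 1.0): |a|*4/Pi ~ 1.273 truncates to
 * j = 1, then (j+1) & ~1 = 2, so y = 2.0 and the reduced argument becomes
 * 1.0 - 2*Pi/4 ~ -0.5708.  Here emm2_3 = 2 & 2 != 0, so poly_mask is zero
 * and the cosine polynomial is selected further down:
 * cos(-0.5708) ~ 0.8415 = sin(1.0), with a cleared sign bit.
 */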
2749
2750 /*
2751 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2752 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2753 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2754 */
2755 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2756 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2757 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2758
2759 /*
2760 * The magic pass: "Extended precision modular arithmetic"
2761 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2762 * xmm1 = _mm_mul_ps(y, xmm1);
2763 * xmm2 = _mm_mul_ps(y, xmm2);
2764 * xmm3 = _mm_mul_ps(y, xmm3);
2765 */
2766 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2767 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2768 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2769
2770 /*
2771 * x = _mm_add_ps(x, xmm1);
2772 * x = _mm_add_ps(x, xmm2);
2773 * x = _mm_add_ps(x, xmm3);
2774 */
2775
2776 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2777 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2778 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2779
2780 /*
2781     * Evaluate the first polynomial (0 <= x <= Pi/4)
2782 *
2783 * z = _mm_mul_ps(x,x);
2784 */
2785 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2786
2787 /*
2788 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2789 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2790 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2791 */
2792 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2793 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2794 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2795
2796 /*
2797 * y = *(v4sf*)_ps_coscof_p0;
2798 * y = _mm_mul_ps(y, z);
2799 */
2800 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2801 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2802 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2803 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2804 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2805 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2806
2807
2808 /*
2809 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2810 * y = _mm_sub_ps(y, tmp);
2811 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2812 */
2813 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2814 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2815    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2816 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2817    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2818
2819 /*
2820 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2821 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2822 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2823 */
2824 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2825 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2826 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2827
2828 /*
2829     * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2830 *
2831 * y2 = *(v4sf*)_ps_sincof_p0;
2832 * y2 = _mm_mul_ps(y2, z);
2833 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2834 * y2 = _mm_mul_ps(y2, z);
2835 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2836 * y2 = _mm_mul_ps(y2, z);
2837 * y2 = _mm_mul_ps(y2, x);
2838 * y2 = _mm_add_ps(y2, x);
2839 */
2840
2841 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2842 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2843 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2844 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2845 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2846 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2847 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2848
2849 /*
2850     * select the correct result from the two polynomials
2851 * xmm3 = poly_mask;
2852 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2853 * y = _mm_andnot_ps(xmm3, y);
2854 * y = _mm_or_ps(y,y2);
2855 */
2856 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2857 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2858 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2859 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2860 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2861 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2862
2863 /*
2864 * update the sign
2865 * y = _mm_xor_ps(y, sign_bit);
2866 */
2867 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2868 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2869
2870 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2871
2872 /* clamp output to be within [-1, 1] */
2873 y_result = lp_build_clamp(bld, y_result,
2874 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2875 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2876 /* If a is -inf, inf or NaN then return NaN */
2877 y_result = lp_build_select(bld, isfinite, y_result,
2878 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2879 return y_result;
2880 }
2881
2882
2883 /**
2884 * Generate sin(a)
2885 */
2886 LLVMValueRef
2887 lp_build_sin(struct lp_build_context *bld,
2888 LLVMValueRef a)
2889 {
2890 return lp_build_sin_or_cos(bld, a, FALSE);
2891 }
2892
2893
2894 /**
2895 * Generate cos(a)
2896 */
2897 LLVMValueRef
2898 lp_build_cos(struct lp_build_context *bld,
2899 LLVMValueRef a)
2900 {
2901 return lp_build_sin_or_cos(bld, a, TRUE);
2902 }
2903
2904
2905 /**
2906 * Generate pow(x, y)
2907 */
2908 LLVMValueRef
2909 lp_build_pow(struct lp_build_context *bld,
2910 LLVMValueRef x,
2911 LLVMValueRef y)
2912 {
2913 /* TODO: optimize the constant case */
2914 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2915 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2916 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2917 __FUNCTION__);
2918 }
2919
2920 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2921 }
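/*
 * Note: since this goes through log2, the result is only meaningful for
 * x > 0 (negative bases and 0 hit the undefined log2 cases).
 * E.g. pow(2.0, 10.0) = exp2(10.0 * log2(2.0)) = exp2(10.0) = 1024.0.
 */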
2922
2923
2924 /**
2925 * Generate exp(x)
2926 */
2927 LLVMValueRef
2928 lp_build_exp(struct lp_build_context *bld,
2929 LLVMValueRef x)
2930 {
2931 /* log2(e) = 1/log(2) */
2932 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2933 1.4426950408889634);
2934
2935 assert(lp_check_value(bld->type, x));
2936
2937 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2938 }
2939
2940
2941 /**
2942 * Generate log(x)
2943 * Behavior is undefined with infs, 0s and nans
2944 */
2945 LLVMValueRef
2946 lp_build_log(struct lp_build_context *bld,
2947 LLVMValueRef x)
2948 {
2949 /* log(2) */
2950 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2951 0.69314718055994529);
2952
2953 assert(lp_check_value(bld->type, x));
2954
2955 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2956 }
2957
2958 /**
2959 * Generate log(x) that handles edge cases (infs, 0s and nans)
2960 */
2961 LLVMValueRef
2962 lp_build_log_safe(struct lp_build_context *bld,
2963 LLVMValueRef x)
2964 {
2965 /* log(2) */
2966 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2967 0.69314718055994529);
2968
2969 assert(lp_check_value(bld->type, x));
2970
2971 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2972 }
2973
2974
2975 /**
2976 * Generate polynomial.
2977 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2978 */
2979 LLVMValueRef
2980 lp_build_polynomial(struct lp_build_context *bld,
2981 LLVMValueRef x,
2982 const double *coeffs,
2983 unsigned num_coeffs)
2984 {
2985 const struct lp_type type = bld->type;
2986 LLVMValueRef even = NULL, odd = NULL;
2987 LLVMValueRef x2;
2988 unsigned i;
2989
2990 assert(lp_check_value(bld->type, x));
2991
2992 /* TODO: optimize the constant case */
2993 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2994 LLVMIsConstant(x)) {
2995 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2996 __FUNCTION__);
2997 }
2998
2999 /*
3000     * Calculate odd and even terms separately to decrease data dependency
3001 * Ex:
3002 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3003 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3004 */
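/*
 * E.g. for num_coeffs == 4 the loop below builds
 *   even = c[0] + x2*c[2],  odd = c[1] + x2*c[3]
 * and the final result is odd*x + even
 *   = c[0] + c[1]*x + c[2]*x^2 + c[3]*x^3.
 */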
3005 x2 = lp_build_mul(bld, x, x);
3006
3007 for (i = num_coeffs; i--; ) {
3008 LLVMValueRef coeff;
3009
3010 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3011
3012 if (i % 2 == 0) {
3013 if (even)
3014 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
3015 else
3016 even = coeff;
3017 } else {
3018 if (odd)
3019 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
3020 else
3021 odd = coeff;
3022 }
3023 }
3024
3025 if (odd)
3026 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
3027 else if (even)
3028 return even;
3029 else
3030 return bld->undef;
3031 }
3032
3033
3034 /**
3035 * Minimax polynomial fit of 2**x, in range [0, 1[
3036 */
3037 const double lp_build_exp2_polynomial[] = {
3038 #if EXP_POLY_DEGREE == 5
3039 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3040 0.693153073200168932794,
3041 0.240153617044375388211,
3042 0.0558263180532956664775,
3043 0.00898934009049466391101,
3044 0.00187757667519147912699
3045 #elif EXP_POLY_DEGREE == 4
3046 1.00000259337069434683,
3047 0.693003834469974940458,
3048 0.24144275689150793076,
3049 0.0520114606103070150235,
3050 0.0135341679161270268764
3051 #elif EXP_POLY_DEGREE == 3
3052 0.999925218562710312959,
3053 0.695833540494823811697,
3054 0.226067155427249155588,
3055 0.0780245226406372992967
3056 #elif EXP_POLY_DEGREE == 2
3057 1.00172476321474503578,
3058 0.657636275736077639316,
3059 0.33718943461968720704
3060 #else
3061 #error
3062 #endif
3063 };
3064
3065
3066 LLVMValueRef
3067 lp_build_exp2(struct lp_build_context *bld,
3068 LLVMValueRef x)
3069 {
3070 LLVMBuilderRef builder = bld->gallivm->builder;
3071 const struct lp_type type = bld->type;
3072 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3073 LLVMValueRef ipart = NULL;
3074 LLVMValueRef fpart = NULL;
3075 LLVMValueRef expipart = NULL;
3076 LLVMValueRef expfpart = NULL;
3077 LLVMValueRef res = NULL;
3078
3079 assert(lp_check_value(bld->type, x));
3080
3081 /* TODO: optimize the constant case */
3082 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3083 LLVMIsConstant(x)) {
3084 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3085 __FUNCTION__);
3086 }
3087
3088 assert(type.floating && type.width == 32);
3089
3090    /* We want to preserve NaN and make sure that for exp2, if x > 128,
3091 * the result is INF and if it's smaller than -126.9 the result is 0 */
3092 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3093 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3094 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3095 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3096
3097 /* ipart = floor(x) */
3098 /* fpart = x - ipart */
3099 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3100
3101 /* expipart = (float) (1 << ipart) */
3102 expipart = LLVMBuildAdd(builder, ipart,
3103 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3104 expipart = LLVMBuildShl(builder, expipart,
3105 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3106 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
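/*
 * Illustrative trace of the bias trick above: for ipart = 3,
 * (3 + 127) << 23 = 0x41000000, which reinterpreted as a float is 8.0,
 * i.e. exactly 2^ipart.  The min/max clamp above keeps ipart in a range
 * where this stays well defined (ipart = 128 deliberately yields Inf).
 */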
3107
3108 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3109 Elements(lp_build_exp2_polynomial));
3110
3111 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3112
3113 return res;
3114 }
3115
3116
3117
3118 /**
3119  * Extract the exponent of an IEEE-754 floating point value.
3120 *
3121 * Optionally apply an integer bias.
3122 *
3123 * Result is an integer value with
3124 *
3125 * ifloor(log2(x)) + bias
3126 */
3127 LLVMValueRef
3128 lp_build_extract_exponent(struct lp_build_context *bld,
3129 LLVMValueRef x,
3130 int bias)
3131 {
3132 LLVMBuilderRef builder = bld->gallivm->builder;
3133 const struct lp_type type = bld->type;
3134 unsigned mantissa = lp_mantissa(type);
3135 LLVMValueRef res;
3136
3137 assert(type.floating);
3138
3139 assert(lp_check_value(bld->type, x));
3140
3141 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3142
3143 res = LLVMBuildLShr(builder, x,
3144 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3145 res = LLVMBuildAnd(builder, res,
3146 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3147 res = LLVMBuildSub(builder, res,
3148 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3149
3150 return res;
3151 }
3152
3153
3154 /**
3155  * Extract the mantissa of a floating point value.
3156 *
3157 * Result is a floating point value with
3158 *
3159  *   x / 2**floor(log2(x))
3160 */
3161 LLVMValueRef
3162 lp_build_extract_mantissa(struct lp_build_context *bld,
3163 LLVMValueRef x)
3164 {
3165 LLVMBuilderRef builder = bld->gallivm->builder;
3166 const struct lp_type type = bld->type;
3167 unsigned mantissa = lp_mantissa(type);
3168 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3169 (1ULL << mantissa) - 1);
3170 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3171 LLVMValueRef res;
3172
3173 assert(lp_check_value(bld->type, x));
3174
3175 assert(type.floating);
3176
3177 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3178
3179 /* res = x / 2**ipart */
3180 res = LLVMBuildAnd(builder, x, mantmask, "");
3181 res = LLVMBuildOr(builder, res, one, "");
3182 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3183
3184 return res;
3185 }
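/*
 * Worked example for the masking above: x = 6.0 has the bit pattern
 * 0x40c00000; keeping only the mantissa bits (0x400000) and OR'ing in the
 * bits of 1.0 (0x3f800000) gives 0x3fc00000 = 1.5 = 6.0 / 2^2, i.e. the
 * significand scaled into [1, 2).
 */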
3186
3187
3188
3189 /**
3190  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3191  * These coefficients can be generated with
3192 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3193 */
3194 const double lp_build_log2_polynomial[] = {
3195 #if LOG_POLY_DEGREE == 5
3196 2.88539008148777786488L,
3197 0.961796878841293367824L,
3198 0.577058946784739859012L,
3199 0.412914355135828735411L,
3200 0.308591899232910175289L,
3201 0.352376952300281371868L,
3202 #elif LOG_POLY_DEGREE == 4
3203 2.88539009343309178325L,
3204 0.961791550404184197881L,
3205 0.577440339438736392009L,
3206 0.403343858251329912514L,
3207 0.406718052498846252698L,
3208 #elif LOG_POLY_DEGREE == 3
3209 2.88538959748872753838L,
3210 0.961932915889597772928L,
3211 0.571118517972136195241L,
3212 0.493997535084709500285L,
3213 #else
3214 #error
3215 #endif
3216 };
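/*
 * Background for the fit above (standard identity, not specific to this
 * table): for m in [1, 2) and y = (m - 1)/(m + 1) (so y^2 < 1/9),
 *   log2(m) = (2/ln 2) * atanh(y) = (2/ln 2) * (y + y^3/3 + y^5/5 + ...)
 *           ~ y * P(y^2)
 * which is why the first coefficient is close to 2/ln 2 = 2.885390...
 * lp_build_log2_approx() below evaluates exactly this with z = y^2.
 */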
3217
3218 /**
3219 * See http://www.devmaster.net/forums/showthread.php?p=43580
3220 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3221 * http://www.nezumi.demon.co.uk/consult/logx.htm
3222 *
3223 * If handle_edge_cases is true the function will perform computations
3224 * to match the required D3D10+ behavior for each of the edge cases.
3225 * That means that if input is:
3226  * - less than zero (down to and including -inf), then NaN will be returned
3227 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3228 * - +infinity, then +infinity will be returned
3229 * - NaN, then NaN will be returned
3230 *
3231 * Those checks are fairly expensive so if you don't need them make sure
3232 * handle_edge_cases is false.
3233 */
3234 void
3235 lp_build_log2_approx(struct lp_build_context *bld,
3236 LLVMValueRef x,
3237 LLVMValueRef *p_exp,
3238 LLVMValueRef *p_floor_log2,
3239 LLVMValueRef *p_log2,
3240 boolean handle_edge_cases)
3241 {
3242 LLVMBuilderRef builder = bld->gallivm->builder;
3243 const struct lp_type type = bld->type;
3244 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3245 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3246
3247 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3248 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3249 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3250
3251 LLVMValueRef i = NULL;
3252 LLVMValueRef y = NULL;
3253 LLVMValueRef z = NULL;
3254 LLVMValueRef exp = NULL;
3255 LLVMValueRef mant = NULL;
3256 LLVMValueRef logexp = NULL;
3257 LLVMValueRef logmant = NULL;
3258 LLVMValueRef res = NULL;
3259
3260 assert(lp_check_value(bld->type, x));
3261
3262 if(p_exp || p_floor_log2 || p_log2) {
3263 /* TODO: optimize the constant case */
3264 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3265 LLVMIsConstant(x)) {
3266 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3267 __FUNCTION__);
3268 }
3269
3270 assert(type.floating && type.width == 32);
3271
3272 /*
3273 * We don't explicitly handle denormalized numbers. They will yield a
3274        * result in the neighbourhood of -127, which appears to be
3275        * adequate.
3276 */
3277
3278 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3279
3280 /* exp = (float) exponent(x) */
3281 exp = LLVMBuildAnd(builder, i, expmask, "");
3282 }
3283
3284 if(p_floor_log2 || p_log2) {
3285 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3286 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3287 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3288 }
3289
3290 if(p_log2) {
3291 /* mant = 1 + (float) mantissa(x) */
3292 mant = LLVMBuildAnd(builder, i, mantmask, "");
3293 mant = LLVMBuildOr(builder, mant, one, "");
3294 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3295
3296 /* y = (mant - 1) / (mant + 1) */
3297 y = lp_build_div(bld,
3298 lp_build_sub(bld, mant, bld->one),
3299 lp_build_add(bld, mant, bld->one)
3300 );
3301
3302 /* z = y^2 */
3303 z = lp_build_mul(bld, y, y);
3304
3305 /* compute P(z) */
3306 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3307 Elements(lp_build_log2_polynomial));
3308
3309 /* logmant = y * P(z) */
3310 logmant = lp_build_mul(bld, y, logmant);
3311
3312 res = lp_build_add(bld, logmant, logexp);
3313
3314 if (type.floating && handle_edge_cases) {
3315 LLVMValueRef negmask, infmask, zmask;
3316 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3317 lp_build_const_vec(bld->gallivm, type, 0.0f));
3318 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3319 lp_build_const_vec(bld->gallivm, type, 0.0f));
3320 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3321 lp_build_const_vec(bld->gallivm, type, INFINITY));
3322
3323          /* If x is equal to inf, make sure we return inf */
3324 res = lp_build_select(bld, infmask,
3325 lp_build_const_vec(bld->gallivm, type, INFINITY),
3326 res);
3327          /* If x is equal to 0, return -inf */
3328 res = lp_build_select(bld, zmask,
3329 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3330 res);
3331 /* If x is nan or less than 0, return nan */
3332 res = lp_build_select(bld, negmask,
3333 lp_build_const_vec(bld->gallivm, type, NAN),
3334 res);
3335 }
3336 }
3337
3338 if(p_exp) {
3339 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3340 *p_exp = exp;
3341 }
3342
3343 if(p_floor_log2)
3344 *p_floor_log2 = logexp;
3345
3346 if(p_log2)
3347 *p_log2 = res;
3348 }
3349
3350
3351 /*
3352 * log2 implementation which doesn't have special code to
3353 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3354 * the results for those cases are undefined.
3355 */
3356 LLVMValueRef
3357 lp_build_log2(struct lp_build_context *bld,
3358 LLVMValueRef x)
3359 {
3360 LLVMValueRef res;
3361 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3362 return res;
3363 }
3364
3365 /*
3366 * Version of log2 which handles all edge cases.
3367 * Look at documentation of lp_build_log2_approx for
3368 * description of the behavior for each of the edge cases.
3369 */
3370 LLVMValueRef
3371 lp_build_log2_safe(struct lp_build_context *bld,
3372 LLVMValueRef x)
3373 {
3374 LLVMValueRef res;
3375 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3376 return res;
3377 }
3378
3379
3380 /**
3381 * Faster (and less accurate) log2.
3382 *
3383 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3384 *
3385 * Piece-wise linear approximation, with exact results when x is a
3386 * power of two.
3387 *
3388 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3389 */
3390 LLVMValueRef
3391 lp_build_fast_log2(struct lp_build_context *bld,
3392 LLVMValueRef x)
3393 {
3394 LLVMBuilderRef builder = bld->gallivm->builder;
3395 LLVMValueRef ipart;
3396 LLVMValueRef fpart;
3397
3398 assert(lp_check_value(bld->type, x));
3399
3400 assert(bld->type.floating);
3401
3402 /* ipart = floor(log2(x)) - 1 */
3403 ipart = lp_build_extract_exponent(bld, x, -1);
3404 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3405
3406 /* fpart = x / 2**ipart */
3407 fpart = lp_build_extract_mantissa(bld, x);
3408
3409 /* ipart + fpart */
3410 return LLVMBuildFAdd(builder, ipart, fpart, "");
3411 }
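/*
 * Worked example for lp_build_fast_log2 above: x = 6.0 gives
 * ipart = floor(log2(6)) - 1 = 1 and fpart = 6/2^2 = 1.5, so the result
 * is 2.5 versus the exact log2(6) ~ 2.585; for x = 8.0 it is
 * (3 - 1) + 1.0 = 3.0, exact as promised for powers of two.
 */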
3412
3413
3414 /**
3415 * Fast implementation of iround(log2(x)).
3416 *
3417 * Not an approximation -- it should give accurate results all the time.
3418 */
3419 LLVMValueRef
3420 lp_build_ilog2(struct lp_build_context *bld,
3421 LLVMValueRef x)
3422 {
3423 LLVMBuilderRef builder = bld->gallivm->builder;
3424 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3425 LLVMValueRef ipart;
3426
3427 assert(bld->type.floating);
3428
3429 assert(lp_check_value(bld->type, x));
3430
3431 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3432 x = LLVMBuildFMul(builder, x, sqrt2, "");
3433
3434 /* ipart = floor(log2(x) + 0.5) */
3435 ipart = lp_build_extract_exponent(bld, x, 0);
3436
3437 return ipart;
3438 }
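/*
 * Worked example for lp_build_ilog2 above: x = 5.0 -> 5*sqrt(2) ~ 7.07,
 * whose exponent is 2 = round(log2(5)) = round(2.32); x = 6.0 ->
 * 6*sqrt(2) ~ 8.49, whose exponent is 3 = round(log2(6)) = round(2.58).
 * The sqrt(2) factor shifts the exponent boundaries onto the rounding
 * midpoints of log2(x).
 */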
3439
3440 LLVMValueRef
3441 lp_build_mod(struct lp_build_context *bld,
3442 LLVMValueRef x,
3443 LLVMValueRef y)
3444 {
3445 LLVMBuilderRef builder = bld->gallivm->builder;
3446 LLVMValueRef res;
3447 const struct lp_type type = bld->type;
3448
3449 assert(lp_check_value(type, x));
3450 assert(lp_check_value(type, y));
3451
3452 if (type.floating)
3453 res = LLVMBuildFRem(builder, x, y, "");
3454 else if (type.sign)
3455 res = LLVMBuildSRem(builder, x, y, "");
3456 else
3457 res = LLVMBuildURem(builder, x, y, "");
3458 return res;
3459 }
3460
3461
3462 /*
3463 * For floating inputs it creates and returns a mask
3464 * which is all 1's for channels which are NaN.
3465 * Channels inside x which are not NaN will be 0.
3466 */
3467 LLVMValueRef
3468 lp_build_isnan(struct lp_build_context *bld,
3469 LLVMValueRef x)
3470 {
3471 LLVMValueRef mask;
3472 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3473
3474 assert(bld->type.floating);
3475 assert(lp_check_value(bld->type, x));
3476
3477 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3478 "isnotnan");
3479 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3480 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3481 return mask;
3482 }
3483
3484 /* Returns all 1's for floating point numbers that are
3485  * finite, and all zeros for -inf,
3486  * +inf and NaNs. */
3487 LLVMValueRef
3488 lp_build_isfinite(struct lp_build_context *bld,
3489 LLVMValueRef x)
3490 {
3491 LLVMBuilderRef builder = bld->gallivm->builder;
3492 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3493 struct lp_type int_type = lp_int_type(bld->type);
3494 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3495 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3496 0x7f800000);
3497
3498 if (!bld->type.floating) {
3499 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3500 }
3501 assert(bld->type.floating);
3502 assert(lp_check_value(bld->type, x));
3503 assert(bld->type.width == 32);
3504
3505 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3506 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3507 intx, infornan32);
3508 }
3509
3510 /*
3511 * Returns true if the number is nan or inf and false otherwise.
3512 * The input has to be a floating point vector.
3513 */
3514 LLVMValueRef
3515 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3516 const struct lp_type type,
3517 LLVMValueRef x)
3518 {
3519 LLVMBuilderRef builder = gallivm->builder;
3520 struct lp_type int_type = lp_int_type(type);
3521 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3522 0x7f800000);
3523 LLVMValueRef ret;
3524
3525 assert(type.floating);
3526
3527 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3528 ret = LLVMBuildAnd(builder, ret, const0, "");
3529 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3530 ret, const0);
3531
3532 return ret;
3533 }
3534
3535
3536 LLVMValueRef
3537 lp_build_fpstate_get(struct gallivm_state *gallivm)
3538 {
3539 if (util_cpu_caps.has_sse) {
3540 LLVMBuilderRef builder = gallivm->builder;
3541 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3542 gallivm,
3543 LLVMInt32TypeInContext(gallivm->context),
3544 "mxcsr_ptr");
3545 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3546 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3547 lp_build_intrinsic(builder,
3548 "llvm.x86.sse.stmxcsr",
3549 LLVMVoidTypeInContext(gallivm->context),
3550 &mxcsr_ptr8, 1, 0);
3551 return mxcsr_ptr;
3552 }
3553 return 0;
3554 }
3555
3556 void
3557 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3558 boolean zero)
3559 {
3560 if (util_cpu_caps.has_sse) {
3561 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3562 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3563
3564 LLVMBuilderRef builder = gallivm->builder;
3565 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3566 LLVMValueRef mxcsr =
3567 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3568
3569 if (util_cpu_caps.has_daz) {
3570          /* Enable denormals-are-zero mode */
3571 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3572 }
3573 if (zero) {
3574 mxcsr = LLVMBuildOr(builder, mxcsr,
3575 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3576 } else {
3577 mxcsr = LLVMBuildAnd(builder, mxcsr,
3578 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3579 }
3580
3581 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3582 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3583 }
3584 }
3585
3586 void
3587 lp_build_fpstate_set(struct gallivm_state *gallivm,
3588 LLVMValueRef mxcsr_ptr)
3589 {
3590 if (util_cpu_caps.has_sse) {
3591 LLVMBuilderRef builder = gallivm->builder;
3592 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3593 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3594 lp_build_intrinsic(builder,
3595 "llvm.x86.sse.ldmxcsr",
3596 LLVMVoidTypeInContext(gallivm->context),
3597 &mxcsr_ptr, 1, 0);
3598 }
3599 }
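/*
 * Typical usage sketch for the fpstate helpers above (illustrative only):
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that benefits from DAZ/FTZ ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 *
 * i.e. save the current MXCSR, force denormals to zero around the
 * generated code, then restore the previous state.
 */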