gallivm: eliminate an unnecessary AND with unorm lerps
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86 * No checks for special case values of a or b = 1 or 0 are done.
87 * NaN's are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if (intrinsic) {
198 /* We need to handle nan's for floating point numbers. If one of the
199 * inputs is nan the other should be returned (required by both D3D10+
200 * and OpenCL).
201 * The sse intrinsics return the second operand in case of nan by
202 * default so we need special code to handle those.
203 */
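/*
 * (Editor's illustration: with the raw sse min intrinsic, min(NaN, x) yields x
 * but min(x, NaN) yields NaN, so only one of the two NaN operand cases needs
 * the extra isnan/select fixup below.)
 */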
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
264
265 /**
266 * Generate max(a, b)
267 * No checks for special case values of a or b = 1 or 0 are done.
268 * NaN's are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323 if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if (intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, max;
385 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, max);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, max);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
476
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if (intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 if(type.norm && !type.floating && !type.fixed) {
526 if (type.sign) {
527 uint64_t sign = (uint64_t)1 << (type.width - 1);
528 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
529 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
532 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
534 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
535 } else {
536 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
537 }
538 }
539
540 if(LLVMIsConstant(a) && LLVMIsConstant(b))
541 if (type.floating)
542 res = LLVMConstFAdd(a, b);
543 else
544 res = LLVMConstAdd(a, b);
545 else
546 if (type.floating)
547 res = LLVMBuildFAdd(builder, a, b, "");
548 else
549 res = LLVMBuildAdd(builder, a, b, "");
550
551 /* clamp to ceiling of 1.0 */
552 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
553 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
554
555 /* XXX clamp to floor of -1 or 0??? */
556
557 return res;
558 }
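
/*
 * Editor's note: an illustrative scalar sketch (not part of the build) of the
 * clamp-then-add scheme lp_build_add uses above for signed normalized types
 * when no saturating intrinsic is available, shown for 16-bit values.
 */
#if 0
static inline int
sadd_sat16_ref(int a, int b)   /* a, b assumed in [-32768, 32767] */
{
   if (b > 0 && a > 32767 - b)
      a = 32767 - b;    /* a_clamp_max: largest a that cannot overflow */
   if (b < 0 && a < -32768 - b)
      a = -32768 - b;   /* a_clamp_min: smallest a that cannot underflow */
   return a + b;        /* the plain add can no longer wrap */
}
#endif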
559
560
561 /** Return the scalar sum of the elements of a.
562 * Should avoid this operation whenever possible.
563 */
564 LLVMValueRef
565 lp_build_horizontal_add(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 LLVMBuilderRef builder = bld->gallivm->builder;
569 const struct lp_type type = bld->type;
570 LLVMValueRef index, res;
571 unsigned i, length;
572 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
573 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
574 LLVMValueRef vecres, elem2;
575
576 assert(lp_check_value(type, a));
577
578 if (type.length == 1) {
579 return a;
580 }
581
582 assert(!bld->type.norm);
583
584 /*
585 * for byte vectors we could do much better with psadbw.
586 * Using repeated shuffle/adds here. Note with multiple vectors
587 * this can be done more efficiently as outlined in the intel
588 * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
590 * sizes.
591 */
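/*
 * (Editor's illustration: for a 4-wide vector [a b c d] the loop below forms
 * [a b] + [c d] = [a+c b+d], and the final two extracts add (a+c) + (b+d).)
 */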
592
593 vecres = a;
594 length = type.length / 2;
595 while (length > 1) {
596 LLVMValueRef vec1, vec2;
597 for (i = 0; i < length; i++) {
598 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
599 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
600 }
601 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
602 LLVMConstVector(shuffles1, length), "");
603 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
604 LLVMConstVector(shuffles2, length), "");
605 if (type.floating) {
606 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
607 }
608 else {
609 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
610 }
611 length = length >> 1;
612 }
613
614 /* always have vector of size 2 here */
615 assert(length == 1);
616
617 index = lp_build_const_int32(bld->gallivm, 0);
618 res = LLVMBuildExtractElement(builder, vecres, index, "");
619 index = lp_build_const_int32(bld->gallivm, 1);
620 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
621
622 if (type.floating)
623 res = LLVMBuildFAdd(builder, res, elem2, "");
624 else
625 res = LLVMBuildAdd(builder, res, elem2, "");
626
627 return res;
628 }
629
630 /**
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632 * This uses the technique as outlined in Intel Optimization Manual.
633 */
634 static LLVMValueRef
635 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
636 LLVMValueRef src[4])
637 {
638 struct gallivm_state *gallivm = bld->gallivm;
639 LLVMBuilderRef builder = gallivm->builder;
640 LLVMValueRef shuffles[4];
641 LLVMValueRef tmp[4];
642 LLVMValueRef sumtmp[2], shuftmp[2];
643
644 /* lower half of regs */
645 shuffles[0] = lp_build_const_int32(gallivm, 0);
646 shuffles[1] = lp_build_const_int32(gallivm, 1);
647 shuffles[2] = lp_build_const_int32(gallivm, 4);
648 shuffles[3] = lp_build_const_int32(gallivm, 5);
649 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
650 LLVMConstVector(shuffles, 4), "");
651 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
652 LLVMConstVector(shuffles, 4), "");
653
654 /* upper half of regs */
655 shuffles[0] = lp_build_const_int32(gallivm, 2);
656 shuffles[1] = lp_build_const_int32(gallivm, 3);
657 shuffles[2] = lp_build_const_int32(gallivm, 6);
658 shuffles[3] = lp_build_const_int32(gallivm, 7);
659 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
660 LLVMConstVector(shuffles, 4), "");
661 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
662 LLVMConstVector(shuffles, 4), "");
663
664 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
665 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 0);
668 shuffles[1] = lp_build_const_int32(gallivm, 2);
669 shuffles[2] = lp_build_const_int32(gallivm, 4);
670 shuffles[3] = lp_build_const_int32(gallivm, 6);
671 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 shuffles[0] = lp_build_const_int32(gallivm, 1);
675 shuffles[1] = lp_build_const_int32(gallivm, 3);
676 shuffles[2] = lp_build_const_int32(gallivm, 5);
677 shuffles[3] = lp_build_const_int32(gallivm, 7);
678 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
679 LLVMConstVector(shuffles, 4), "");
680
681 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
682 }
683
684
685 /*
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
689 * output order.
690 *
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
697 */
698 LLVMValueRef
699 lp_build_hadd_partial4(struct lp_build_context *bld,
700 LLVMValueRef vectors[],
701 unsigned num_vecs)
702 {
703 struct gallivm_state *gallivm = bld->gallivm;
704 LLVMBuilderRef builder = gallivm->builder;
705 LLVMValueRef ret_vec;
706 LLVMValueRef tmp[4];
707 const char *intrinsic = NULL;
708
709 assert(num_vecs >= 2 && num_vecs <= 4);
710 assert(bld->type.floating);
711
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
715 */
716
717 tmp[0] = vectors[0];
718 tmp[1] = vectors[1];
719
720 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
721 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
722
723 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
724 bld->type.length == 4) {
725 intrinsic = "llvm.x86.sse3.hadd.ps";
726 }
727 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
728 bld->type.length == 8) {
729 intrinsic = "llvm.x86.avx.hadd.ps.256";
730 }
731 if (intrinsic) {
732 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 if (num_vecs > 2) {
736 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[2], tmp[3]);
739 }
740 else {
741 tmp[1] = tmp[0];
742 }
743 return lp_build_intrinsic_binary(builder, intrinsic,
744 lp_build_vec_type(gallivm, bld->type),
745 tmp[0], tmp[1]);
746 }
747
748 if (bld->type.length == 4) {
749 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
750 }
751 else {
752 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
753 unsigned j;
754 unsigned num_iter = bld->type.length / 4;
755 struct lp_type parttype = bld->type;
756 parttype.length = 4;
757 for (j = 0; j < num_iter; j++) {
758 LLVMValueRef partsrc[4];
759 unsigned i;
760 for (i = 0; i < 4; i++) {
761 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
762 }
763 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
764 }
765 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
766 }
767 return ret_vec;
768 }
769
770 /**
771 * Generate a - b
772 */
773 LLVMValueRef
774 lp_build_sub(struct lp_build_context *bld,
775 LLVMValueRef a,
776 LLVMValueRef b)
777 {
778 LLVMBuilderRef builder = bld->gallivm->builder;
779 const struct lp_type type = bld->type;
780 LLVMValueRef res;
781
782 assert(lp_check_value(type, a));
783 assert(lp_check_value(type, b));
784
785 if(b == bld->zero)
786 return a;
787 if(a == bld->undef || b == bld->undef)
788 return bld->undef;
789 if(a == b)
790 return bld->zero;
791
792 if(bld->type.norm) {
793 const char *intrinsic = NULL;
794
795 if(b == bld->one)
796 return bld->zero;
797
798 if (type.width * type.length == 128 &&
799 !type.floating && !type.fixed) {
800 if (util_cpu_caps.has_sse2) {
801 if(type.width == 8)
802 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
803 if(type.width == 16)
804 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps.has_altivec) {
806 if(type.width == 8)
807 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
808 if(type.width == 16)
809 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
810 }
811 }
812
813 if (intrinsic)
814 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
815 }
816
817 if(type.norm && !type.floating && !type.fixed) {
818 if (type.sign) {
819 uint64_t sign = (uint64_t)1 << (type.width - 1);
820 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
821 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
825 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
826 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
827 } else {
828 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
829 }
830 }
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b))
833 if (type.floating)
834 res = LLVMConstFSub(a, b);
835 else
836 res = LLVMConstSub(a, b);
837 else
838 if (type.floating)
839 res = LLVMBuildFSub(builder, a, b, "");
840 else
841 res = LLVMBuildSub(builder, a, b, "");
842
843 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
844 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
845
846 return res;
847 }
848
849
850
851 /**
852 * Normalized multiplication.
853 *
854 * There are several approaches for (using 8-bit normalized multiplication as
855 * an example):
856 *
857 * - alpha plus one
858 *
859 * makes the following approximation to the division (Sree)
860 *
861 * a*b/255 ~= (a*(b + 1)) >> 8
862 *
863 * which is the fastest method that satisfies the following OpenGL criteria of
864 *
865 * 0*0 = 0 and 255*255 = 255
866 *
867 * - geometric series
868 *
869 * takes the geometric series approximation to the division
870 *
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
872 *
873 * in this case just the first two terms to fit in 16bit arithmetic
874 *
875 * t/255 ~= (t + (t >> 8)) >> 8
876 *
877 * note that just by itself it doesn't satisfy the OpenGL criteria, as
878 * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
879 * must be used.
880 *
881 * - geometric series plus rounding
882 *
883 * when using a geometric series division, instead of truncating the result,
884 * use roundoff in the approximation (Jim Blinn):
885 *
886 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
887 *
888 * achieving exact results.
889 *
890 *
891 *
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
896 */
897 static LLVMValueRef
898 lp_build_mul_norm(struct gallivm_state *gallivm,
899 struct lp_type wide_type,
900 LLVMValueRef a, LLVMValueRef b)
901 {
902 LLVMBuilderRef builder = gallivm->builder;
903 struct lp_build_context bld;
904 unsigned n;
905 LLVMValueRef half;
906 LLVMValueRef ab;
907
908 assert(!wide_type.floating);
909 assert(lp_check_value(wide_type, a));
910 assert(lp_check_value(wide_type, b));
911
912 lp_build_context_init(&bld, gallivm, wide_type);
913
914 n = wide_type.width / 2;
915 if (wide_type.sign) {
916 --n;
917 }
918
919 /*
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
922 */
923
924 /*
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
926 */
927
928 ab = LLVMBuildMul(builder, a, b, "");
929 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
930
931 /*
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
933 */
934
935 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
936 if (wide_type.sign) {
937 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
938 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
939 half = lp_build_select(&bld, sign, minus_half, half);
940 }
941 ab = LLVMBuildAdd(builder, ab, half, "");
942
943 /* Final division */
944 ab = lp_build_shr_imm(&bld, ab, n);
945
946 return ab;
947 }
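
/*
 * Editor's note: an illustrative scalar sketch (not part of the build) of the
 * geometric-series-plus-rounding scheme that lp_build_mul_norm above emits,
 * shown for the unsigned 8-bit case (n = 8).
 */
#if 0
static inline unsigned
mul_unorm8_ref(unsigned a, unsigned b)   /* a, b in [0, 255] */
{
   unsigned t = a * b;   /* intermediate product, fits in 16 bits */
   t += t >> 8;          /* geometric series: t/255 ~= (t + (t >> 8)) >> 8 */
   t += 0x80;            /* roundoff term (Jim Blinn) */
   return t >> 8;        /* final division by 2^8 */
}
#endif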
948
949 /**
950 * Generate a * b
951 */
952 LLVMValueRef
953 lp_build_mul(struct lp_build_context *bld,
954 LLVMValueRef a,
955 LLVMValueRef b)
956 {
957 LLVMBuilderRef builder = bld->gallivm->builder;
958 const struct lp_type type = bld->type;
959 LLVMValueRef shift;
960 LLVMValueRef res;
961
962 assert(lp_check_value(type, a));
963 assert(lp_check_value(type, b));
964
965 if(a == bld->zero)
966 return bld->zero;
967 if(a == bld->one)
968 return b;
969 if(b == bld->zero)
970 return bld->zero;
971 if(b == bld->one)
972 return a;
973 if(a == bld->undef || b == bld->undef)
974 return bld->undef;
975
976 if (!type.floating && !type.fixed && type.norm) {
977 struct lp_type wide_type = lp_wider_type(type);
978 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
979
980 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
981 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
982
983 /* PMULLW, PSRLW, PADDW */
984 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
985 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
986
987 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
988
989 return ab;
990 }
991
992 if(type.fixed)
993 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
994 else
995 shift = NULL;
996
997 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
998 if (type.floating)
999 res = LLVMConstFMul(a, b);
1000 else
1001 res = LLVMConstMul(a, b);
1002 if(shift) {
1003 if(type.sign)
1004 res = LLVMConstAShr(res, shift);
1005 else
1006 res = LLVMConstLShr(res, shift);
1007 }
1008 }
1009 else {
1010 if (type.floating)
1011 res = LLVMBuildFMul(builder, a, b, "");
1012 else
1013 res = LLVMBuildMul(builder, a, b, "");
1014 if(shift) {
1015 if(type.sign)
1016 res = LLVMBuildAShr(builder, res, shift, "");
1017 else
1018 res = LLVMBuildLShr(builder, res, shift, "");
1019 }
1020 }
1021
1022 return res;
1023 }
1024
1025
1026 /**
1027 * Small vector x scale multiplication optimization.
1028 */
1029 LLVMValueRef
1030 lp_build_mul_imm(struct lp_build_context *bld,
1031 LLVMValueRef a,
1032 int b)
1033 {
1034 LLVMBuilderRef builder = bld->gallivm->builder;
1035 LLVMValueRef factor;
1036
1037 assert(lp_check_value(bld->type, a));
1038
1039 if(b == 0)
1040 return bld->zero;
1041
1042 if(b == 1)
1043 return a;
1044
1045 if(b == -1)
1046 return lp_build_negate(bld, a);
1047
1048 if(b == 2 && bld->type.floating)
1049 return lp_build_add(bld, a, a);
1050
1051 if(util_is_power_of_two(b)) {
1052 unsigned shift = ffs(b) - 1;
1053
1054 if(bld->type.floating) {
1055 #if 0
1056 /*
1057 * Power of two multiplication by directly manipulating the exponent.
1058 *
1059 * XXX: This might not always be faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1061 * for Inf and NaN.
1062 */
1063 unsigned mantissa = lp_mantissa(bld->type);
1064 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1065 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1066 a = LLVMBuildAdd(builder, a, factor, "");
1067 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1068 return a;
1069 #endif
1070 }
1071 else {
1072 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1073 return LLVMBuildShl(builder, a, factor, "");
1074 }
1075 }
1076
1077 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1078 return lp_build_mul(bld, a, factor);
1079 }
1080
1081
1082 /**
1083 * Generate a / b
1084 */
1085 LLVMValueRef
1086 lp_build_div(struct lp_build_context *bld,
1087 LLVMValueRef a,
1088 LLVMValueRef b)
1089 {
1090 LLVMBuilderRef builder = bld->gallivm->builder;
1091 const struct lp_type type = bld->type;
1092
1093 assert(lp_check_value(type, a));
1094 assert(lp_check_value(type, b));
1095
1096 if(a == bld->zero)
1097 return bld->zero;
1098 if(a == bld->one && type.floating)
1099 return lp_build_rcp(bld, b);
1100 if(b == bld->zero)
1101 return bld->undef;
1102 if(b == bld->one)
1103 return a;
1104 if(a == bld->undef || b == bld->undef)
1105 return bld->undef;
1106
1107 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1108 if (type.floating)
1109 return LLVMConstFDiv(a, b);
1110 else if (type.sign)
1111 return LLVMConstSDiv(a, b);
1112 else
1113 return LLVMConstUDiv(a, b);
1114 }
1115
1116 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1117 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1118 type.floating)
1119 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1120
1121 if (type.floating)
1122 return LLVMBuildFDiv(builder, a, b, "");
1123 else if (type.sign)
1124 return LLVMBuildSDiv(builder, a, b, "");
1125 else
1126 return LLVMBuildUDiv(builder, a, b, "");
1127 }
1128
1129
1130 /**
1131 * Linear interpolation helper.
1132 *
1133 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1134 * interpolating normalized values encoded in integers twice their natural width.
1135 *
1136 * @sa http://www.stereopsis.com/doubleblend.html
1137 */
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context *bld,
1140 LLVMValueRef x,
1141 LLVMValueRef v0,
1142 LLVMValueRef v1,
1143 unsigned flags)
1144 {
1145 unsigned half_width = bld->type.width/2;
1146 LLVMBuilderRef builder = bld->gallivm->builder;
1147 LLVMValueRef delta;
1148 LLVMValueRef res;
1149
1150 assert(lp_check_value(bld->type, x));
1151 assert(lp_check_value(bld->type, v0));
1152 assert(lp_check_value(bld->type, v1));
1153
1154 delta = lp_build_sub(bld, v1, v0);
1155
1156 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1157 if (!bld->type.sign) {
1158 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1159 /*
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161 * most significant bit to the least significant bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1163 */
1164
1165 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1166 }
1167
1168 /* (x * delta) >> n */
1169 res = lp_build_mul(bld, x, delta);
1170 res = lp_build_shr_imm(bld, res, half_width);
1171 } else {
1172 /*
1173 * The rescaling trick above doesn't work for signed numbers, so
1174 * use the 2**n - 1 division approximation in lp_build_mul_norm
1175 * instead.
1176 */
1177 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1178 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1179 }
1180 } else {
1181 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1182 res = lp_build_mul(bld, x, delta);
1183 }
1184
1185 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1186 /*
1187 * At this point both res and v0 only use the lower half of the bits,
1188 * the rest is zero. Instead of add / mask, do add with half wide type.
1189 */
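/*
 * (Editor's note: the carry out of each narrow lane is simply dropped by the
 * lane-wise add, which is what an explicit AND with (1 << half_width) - 1
 * would otherwise have to do.)
 */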
1190 struct lp_type narrow_type;
1191 struct lp_build_context narrow_bld;
1192
1193 memset(&narrow_type, 0, sizeof narrow_type);
1194 narrow_type.sign = bld->type.sign;
1195 narrow_type.width = bld->type.width/2;
1196 narrow_type.length = bld->type.length*2;
1197
1198 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1199 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1200 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1201 res = lp_build_add(&narrow_bld, v0, res);
1202 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1203 } else {
1204 res = lp_build_add(bld, v0, res);
1205
1206 if (bld->type.fixed) {
1207 /*
1208 * We need to mask out the high order bits when lerping 8bit
1209 * normalized colors stored in 16 bits.
1210 */
1211 /* XXX: This step is necessary for lerping 8bit colors stored in
1212 * 16 bits, but it will be wrong for true fixed point use cases.
1213 * Basically we need a more powerful lp_type, capable of further
1214 * distinguishing the values interpretation from the value storage.
1215 */
1216 LLVMValueRef low_bits;
1217 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1218 res = LLVMBuildAnd(builder, res, low_bits, "");
1219 }
1220 }
1221
1222 return res;
1223 }
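
/*
 * Editor's note: an illustrative scalar equivalent (not part of the build) of
 * the unsigned normalized path in lp_build_lerp_simple above, for 8-bit
 * values held in wider lanes with weights that are not prescaled.
 */
#if 0
static inline unsigned
lerp_unorm8_ref(unsigned x, unsigned v0, unsigned v1)   /* all in [0, 255] */
{
   x += x >> 7;   /* rescale the weight from [0, 255] to [0, 256] */
   /* algebraically the same as v0 + ((x * (v1 - v0)) >> 8) built above */
   return (v0 * (256 - x) + v1 * x) >> 8;
}
#endif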
1224
1225
1226 /**
1227 * Linear interpolation.
1228 */
1229 LLVMValueRef
1230 lp_build_lerp(struct lp_build_context *bld,
1231 LLVMValueRef x,
1232 LLVMValueRef v0,
1233 LLVMValueRef v1,
1234 unsigned flags)
1235 {
1236 const struct lp_type type = bld->type;
1237 LLVMValueRef res;
1238
1239 assert(lp_check_value(type, x));
1240 assert(lp_check_value(type, v0));
1241 assert(lp_check_value(type, v1));
1242
1243 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1244
1245 if (type.norm) {
1246 struct lp_type wide_type;
1247 struct lp_build_context wide_bld;
1248 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1249
1250 assert(type.length >= 2);
1251
1252 /*
1253 * Create a wider integer type, enough to hold the
1254 * intermediate result of the multiplication.
1255 */
1256 memset(&wide_type, 0, sizeof wide_type);
1257 wide_type.sign = type.sign;
1258 wide_type.width = type.width*2;
1259 wide_type.length = type.length/2;
1260
1261 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1262
1263 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1264 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1265 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1266
1267 /*
1268 * Lerp both halves.
1269 */
1270
1271 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1272
1273 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1274 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1275
1276 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1277 } else {
1278 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1279 }
1280
1281 return res;
1282 }
1283
1284
1285 /**
1286 * Bilinear interpolation.
1287 *
1288 * Value indices are in v_{yx}.
1289 */
1290 LLVMValueRef
1291 lp_build_lerp_2d(struct lp_build_context *bld,
1292 LLVMValueRef x,
1293 LLVMValueRef y,
1294 LLVMValueRef v00,
1295 LLVMValueRef v01,
1296 LLVMValueRef v10,
1297 LLVMValueRef v11,
1298 unsigned flags)
1299 {
1300 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1301 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1302 return lp_build_lerp(bld, y, v0, v1, flags);
1303 }
1304
1305
1306 LLVMValueRef
1307 lp_build_lerp_3d(struct lp_build_context *bld,
1308 LLVMValueRef x,
1309 LLVMValueRef y,
1310 LLVMValueRef z,
1311 LLVMValueRef v000,
1312 LLVMValueRef v001,
1313 LLVMValueRef v010,
1314 LLVMValueRef v011,
1315 LLVMValueRef v100,
1316 LLVMValueRef v101,
1317 LLVMValueRef v110,
1318 LLVMValueRef v111,
1319 unsigned flags)
1320 {
1321 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1322 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1323 return lp_build_lerp(bld, z, v0, v1, flags);
1324 }
1325
1326
1327 /**
1328 * Generate min(a, b)
1329 * Do checks for special cases but not for nans.
1330 */
1331 LLVMValueRef
1332 lp_build_min(struct lp_build_context *bld,
1333 LLVMValueRef a,
1334 LLVMValueRef b)
1335 {
1336 assert(lp_check_value(bld->type, a));
1337 assert(lp_check_value(bld->type, b));
1338
1339 if(a == bld->undef || b == bld->undef)
1340 return bld->undef;
1341
1342 if(a == b)
1343 return a;
1344
1345 if (bld->type.norm) {
1346 if (!bld->type.sign) {
1347 if (a == bld->zero || b == bld->zero) {
1348 return bld->zero;
1349 }
1350 }
1351 if(a == bld->one)
1352 return b;
1353 if(b == bld->one)
1354 return a;
1355 }
1356
1357 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1358 }
1359
1360
1361 /**
1362 * Generate min(a, b)
1363 * NaN's are handled according to the behavior specified by the
1364 * nan_behavior argument.
1365 */
1366 LLVMValueRef
1367 lp_build_min_ext(struct lp_build_context *bld,
1368 LLVMValueRef a,
1369 LLVMValueRef b,
1370 enum gallivm_nan_behavior nan_behavior)
1371 {
1372 assert(lp_check_value(bld->type, a));
1373 assert(lp_check_value(bld->type, b));
1374
1375 if(a == bld->undef || b == bld->undef)
1376 return bld->undef;
1377
1378 if(a == b)
1379 return a;
1380
1381 if (bld->type.norm) {
1382 if (!bld->type.sign) {
1383 if (a == bld->zero || b == bld->zero) {
1384 return bld->zero;
1385 }
1386 }
1387 if(a == bld->one)
1388 return b;
1389 if(b == bld->one)
1390 return a;
1391 }
1392
1393 return lp_build_min_simple(bld, a, b, nan_behavior);
1394 }
1395
1396 /**
1397 * Generate max(a, b)
1398 * Do checks for special cases, but NaN behavior is undefined.
1399 */
1400 LLVMValueRef
1401 lp_build_max(struct lp_build_context *bld,
1402 LLVMValueRef a,
1403 LLVMValueRef b)
1404 {
1405 assert(lp_check_value(bld->type, a));
1406 assert(lp_check_value(bld->type, b));
1407
1408 if(a == bld->undef || b == bld->undef)
1409 return bld->undef;
1410
1411 if(a == b)
1412 return a;
1413
1414 if(bld->type.norm) {
1415 if(a == bld->one || b == bld->one)
1416 return bld->one;
1417 if (!bld->type.sign) {
1418 if (a == bld->zero) {
1419 return b;
1420 }
1421 if (b == bld->zero) {
1422 return a;
1423 }
1424 }
1425 }
1426
1427 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1428 }
1429
1430
1431 /**
1432 * Generate max(a, b)
1433 * Checks for special cases.
1434 * NaN's are handled according to the behavior specified by the
1435 * nan_behavior argument.
1436 */
1437 LLVMValueRef
1438 lp_build_max_ext(struct lp_build_context *bld,
1439 LLVMValueRef a,
1440 LLVMValueRef b,
1441 enum gallivm_nan_behavior nan_behavior)
1442 {
1443 assert(lp_check_value(bld->type, a));
1444 assert(lp_check_value(bld->type, b));
1445
1446 if(a == bld->undef || b == bld->undef)
1447 return bld->undef;
1448
1449 if(a == b)
1450 return a;
1451
1452 if(bld->type.norm) {
1453 if(a == bld->one || b == bld->one)
1454 return bld->one;
1455 if (!bld->type.sign) {
1456 if (a == bld->zero) {
1457 return b;
1458 }
1459 if (b == bld->zero) {
1460 return a;
1461 }
1462 }
1463 }
1464
1465 return lp_build_max_simple(bld, a, b, nan_behavior);
1466 }
1467
1468 /**
1469 * Generate clamp(a, min, max)
1470 * NaN behavior (for any of a, min, max) is undefined.
1471 * Do checks for special cases.
1472 */
1473 LLVMValueRef
1474 lp_build_clamp(struct lp_build_context *bld,
1475 LLVMValueRef a,
1476 LLVMValueRef min,
1477 LLVMValueRef max)
1478 {
1479 assert(lp_check_value(bld->type, a));
1480 assert(lp_check_value(bld->type, min));
1481 assert(lp_check_value(bld->type, max));
1482
1483 a = lp_build_min(bld, a, max);
1484 a = lp_build_max(bld, a, min);
1485 return a;
1486 }
1487
1488
1489 /**
1490 * Generate clamp(a, 0, 1)
1491 * A NaN will get converted to zero.
1492 */
1493 LLVMValueRef
1494 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1495 LLVMValueRef a)
1496 {
1497 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1498 a = lp_build_min(bld, a, bld->one);
1499 return a;
1500 }
1501
1502
1503 /**
1504 * Generate abs(a)
1505 */
1506 LLVMValueRef
1507 lp_build_abs(struct lp_build_context *bld,
1508 LLVMValueRef a)
1509 {
1510 LLVMBuilderRef builder = bld->gallivm->builder;
1511 const struct lp_type type = bld->type;
1512 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1513
1514 assert(lp_check_value(type, a));
1515
1516 if(!type.sign)
1517 return a;
1518
1519 if(type.floating) {
1520 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1521 /* Workaround llvm.org/PR27332 */
1522 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1523 unsigned long long absMask = ~(1ULL << (type.width - 1));
1524 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1525 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1526 a = LLVMBuildAnd(builder, a, mask, "");
1527 a = LLVMBuildBitCast(builder, a, vec_type, "");
1528 return a;
1529 } else {
1530 char intrinsic[32];
1531 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1532 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1533 }
1534 }
1535
1536 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1537 switch(type.width) {
1538 case 8:
1539 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1540 case 16:
1541 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1542 case 32:
1543 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1544 }
1545 }
1546 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1547 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1548 (type.width == 8 || type.width == 16 || type.width == 32)) {
1549 debug_printf("%s: inefficient code, should split vectors manually\n",
1550 __FUNCTION__);
1551 }
1552
1553 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1554 }
1555
1556
1557 LLVMValueRef
1558 lp_build_negate(struct lp_build_context *bld,
1559 LLVMValueRef a)
1560 {
1561 LLVMBuilderRef builder = bld->gallivm->builder;
1562
1563 assert(lp_check_value(bld->type, a));
1564
1565 if (bld->type.floating)
1566 a = LLVMBuildFNeg(builder, a, "");
1567 else
1568 a = LLVMBuildNeg(builder, a, "");
1569
1570 return a;
1571 }
1572
1573
1574 /** Return -1, 0 or +1 depending on the sign of a */
1575 LLVMValueRef
1576 lp_build_sgn(struct lp_build_context *bld,
1577 LLVMValueRef a)
1578 {
1579 LLVMBuilderRef builder = bld->gallivm->builder;
1580 const struct lp_type type = bld->type;
1581 LLVMValueRef cond;
1582 LLVMValueRef res;
1583
1584 assert(lp_check_value(type, a));
1585
1586 /* Handle non-zero case */
1587 if(!type.sign) {
1588 /* if not zero then sign must be positive */
1589 res = bld->one;
1590 }
1591 else if(type.floating) {
1592 LLVMTypeRef vec_type;
1593 LLVMTypeRef int_type;
1594 LLVMValueRef mask;
1595 LLVMValueRef sign;
1596 LLVMValueRef one;
1597 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1598
1599 int_type = lp_build_int_vec_type(bld->gallivm, type);
1600 vec_type = lp_build_vec_type(bld->gallivm, type);
1601 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1602
1603 /* Take the sign bit and OR it into the constant 1.0 */
1604 sign = LLVMBuildBitCast(builder, a, int_type, "");
1605 sign = LLVMBuildAnd(builder, sign, mask, "");
1606 one = LLVMConstBitCast(bld->one, int_type);
1607 res = LLVMBuildOr(builder, sign, one, "");
1608 res = LLVMBuildBitCast(builder, res, vec_type, "");
1609 }
1610 else
1611 {
1612 /* signed int/norm/fixed point */
1613 /* could use psign with sse3 and appropriate vectors here */
1614 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1615 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1616 res = lp_build_select(bld, cond, bld->one, minus_one);
1617 }
1618
1619 /* Handle zero */
1620 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1621 res = lp_build_select(bld, cond, bld->zero, res);
1622
1623 return res;
1624 }
1625
1626
1627 /**
1628 * Set the sign of float vector 'a' according to 'sign'.
1629 * If sign==0, return abs(a).
1630 * If sign==1, return -abs(a);
1631 * Other values for sign produce undefined results.
1632 */
1633 LLVMValueRef
1634 lp_build_set_sign(struct lp_build_context *bld,
1635 LLVMValueRef a, LLVMValueRef sign)
1636 {
1637 LLVMBuilderRef builder = bld->gallivm->builder;
1638 const struct lp_type type = bld->type;
1639 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1640 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1641 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1642 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1643 ~((unsigned long long) 1 << (type.width - 1)));
1644 LLVMValueRef val, res;
1645
1646 assert(type.floating);
1647 assert(lp_check_value(type, a));
1648
1649 /* val = reinterpret_cast<int>(a) */
1650 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1651 /* val = val & mask */
1652 val = LLVMBuildAnd(builder, val, mask, "");
1653 /* sign = sign << shift */
1654 sign = LLVMBuildShl(builder, sign, shift, "");
1655 /* res = val | sign */
1656 res = LLVMBuildOr(builder, val, sign, "");
1657 /* res = reinterpret_cast<float>(res) */
1658 res = LLVMBuildBitCast(builder, res, vec_type, "");
1659
1660 return res;
1661 }
1662
1663
1664 /**
1665 * Convert vector of (or scalar) int to vector of (or scalar) float.
1666 */
1667 LLVMValueRef
1668 lp_build_int_to_float(struct lp_build_context *bld,
1669 LLVMValueRef a)
1670 {
1671 LLVMBuilderRef builder = bld->gallivm->builder;
1672 const struct lp_type type = bld->type;
1673 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1674
1675 assert(type.floating);
1676
1677 return LLVMBuildSIToFP(builder, a, vec_type, "");
1678 }
1679
1680 static boolean
1681 arch_rounding_available(const struct lp_type type)
1682 {
1683 if ((util_cpu_caps.has_sse4_1 &&
1684 (type.length == 1 || type.width*type.length == 128)) ||
1685 (util_cpu_caps.has_avx && type.width*type.length == 256))
1686 return TRUE;
1687 else if ((util_cpu_caps.has_altivec &&
1688 (type.width == 32 && type.length == 4)))
1689 return TRUE;
1690
1691 return FALSE;
1692 }
1693
1694 enum lp_build_round_mode
1695 {
1696 LP_BUILD_ROUND_NEAREST = 0,
1697 LP_BUILD_ROUND_FLOOR = 1,
1698 LP_BUILD_ROUND_CEIL = 2,
1699 LP_BUILD_ROUND_TRUNCATE = 3
1700 };
1701
1702 static inline LLVMValueRef
1703 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1704 LLVMValueRef a)
1705 {
1706 LLVMBuilderRef builder = bld->gallivm->builder;
1707 const struct lp_type type = bld->type;
1708 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1709 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1710 const char *intrinsic;
1711 LLVMValueRef res;
1712
1713 assert(type.floating);
1714 /* using the double precision conversions is a bit more complicated */
1715 assert(type.width == 32);
1716
1717 assert(lp_check_value(type, a));
1718 assert(util_cpu_caps.has_sse2);
1719
1720 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1721 if (type.length == 1) {
1722 LLVMTypeRef vec_type;
1723 LLVMValueRef undef;
1724 LLVMValueRef arg;
1725 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1726
1727 vec_type = LLVMVectorType(bld->elem_type, 4);
1728
1729 intrinsic = "llvm.x86.sse.cvtss2si";
1730
1731 undef = LLVMGetUndef(vec_type);
1732
1733 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1734
1735 res = lp_build_intrinsic_unary(builder, intrinsic,
1736 ret_type, arg);
1737 }
1738 else {
1739 if (type.width* type.length == 128) {
1740 intrinsic = "llvm.x86.sse2.cvtps2dq";
1741 }
1742 else {
1743 assert(type.width*type.length == 256);
1744 assert(util_cpu_caps.has_avx);
1745
1746 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1747 }
1748 res = lp_build_intrinsic_unary(builder, intrinsic,
1749 ret_type, a);
1750 }
1751
1752 return res;
1753 }
1754
1755
1756 /* Round a float vector using the AltiVec vrfin/vrfim/vrfip/vrfiz instructions.
1757 */
1758 static inline LLVMValueRef
1759 lp_build_round_altivec(struct lp_build_context *bld,
1760 LLVMValueRef a,
1761 enum lp_build_round_mode mode)
1762 {
1763 LLVMBuilderRef builder = bld->gallivm->builder;
1764 const struct lp_type type = bld->type;
1765 const char *intrinsic = NULL;
1766
1767 assert(type.floating);
1768
1769 assert(lp_check_value(type, a));
1770 assert(util_cpu_caps.has_altivec);
1771
1772 (void)type;
1773
1774 switch (mode) {
1775 case LP_BUILD_ROUND_NEAREST:
1776 intrinsic = "llvm.ppc.altivec.vrfin";
1777 break;
1778 case LP_BUILD_ROUND_FLOOR:
1779 intrinsic = "llvm.ppc.altivec.vrfim";
1780 break;
1781 case LP_BUILD_ROUND_CEIL:
1782 intrinsic = "llvm.ppc.altivec.vrfip";
1783 break;
1784 case LP_BUILD_ROUND_TRUNCATE:
1785 intrinsic = "llvm.ppc.altivec.vrfiz";
1786 break;
1787 }
1788
1789 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1790 }
1791
1792 static inline LLVMValueRef
1793 lp_build_round_arch(struct lp_build_context *bld,
1794 LLVMValueRef a,
1795 enum lp_build_round_mode mode)
1796 {
1797 if (util_cpu_caps.has_sse4_1) {
1798 LLVMBuilderRef builder = bld->gallivm->builder;
1799 const struct lp_type type = bld->type;
1800 const char *intrinsic_root;
1801 char intrinsic[32];
1802
1803 assert(type.floating);
1804 assert(lp_check_value(type, a));
1805 (void)type;
1806
1807 switch (mode) {
1808 case LP_BUILD_ROUND_NEAREST:
1809 intrinsic_root = "llvm.nearbyint";
1810 break;
1811 case LP_BUILD_ROUND_FLOOR:
1812 intrinsic_root = "llvm.floor";
1813 break;
1814 case LP_BUILD_ROUND_CEIL:
1815 intrinsic_root = "llvm.ceil";
1816 break;
1817 case LP_BUILD_ROUND_TRUNCATE:
1818 intrinsic_root = "llvm.trunc";
1819 break;
1820 }
1821
1822 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1823 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1824 }
1825 else /* (util_cpu_caps.has_altivec) */
1826 return lp_build_round_altivec(bld, a, mode);
1827 }
1828
1829 /**
1830 * Return the integer part of a float (vector) value (== round toward zero).
1831 * The returned value is a float (vector).
1832 * Ex: trunc(-1.5) = -1.0
1833 */
1834 LLVMValueRef
1835 lp_build_trunc(struct lp_build_context *bld,
1836 LLVMValueRef a)
1837 {
1838 LLVMBuilderRef builder = bld->gallivm->builder;
1839 const struct lp_type type = bld->type;
1840
1841 assert(type.floating);
1842 assert(lp_check_value(type, a));
1843
1844 if (arch_rounding_available(type)) {
1845 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1846 }
1847 else {
1848 const struct lp_type type = bld->type;
1849 struct lp_type inttype;
1850 struct lp_build_context intbld;
1851 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1852 LLVMValueRef trunc, res, anosign, mask;
1853 LLVMTypeRef int_vec_type = bld->int_vec_type;
1854 LLVMTypeRef vec_type = bld->vec_type;
1855
1856 assert(type.width == 32); /* might want to handle doubles at some point */
1857
1858 inttype = type;
1859 inttype.floating = 0;
1860 lp_build_context_init(&intbld, bld->gallivm, inttype);
1861
1862 /* round by truncation */
1863 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1864 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1865
1866 /* mask out sign bit */
1867 anosign = lp_build_abs(bld, a);
1868 /*
1869 * mask out all values if anosign > 2^24
1870 * This should work both for large ints (all rounding is no-op for them
1871 * because such floats are always exact) as well as special cases like
1872 * NaNs, Infs (taking advantage of the fact they use max exponent).
1873 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1874 */
1875 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1876 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1877 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1878 return lp_build_select(bld, mask, a, res);
1879 }
1880 }
1881
1882
1883 /**
1884 * Return float (vector) rounded to nearest integer (vector). The returned
1885 * value is a float (vector).
1886 * Ex: round(0.9) = 1.0
1887 * Ex: round(-1.5) = -2.0
1888 */
1889 LLVMValueRef
1890 lp_build_round(struct lp_build_context *bld,
1891 LLVMValueRef a)
1892 {
1893 LLVMBuilderRef builder = bld->gallivm->builder;
1894 const struct lp_type type = bld->type;
1895
1896 assert(type.floating);
1897 assert(lp_check_value(type, a));
1898
1899 if (arch_rounding_available(type)) {
1900 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1901 }
1902 else {
1903 const struct lp_type type = bld->type;
1904 struct lp_type inttype;
1905 struct lp_build_context intbld;
1906 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1907 LLVMValueRef res, anosign, mask;
1908 LLVMTypeRef int_vec_type = bld->int_vec_type;
1909 LLVMTypeRef vec_type = bld->vec_type;
1910
1911 assert(type.width == 32); /* might want to handle doubles at some point */
1912
1913 inttype = type;
1914 inttype.floating = 0;
1915 lp_build_context_init(&intbld, bld->gallivm, inttype);
1916
1917 res = lp_build_iround(bld, a);
1918 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1919
1920 /* mask out sign bit */
1921 anosign = lp_build_abs(bld, a);
1922 /*
1923 * mask out all values if anosign > 2^24
1924 * This should work both for large ints (all rounding is no-op for them
1925 * because such floats are always exact) as well as special cases like
1926 * NaNs, Infs (taking advantage of the fact they use max exponent).
1927 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1928 */
1929 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1930 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1931 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1932 return lp_build_select(bld, mask, a, res);
1933 }
1934 }
1935
1936
1937 /**
1938 * Return floor of float (vector), result is a float (vector)
1939 * Ex: floor(1.1) = 1.0
1940 * Ex: floor(-1.1) = -2.0
1941 */
1942 LLVMValueRef
1943 lp_build_floor(struct lp_build_context *bld,
1944 LLVMValueRef a)
1945 {
1946 LLVMBuilderRef builder = bld->gallivm->builder;
1947 const struct lp_type type = bld->type;
1948
1949 assert(type.floating);
1950 assert(lp_check_value(type, a));
1951
1952 if (arch_rounding_available(type)) {
1953 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1954 }
1955 else {
1956 const struct lp_type type = bld->type;
1957 struct lp_type inttype;
1958 struct lp_build_context intbld;
1959 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1960 LLVMValueRef trunc, res, anosign, mask;
1961 LLVMTypeRef int_vec_type = bld->int_vec_type;
1962 LLVMTypeRef vec_type = bld->vec_type;
1963
1964 if (type.width != 32) {
1965 char intrinsic[32];
1966 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
1967 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1968 }
1969
1970 assert(type.width == 32); /* might want to handle doubles at some point */
1971
1972 inttype = type;
1973 inttype.floating = 0;
1974 lp_build_context_init(&intbld, bld->gallivm, inttype);
1975
1976 /* round by truncation */
1977 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1978 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1979
1980 if (type.sign) {
1981 LLVMValueRef tmp;
1982
1983 /*
1984 * fix values if rounding is wrong (for non-special cases)
1985 * - this is the case if trunc > a
1986 */
1987 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1988 /* tmp = trunc > a ? 1.0 : 0.0 */
1989 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1990 tmp = lp_build_and(&intbld, mask, tmp);
1991 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1992 res = lp_build_sub(bld, res, tmp);
1993 }
1994
1995 /* mask out sign bit */
1996 anosign = lp_build_abs(bld, a);
1997 /*
1998 * mask out all values if anosign > 2^24
1999 * This should work both for large ints (all rounding is no-op for them
2000 * because such floats are always exact) as well as special cases like
2001 * NaNs, Infs (taking advantage of the fact they use max exponent).
2002 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2003 */
2004 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2005 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2006 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2007 return lp_build_select(bld, mask, a, res);
2008 }
2009 }
2010
2011
2012 /**
2013 * Return ceiling of float (vector), returning float (vector).
2014 * Ex: ceil( 1.1) = 2.0
2015 * Ex: ceil(-1.1) = -1.0
2016 */
2017 LLVMValueRef
2018 lp_build_ceil(struct lp_build_context *bld,
2019 LLVMValueRef a)
2020 {
2021 LLVMBuilderRef builder = bld->gallivm->builder;
2022 const struct lp_type type = bld->type;
2023
2024 assert(type.floating);
2025 assert(lp_check_value(type, a));
2026
2027 if (arch_rounding_available(type)) {
2028 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2029 }
2030 else {
2031 const struct lp_type type = bld->type;
2032 struct lp_type inttype;
2033 struct lp_build_context intbld;
2034 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2035 LLVMValueRef trunc, res, anosign, mask, tmp;
2036 LLVMTypeRef int_vec_type = bld->int_vec_type;
2037 LLVMTypeRef vec_type = bld->vec_type;
2038
2039 if (type.width != 32) {
2040 char intrinsic[32];
2041 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2042 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2043 }
2044
2045 assert(type.width == 32); /* might want to handle doubles at some point */
2046
2047 inttype = type;
2048 inttype.floating = 0;
2049 lp_build_context_init(&intbld, bld->gallivm, inttype);
2050
2051 /* round by truncation */
2052 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2053 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2054
2055 /*
2056 * fix values if rounding is wrong (for non-special cases)
2057 * - this is the case if trunc < a
2058 */
2059 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2060 /* tmp = trunc < a ? 1.0 : 0.0 */
2061 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2062 tmp = lp_build_and(&intbld, mask, tmp);
2063 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2064 res = lp_build_add(bld, trunc, tmp);
2065
2066 /* mask out sign bit */
2067 anosign = lp_build_abs(bld, a);
2068 /*
2069 * mask out all values if anosign > 2^24
2070 * This should work both for large ints (all rounding is no-op for them
2071 * because such floats are always exact) as well as special cases like
2072 * NaNs, Infs (taking advantage of the fact they use max exponent).
2073 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2074 */
2075 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2076 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2077 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2078 return lp_build_select(bld, mask, a, res);
2079 }
2080 }
2081
2082
2083 /**
2084 * Return fractional part of 'a' computed as a - floor(a)
2085 * Typically used in texture coord arithmetic.
2086 */
2087 LLVMValueRef
2088 lp_build_fract(struct lp_build_context *bld,
2089 LLVMValueRef a)
2090 {
2091 assert(bld->type.floating);
2092 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2093 }
2094
2095
2096 /**
2097 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2098 * against 0.99999(9). (Will also return that value for NaNs.)
2099 */
2100 static inline LLVMValueRef
2101 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2102 {
2103 LLVMValueRef max;
2104
2105 /* this is the largest number smaller than 1.0 representable as float */
2106 max = lp_build_const_vec(bld->gallivm, bld->type,
2107 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2108 return lp_build_min_ext(bld, fract, max,
2109 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2110 }
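
/*
 * Editor's note: for the common 32-bit float case (23-bit mantissa) the
 * constant above works out to
 *
 *    1.0 - 1.0/(1 << 24) = 1.0 - 2^-24 = 0.99999994...
 *
 * i.e. the largest float strictly below 1.0 (nextafterf(1.0f, 0.0f)).
 */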
2111
2112
2113 /**
2114 * Same as lp_build_fract, but guarantees that the result is always smaller
2115 * than one. Will also return the smaller-than-one value for infs, NaNs.
2116 */
2117 LLVMValueRef
2118 lp_build_fract_safe(struct lp_build_context *bld,
2119 LLVMValueRef a)
2120 {
2121 return clamp_fract(bld, lp_build_fract(bld, a));
2122 }
2123
2124
2125 /**
2126 * Return the integer part of a float (vector) value (== round toward zero).
2127 * The returned value is an integer (vector).
2128 * Ex: itrunc(-1.5) = -1
2129 */
2130 LLVMValueRef
2131 lp_build_itrunc(struct lp_build_context *bld,
2132 LLVMValueRef a)
2133 {
2134 LLVMBuilderRef builder = bld->gallivm->builder;
2135 const struct lp_type type = bld->type;
2136 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2137
2138 assert(type.floating);
2139 assert(lp_check_value(type, a));
2140
2141 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2142 }
2143
2144
2145 /**
2146 * Return float (vector) rounded to nearest integer (vector). The returned
2147 * value is an integer (vector).
2148 * Ex: iround(0.9) = 1
2149 * Ex: iround(-1.5) = -2
2150 */
2151 LLVMValueRef
2152 lp_build_iround(struct lp_build_context *bld,
2153 LLVMValueRef a)
2154 {
2155 LLVMBuilderRef builder = bld->gallivm->builder;
2156 const struct lp_type type = bld->type;
2157 LLVMTypeRef int_vec_type = bld->int_vec_type;
2158 LLVMValueRef res;
2159
2160 assert(type.floating);
2161
2162 assert(lp_check_value(type, a));
2163
2164 if ((util_cpu_caps.has_sse2 &&
2165 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2166 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2167 return lp_build_iround_nearest_sse2(bld, a);
2168 }
2169 if (arch_rounding_available(type)) {
2170 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2171 }
2172 else {
2173 LLVMValueRef half;
2174
2175 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2176
2177 if (type.sign) {
2178 LLVMTypeRef vec_type = bld->vec_type;
2179 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2180 (unsigned long long)1 << (type.width - 1));
2181 LLVMValueRef sign;
2182
2183 /* get sign bit */
2184 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2185 sign = LLVMBuildAnd(builder, sign, mask, "");
2186
2187 /* sign * 0.5 */
2188 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2189 half = LLVMBuildOr(builder, sign, half, "");
2190 half = LLVMBuildBitCast(builder, half, vec_type, "");
2191 }
2192
2193 res = LLVMBuildFAdd(builder, a, half, "");
2194 }
2195
2196 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2197
2198 return res;
2199 }
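
/*
 * Editor's note: scalar sketch of the generic (non-SSE2, no arch rounding)
 * path above, illustrative only:
 *
 *    int iround_fallback(float a)
 *    {
 *       // bias by 0.5 carrying the sign of 'a' (the code ORs the sign
 *       // bit into 0.5 instead of branching), then truncate
 *       float half = a < 0.0f ? -0.5f : 0.5f;
 *       return (int)(a + half);
 *    }
 *
 * Note this rounds halfway cases away from zero, whereas the SSE2 and
 * arch-rounding paths use the default round-to-nearest-even mode.
 */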
2200
2201
2202 /**
2203 * Return floor of float (vector), result is an int (vector)
2204 * Ex: ifloor(1.1) = 1
2205 * Ex: ifloor(-1.1) = -2
2206 */
2207 LLVMValueRef
2208 lp_build_ifloor(struct lp_build_context *bld,
2209 LLVMValueRef a)
2210 {
2211 LLVMBuilderRef builder = bld->gallivm->builder;
2212 const struct lp_type type = bld->type;
2213 LLVMTypeRef int_vec_type = bld->int_vec_type;
2214 LLVMValueRef res;
2215
2216 assert(type.floating);
2217 assert(lp_check_value(type, a));
2218
2219 res = a;
2220 if (type.sign) {
2221 if (arch_rounding_available(type)) {
2222 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2223 }
2224 else {
2225 struct lp_type inttype;
2226 struct lp_build_context intbld;
2227 LLVMValueRef trunc, itrunc, mask;
2228
2229 assert(type.floating);
2230 assert(lp_check_value(type, a));
2231
2232 inttype = type;
2233 inttype.floating = 0;
2234 lp_build_context_init(&intbld, bld->gallivm, inttype);
2235
2236 /* round by truncation */
2237 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2238 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2239
2240 /*
2241 * fix values if rounding is wrong (for non-special cases)
2242 * - this is the case if trunc > a
2243 * The results of doing this with NaNs, very large values etc.
2244 * are undefined but this seems to be the case anyway.
2245 */
2246 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2247 /* cheapie minus one with mask since the mask is minus one / zero */
2248 return lp_build_add(&intbld, itrunc, mask);
2249 }
2250 }
2251
2252 /* convert to int (toward zero; exact since res is already floored or non-negative) */
2253 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2254
2255 return res;
2256 }
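
/*
 * Editor's note: scalar sketch of the signed fallback above. The compare
 * mask is already ~0 (== -1) or 0, so adding it applies the floor
 * correction without a select (illustrative only):
 *
 *    int ifloor_fallback(float a)
 *    {
 *       int itrunc = (int)a;                 // round toward zero
 *       int fixup  = (float)itrunc > a;      // 1 only for negative non-integers
 *       return itrunc - fixup;               // same as itrunc + (-1 or 0)
 *    }
 */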
2257
2258
2259 /**
2260 * Return ceiling of float (vector), returning int (vector).
2261 * Ex: iceil( 1.1) = 2
2262 * Ex: iceil(-1.1) = -1
2263 */
2264 LLVMValueRef
2265 lp_build_iceil(struct lp_build_context *bld,
2266 LLVMValueRef a)
2267 {
2268 LLVMBuilderRef builder = bld->gallivm->builder;
2269 const struct lp_type type = bld->type;
2270 LLVMTypeRef int_vec_type = bld->int_vec_type;
2271 LLVMValueRef res;
2272
2273 assert(type.floating);
2274 assert(lp_check_value(type, a));
2275
2276 if (arch_rounding_available(type)) {
2277 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2278 }
2279 else {
2280 struct lp_type inttype;
2281 struct lp_build_context intbld;
2282 LLVMValueRef trunc, itrunc, mask;
2283
2284 assert(type.floating);
2285 assert(lp_check_value(type, a));
2286
2287 inttype = type;
2288 inttype.floating = 0;
2289 lp_build_context_init(&intbld, bld->gallivm, inttype);
2290
2291 /* round by truncation */
2292 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2293 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2294
2295 /*
2296 * fix values if rounding is wrong (for non-special cases)
2297 * - this is the case if trunc < a
2298 * The results of doing this with NaNs, very large values etc.
2299 * are undefined but this seems to be the case anyway.
2300 */
2301 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2302 /* cheapie plus one with mask since the mask is minus one / zero */
2303 return lp_build_sub(&intbld, itrunc, mask);
2304 }
2305
2306 /* convert to int (toward zero; exact since res already holds the ceiled value) */
2307 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2308
2309 return res;
2310 }
2311
2312
2313 /**
2314 * Combined ifloor() & fract().
2315 *
2316 * Preferred to calling the functions separately, as it will ensure that the
2317 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2318 */
2319 void
2320 lp_build_ifloor_fract(struct lp_build_context *bld,
2321 LLVMValueRef a,
2322 LLVMValueRef *out_ipart,
2323 LLVMValueRef *out_fpart)
2324 {
2325 LLVMBuilderRef builder = bld->gallivm->builder;
2326 const struct lp_type type = bld->type;
2327 LLVMValueRef ipart;
2328
2329 assert(type.floating);
2330 assert(lp_check_value(type, a));
2331
2332 if (arch_rounding_available(type)) {
2333 /*
2334 * floor() is easier.
2335 */
2336
2337 ipart = lp_build_floor(bld, a);
2338 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2339 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2340 }
2341 else {
2342 /*
2343 * ifloor() is easier.
2344 */
2345
2346 *out_ipart = lp_build_ifloor(bld, a);
2347 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2348 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2349 }
2350 }
2351
2352
2353 /**
2354 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2355 * always smaller than one.
2356 */
2357 void
2358 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2359 LLVMValueRef a,
2360 LLVMValueRef *out_ipart,
2361 LLVMValueRef *out_fpart)
2362 {
2363 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2364 *out_fpart = clamp_fract(bld, *out_fpart);
2365 }
2366
2367
2368 LLVMValueRef
2369 lp_build_sqrt(struct lp_build_context *bld,
2370 LLVMValueRef a)
2371 {
2372 LLVMBuilderRef builder = bld->gallivm->builder;
2373 const struct lp_type type = bld->type;
2374 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2375 char intrinsic[32];
2376
2377 assert(lp_check_value(type, a));
2378
2379 assert(type.floating);
2380 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2381
2382 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2383 }
2384
2385
2386 /**
2387 * Do one Newton-Raphson step to improve reciprocal precision:
2388 *
2389 * x_{i+1} = x_i * (2 - a * x_i)
2390 *
2391 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2392 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2393 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2394 * halo. It would be necessary to clamp the argument to prevent this.
2395 *
2396 * See also:
2397 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2398 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2399 */
2400 static inline LLVMValueRef
2401 lp_build_rcp_refine(struct lp_build_context *bld,
2402 LLVMValueRef a,
2403 LLVMValueRef rcp_a)
2404 {
2405 LLVMBuilderRef builder = bld->gallivm->builder;
2406 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2407 LLVMValueRef res;
2408
2409 res = LLVMBuildFMul(builder, a, rcp_a, "");
2410 res = LLVMBuildFSub(builder, two, res, "");
2411 res = LLVMBuildFMul(builder, rcp_a, res, "");
2412
2413 return res;
2414 }
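
/*
 * Editor's note: the step above is Newton-Raphson applied to
 * f(x) = 1/x - a, whose root is 1/a:
 *
 *    x_{i+1} = x_i - f(x_i)/f'(x_i) = x_i * (2 - a * x_i)
 *
 * Each step roughly doubles the number of correct bits, so a single step
 * on the low-precision RCPPS estimate would already approach full single
 * precision. Illustrative scalar form:
 *
 *    float rcp_refine_scalar(float a, float rcp_a)
 *    {
 *       return rcp_a * (2.0f - a * rcp_a);
 *    }
 */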
2415
2416
2417 LLVMValueRef
2418 lp_build_rcp(struct lp_build_context *bld,
2419 LLVMValueRef a)
2420 {
2421 LLVMBuilderRef builder = bld->gallivm->builder;
2422 const struct lp_type type = bld->type;
2423
2424 assert(lp_check_value(type, a));
2425
2426 if(a == bld->zero)
2427 return bld->undef;
2428 if(a == bld->one)
2429 return bld->one;
2430 if(a == bld->undef)
2431 return bld->undef;
2432
2433 assert(type.floating);
2434
2435 if(LLVMIsConstant(a))
2436 return LLVMConstFDiv(bld->one, a);
2437
2438 /*
2439 * We don't use RCPPS because:
2440 * We don't use RCPPS because:
2440 * - it only has 10 bits of precision
2441 * - it doesn't even get the reciprocal of 1.0 exactly
2442 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2443 * - for recent processors the benefit over DIVPS is marginal, and case
2444 * dependent
2445 *
2446 * We could still use it on certain processors if benchmarks show that the
2447 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2448 * particular uses that require fewer workarounds.
2449 */
2450
2451 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2452 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2453 const unsigned num_iterations = 0;
2454 LLVMValueRef res;
2455 unsigned i;
2456 const char *intrinsic = NULL;
2457
2458 if (type.length == 4) {
2459 intrinsic = "llvm.x86.sse.rcp.ps";
2460 }
2461 else {
2462 intrinsic = "llvm.x86.avx.rcp.ps.256";
2463 }
2464
2465 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2466
2467 for (i = 0; i < num_iterations; ++i) {
2468 res = lp_build_rcp_refine(bld, a, res);
2469 }
2470
2471 return res;
2472 }
2473
2474 return LLVMBuildFDiv(builder, bld->one, a, "");
2475 }
2476
2477
2478 /**
2479 * Do one Newton-Raphson step to improve rsqrt precision:
2480 *
2481 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2482 *
2483 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2484 */
2485 static inline LLVMValueRef
2486 lp_build_rsqrt_refine(struct lp_build_context *bld,
2487 LLVMValueRef a,
2488 LLVMValueRef rsqrt_a)
2489 {
2490 LLVMBuilderRef builder = bld->gallivm->builder;
2491 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2492 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2493 LLVMValueRef res;
2494
2495 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2496 res = LLVMBuildFMul(builder, a, res, "");
2497 res = LLVMBuildFSub(builder, three, res, "");
2498 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2499 res = LLVMBuildFMul(builder, half, res, "");
2500
2501 return res;
2502 }
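
/*
 * Editor's note: this is Newton-Raphson applied to f(x) = 1/x^2 - a,
 * whose positive root is 1/sqrt(a):
 *
 *    x_{i+1} = x_i - f(x_i)/f'(x_i) = 0.5 * x_i * (3 - a * x_i * x_i)
 *
 * Illustrative scalar form:
 *
 *    float rsqrt_refine_scalar(float a, float r)
 *    {
 *       return 0.5f * r * (3.0f - a * r * r);
 *    }
 */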
2503
2504
2505 /**
2506 * Generate 1/sqrt(a).
2507 * Result is undefined for values < 0, infinity for +0.
2508 */
2509 LLVMValueRef
2510 lp_build_rsqrt(struct lp_build_context *bld,
2511 LLVMValueRef a)
2512 {
2513 const struct lp_type type = bld->type;
2514
2515 assert(lp_check_value(type, a));
2516
2517 assert(type.floating);
2518
2519 /*
2520 * This should be faster but all denormals will end up as infinity.
2521 */
2522 if (0 && lp_build_fast_rsqrt_available(type)) {
2523 const unsigned num_iterations = 1;
2524 LLVMValueRef res;
2525 unsigned i;
2526
2527 /* rsqrt(1.0) != 1.0 here */
2528 res = lp_build_fast_rsqrt(bld, a);
2529
2530 if (num_iterations) {
2531 /*
2532 * Newton-Raphson will result in NaN instead of infinity for zero,
2533 * and NaN instead of zero for infinity.
2534 * Also, need to ensure rsqrt(1.0) == 1.0.
2535 * All numbers smaller than FLT_MIN will result in +infinity
2536 * (rsqrtps treats all denormals as zero).
2537 */
2538 LLVMValueRef cmp;
2539 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2540 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2541
2542 for (i = 0; i < num_iterations; ++i) {
2543 res = lp_build_rsqrt_refine(bld, a, res);
2544 }
2545 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2546 res = lp_build_select(bld, cmp, inf, res);
2547 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2548 res = lp_build_select(bld, cmp, bld->zero, res);
2549 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2550 res = lp_build_select(bld, cmp, bld->one, res);
2551 }
2552
2553 return res;
2554 }
2555
2556 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2557 }
2558
2559 /**
2560 * Return whether a fast (but inaccurate) rsqrt instruction is available.
2561 * (A caller may want to avoid rsqrt_fast when it is not available:
2562 * e.g. x^0.5 can be computed as rsqrt_fast(x) * x, but if rsqrt has to
2563 * be emulated that becomes sqrt/div/mul, in which case it is obviously
2564 * better to just call sqrt directly, skipping both the div and the mul.)
2565 */
2566 boolean
2567 lp_build_fast_rsqrt_available(struct lp_type type)
2568 {
2569 assert(type.floating);
2570
2571 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2572 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2573 return true;
2574 }
2575 return false;
2576 }
2577
2578
2579 /**
2580 * Generate 1/sqrt(a).
2581 * Result is undefined for values < 0, infinity for +0.
2582 * Precision is limited, only ~10 bits guaranteed
2583 * (rsqrt(1.0) may not be 1.0, denorms may be flushed to 0).
2584 */
2585 LLVMValueRef
2586 lp_build_fast_rsqrt(struct lp_build_context *bld,
2587 LLVMValueRef a)
2588 {
2589 LLVMBuilderRef builder = bld->gallivm->builder;
2590 const struct lp_type type = bld->type;
2591
2592 assert(lp_check_value(type, a));
2593
2594 if (lp_build_fast_rsqrt_available(type)) {
2595 const char *intrinsic = NULL;
2596
2597 if (type.length == 4) {
2598 intrinsic = "llvm.x86.sse.rsqrt.ps";
2599 }
2600 else {
2601 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2602 }
2603 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2604 }
2605 else {
2606 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2607 }
2608 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2609 }
2610
2611
2612 /**
2613 * Generate sin(a) or cos(a) using polynomial approximation.
2614 * TODO: it might be worth recognizing sin and cos with the same source
2615 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2616 * would be way cheaper than calculating (nearly) everything twice...
2617 * Not sure it's common enough to be worth bothering with, however; the
2618 * scs opcode could also benefit from calculating both.
2619 */
2620 static LLVMValueRef
2621 lp_build_sin_or_cos(struct lp_build_context *bld,
2622 LLVMValueRef a,
2623 boolean cos)
2624 {
2625 struct gallivm_state *gallivm = bld->gallivm;
2626 LLVMBuilderRef b = gallivm->builder;
2627 struct lp_type int_type = lp_int_type(bld->type);
2628
2629 /*
2630 * take the absolute value,
2631 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2632 */
2633
2634 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2635 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2636
2637 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2638 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2639
2640 /*
2641 * scale by 4/Pi
2642 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2643 */
2644
2645 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2646 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2647
2648 /*
2649 * store the integer part of y in mm0
2650 * emm2 = _mm_cvttps_epi32(y);
2651 */
2652
2653 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2654
2655 /*
2656 * j=(j+1) & (~1) (see the cephes sources)
2657 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2658 */
2659
2660 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2661 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2662 /*
2663 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2664 */
2665 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2666 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2667
2668 /*
2669 * y = _mm_cvtepi32_ps(emm2);
2670 */
2671 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2672
2673 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2674 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2675 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2676 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2677
2678 /*
2679 * Argument used for poly selection and sign bit determination
2680 * is different for sin vs. cos.
2681 */
2682 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2683 emm2_and;
2684
2685 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2686 LLVMBuildNot(b, emm2_2, ""), ""),
2687 const_29, "sign_bit") :
2688 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2689 LLVMBuildShl(b, emm2_add,
2690 const_29, ""), ""),
2691 sign_mask, "sign_bit");
2692
2693 /*
2694 * get the polynomial selection mask
2695 * there is one polynomial for 0 <= x <= Pi/4
2696 * and another one for Pi/4 < x <= Pi/2
2697 * Both branches will be computed.
2698 *
2699 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2700 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2701 */
2702
2703 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2704 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2705 int_type, PIPE_FUNC_EQUAL,
2706 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2707
2708 /*
2709 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2710 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2711 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2712 */
2713 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2714 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2715 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2716
2717 /*
2718 * The magic pass: "Extended precision modular arithmetic"
2719 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2720 * xmm1 = _mm_mul_ps(y, xmm1);
2721 * xmm2 = _mm_mul_ps(y, xmm2);
2722 * xmm3 = _mm_mul_ps(y, xmm3);
2723 */
2724 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2725 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2726 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2727
2728 /*
2729 * x = _mm_add_ps(x, xmm1);
2730 * x = _mm_add_ps(x, xmm2);
2731 * x = _mm_add_ps(x, xmm3);
2732 */
2733
2734 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2735 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2736 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2737
2738 /*
2739 * Evaluate the first polynomial (0 <= x <= Pi/4)
2740 *
2741 * z = _mm_mul_ps(x,x);
2742 */
2743 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2744
2745 /*
2746 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2747 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2748 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2749 */
2750 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2751 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2752 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2753
2754 /*
2755 * y = *(v4sf*)_ps_coscof_p0;
2756 * y = _mm_mul_ps(y, z);
2757 */
2758 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2759 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2760 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2761 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2762 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2763 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2764
2765
2766 /*
2767 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2768 * y = _mm_sub_ps(y, tmp);
2769 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2770 */
2771 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2772 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2773 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2774 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2775 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2776
2777 /*
2778 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2779 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2780 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2781 */
2782 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2783 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2784 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2785
2786 /*
2787 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2788 *
2789 * y2 = *(v4sf*)_ps_sincof_p0;
2790 * y2 = _mm_mul_ps(y2, z);
2791 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2792 * y2 = _mm_mul_ps(y2, z);
2793 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2794 * y2 = _mm_mul_ps(y2, z);
2795 * y2 = _mm_mul_ps(y2, x);
2796 * y2 = _mm_add_ps(y2, x);
2797 */
2798
2799 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2800 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2801 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2802 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2803 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2804 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2805 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2806
2807 /*
2808 * select the correct result from the two polynomials
2809 * xmm3 = poly_mask;
2810 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2811 * y = _mm_andnot_ps(xmm3, y);
2812 * y = _mm_or_ps(y,y2);
2813 */
2814 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2815 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2816 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2817 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2818 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2819 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2820
2821 /*
2822 * update the sign
2823 * y = _mm_xor_ps(y, sign_bit);
2824 */
2825 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2826 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2827
2828 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2829
2830 /* clamp output to be within [-1, 1] */
2831 y_result = lp_build_clamp(bld, y_result,
2832 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2833 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2834 /* If a is -inf, inf or NaN then return NaN */
2835 y_result = lp_build_select(bld, isfinite, y_result,
2836 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2837 return y_result;
2838 }
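
/*
 * Editor's note: rough scalar outline of the cephes/sse_mathfun scheme
 * used above (illustrative summary, not exact code):
 *
 *    1. x = |a|; j = (int)(x * 4/Pi); j = (j + 1) & ~1;   // nearest even octant
 *    2. x = ((x - j*DP1) - j*DP2) - j*DP3;                // x - j*Pi/4, extended precision
 *    3. pick either the sin or the cos minimax polynomial in z = x*x
 *       according to the octant (poly_mask above);
 *    4. flip the result's sign bit based on the octant and, for sin, the
 *       sign of the original argument.
 *
 * The final clamp and isfinite select are additions on top of that scheme
 * so that |result| <= 1 and non-finite inputs yield NaN.
 */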
2839
2840
2841 /**
2842 * Generate sin(a)
2843 */
2844 LLVMValueRef
2845 lp_build_sin(struct lp_build_context *bld,
2846 LLVMValueRef a)
2847 {
2848 return lp_build_sin_or_cos(bld, a, FALSE);
2849 }
2850
2851
2852 /**
2853 * Generate cos(a)
2854 */
2855 LLVMValueRef
2856 lp_build_cos(struct lp_build_context *bld,
2857 LLVMValueRef a)
2858 {
2859 return lp_build_sin_or_cos(bld, a, TRUE);
2860 }
2861
2862
2863 /**
2864 * Generate pow(x, y)
2865 */
2866 LLVMValueRef
2867 lp_build_pow(struct lp_build_context *bld,
2868 LLVMValueRef x,
2869 LLVMValueRef y)
2870 {
2871 /* TODO: optimize the constant case */
2872 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2873 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2874 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2875 __FUNCTION__);
2876 }
2877
2878 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2879 }
2880
2881
2882 /**
2883 * Generate exp(x)
2884 */
2885 LLVMValueRef
2886 lp_build_exp(struct lp_build_context *bld,
2887 LLVMValueRef x)
2888 {
2889 /* log2(e) = 1/log(2) */
2890 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2891 1.4426950408889634);
2892
2893 assert(lp_check_value(bld->type, x));
2894
2895 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2896 }
2897
2898
2899 /**
2900 * Generate log(x)
2901 * Behavior is undefined with infs, 0s and nans
2902 */
2903 LLVMValueRef
2904 lp_build_log(struct lp_build_context *bld,
2905 LLVMValueRef x)
2906 {
2907 /* log(2) */
2908 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2909 0.69314718055994529);
2910
2911 assert(lp_check_value(bld->type, x));
2912
2913 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2914 }
2915
2916 /**
2917 * Generate log(x) that handles edge cases (infs, 0s and nans)
2918 */
2919 LLVMValueRef
2920 lp_build_log_safe(struct lp_build_context *bld,
2921 LLVMValueRef x)
2922 {
2923 /* log(2) */
2924 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2925 0.69314718055994529);
2926
2927 assert(lp_check_value(bld->type, x));
2928
2929 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2930 }
2931
2932
2933 /**
2934 * Generate polynomial.
2935 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2936 */
2937 LLVMValueRef
2938 lp_build_polynomial(struct lp_build_context *bld,
2939 LLVMValueRef x,
2940 const double *coeffs,
2941 unsigned num_coeffs)
2942 {
2943 const struct lp_type type = bld->type;
2944 LLVMValueRef even = NULL, odd = NULL;
2945 LLVMValueRef x2;
2946 unsigned i;
2947
2948 assert(lp_check_value(bld->type, x));
2949
2950 /* TODO: optimize the constant case */
2951 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2952 LLVMIsConstant(x)) {
2953 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2954 __FUNCTION__);
2955 }
2956
2957 /*
2958 * Calculate odd and even terms separately to decrease data dependency
2959 * Ex:
2960 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2961 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2962 */
2963 x2 = lp_build_mul(bld, x, x);
2964
2965 for (i = num_coeffs; i--; ) {
2966 LLVMValueRef coeff;
2967
2968 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2969
2970 if (i % 2 == 0) {
2971 if (even)
2972 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2973 else
2974 even = coeff;
2975 } else {
2976 if (odd)
2977 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2978 else
2979 odd = coeff;
2980 }
2981 }
2982
2983 if (odd)
2984 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2985 else if (even)
2986 return even;
2987 else
2988 return bld->undef;
2989 }
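
/*
 * Editor's note: illustrative scalar version of the split (even/odd)
 * Horner scheme above, for a 4-coefficient polynomial:
 *
 *    float poly4(float x, const float c[4])
 *    {
 *       float x2   = x * x;
 *       float even = c[0] + x2 * c[2];   // even-degree terms
 *       float odd  = c[1] + x2 * c[3];   // odd-degree terms
 *       return even + x * odd;
 *    }
 *
 * The two chains have no dependency on each other, roughly halving the
 * dependent multiply-add depth compared to a single Horner chain.
 */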
2990
2991
2992 /**
2993 * Minimax polynomial fit of 2**x, in range [0, 1[
2994 */
2995 const double lp_build_exp2_polynomial[] = {
2996 #if EXP_POLY_DEGREE == 5
2997 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
2998 0.693153073200168932794,
2999 0.240153617044375388211,
3000 0.0558263180532956664775,
3001 0.00898934009049466391101,
3002 0.00187757667519147912699
3003 #elif EXP_POLY_DEGREE == 4
3004 1.00000259337069434683,
3005 0.693003834469974940458,
3006 0.24144275689150793076,
3007 0.0520114606103070150235,
3008 0.0135341679161270268764
3009 #elif EXP_POLY_DEGREE == 3
3010 0.999925218562710312959,
3011 0.695833540494823811697,
3012 0.226067155427249155588,
3013 0.0780245226406372992967
3014 #elif EXP_POLY_DEGREE == 2
3015 1.00172476321474503578,
3016 0.657636275736077639316,
3017 0.33718943461968720704
3018 #else
3019 #error
3020 #endif
3021 };
3022
3023
3024 LLVMValueRef
3025 lp_build_exp2(struct lp_build_context *bld,
3026 LLVMValueRef x)
3027 {
3028 LLVMBuilderRef builder = bld->gallivm->builder;
3029 const struct lp_type type = bld->type;
3030 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3031 LLVMValueRef ipart = NULL;
3032 LLVMValueRef fpart = NULL;
3033 LLVMValueRef expipart = NULL;
3034 LLVMValueRef expfpart = NULL;
3035 LLVMValueRef res = NULL;
3036
3037 assert(lp_check_value(bld->type, x));
3038
3039 /* TODO: optimize the constant case */
3040 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3041 LLVMIsConstant(x)) {
3042 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3043 __FUNCTION__);
3044 }
3045
3046 assert(type.floating && type.width == 32);
3047
3048 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3049 * the result is INF and if it's smaller than -126.9 the result is 0 */
3050 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3051 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3052 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3053 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3054
3055 /* ipart = floor(x) */
3056 /* fpart = x - ipart */
3057 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3058
3059 /* expipart = (float) (1 << ipart) */
3060 expipart = LLVMBuildAdd(builder, ipart,
3061 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3062 expipart = LLVMBuildShl(builder, expipart,
3063 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3064 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3065
3066 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3067 ARRAY_SIZE(lp_build_exp2_polynomial));
3068
3069 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3070
3071 return res;
3072 }
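
/*
 * Editor's note: scalar sketch of the decomposition above, assuming
 * 32-bit floats and x already clamped to [-126.99999, 128]; 'poly' is a
 * hypothetical helper evaluating lp_build_exp2_polynomial (illustrative
 * only):
 *
 *    float exp2_sketch(float x)
 *    {
 *       int   ipart = (int)floorf(x);
 *       float fpart = x - (float)ipart;               // in [0, 1)
 *       union { uint32_t u; float f; } pun;
 *       pun.u = (uint32_t)(ipart + 127) << 23;        // 2^ipart, built directly
 *       float expipart = pun.f;                       //   in the exponent field
 *       float expfpart = poly(fpart);                 // 2^fpart, minimax fit
 *       return expipart * expfpart;
 *    }
 */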
3073
3074
3075
3076 /**
3077 * Extract the exponent of an IEEE-754 floating point value.
3078 *
3079 * Optionally apply an integer bias.
3080 *
3081 * Result is an integer value with
3082 *
3083 * ifloor(log2(x)) + bias
3084 */
3085 LLVMValueRef
3086 lp_build_extract_exponent(struct lp_build_context *bld,
3087 LLVMValueRef x,
3088 int bias)
3089 {
3090 LLVMBuilderRef builder = bld->gallivm->builder;
3091 const struct lp_type type = bld->type;
3092 unsigned mantissa = lp_mantissa(type);
3093 LLVMValueRef res;
3094
3095 assert(type.floating);
3096
3097 assert(lp_check_value(bld->type, x));
3098
3099 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3100
3101 res = LLVMBuildLShr(builder, x,
3102 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3103 res = LLVMBuildAnd(builder, res,
3104 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3105 res = LLVMBuildSub(builder, res,
3106 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3107
3108 return res;
3109 }
3110
3111
3112 /**
3113 * Extract the mantissa of a floating point value.
3114 *
3115 * Result is a floating point value with
3116 *
3117 * x / 2^floor(log2(x))
3118 */
3119 LLVMValueRef
3120 lp_build_extract_mantissa(struct lp_build_context *bld,
3121 LLVMValueRef x)
3122 {
3123 LLVMBuilderRef builder = bld->gallivm->builder;
3124 const struct lp_type type = bld->type;
3125 unsigned mantissa = lp_mantissa(type);
3126 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3127 (1ULL << mantissa) - 1);
3128 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3129 LLVMValueRef res;
3130
3131 assert(lp_check_value(bld->type, x));
3132
3133 assert(type.floating);
3134
3135 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3136
3137 /* res = x / 2**ipart */
3138 res = LLVMBuildAnd(builder, x, mantmask, "");
3139 res = LLVMBuildOr(builder, res, one, "");
3140 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3141
3142 return res;
3143 }
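
/*
 * Editor's note: worked example for the two helpers above, using
 * x = 12.0f (bit pattern 0x41400000):
 *
 *    exponent field: (0x41400000 >> 23) & 0xff = 0x82 = 130
 *    lp_build_extract_exponent(x, 0) -> 130 - 127 = 3        (floor(log2(12)) == 3)
 *    lp_build_extract_mantissa(x)    -> mantissa bits | bits of 1.0
 *                                    -> 0x3fc00000 = 1.5f    (12 / 2^3 == 1.5)
 */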
3144
3145
3146
3147 /**
3148 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3149 * These coefficients can be generated with
3150 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3151 */
3152 const double lp_build_log2_polynomial[] = {
3153 #if LOG_POLY_DEGREE == 5
3154 2.88539008148777786488L,
3155 0.961796878841293367824L,
3156 0.577058946784739859012L,
3157 0.412914355135828735411L,
3158 0.308591899232910175289L,
3159 0.352376952300281371868L,
3160 #elif LOG_POLY_DEGREE == 4
3161 2.88539009343309178325L,
3162 0.961791550404184197881L,
3163 0.577440339438736392009L,
3164 0.403343858251329912514L,
3165 0.406718052498846252698L,
3166 #elif LOG_POLY_DEGREE == 3
3167 2.88538959748872753838L,
3168 0.961932915889597772928L,
3169 0.571118517972136195241L,
3170 0.493997535084709500285L,
3171 #else
3172 #error
3173 #endif
3174 };
3175
3176 /**
3177 * See http://www.devmaster.net/forums/showthread.php?p=43580
3178 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3179 * http://www.nezumi.demon.co.uk/consult/logx.htm
3180 *
3181 * If handle_edge_cases is true the function will perform computations
3182 * to match the required D3D10+ behavior for each of the edge cases.
3183 * That means that if input is:
3184 * - less than zero (down to and including -inf), then NaN will be returned
3185 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3186 * - +infinity, then +infinity will be returned
3187 * - NaN, then NaN will be returned
3188 *
3189 * Those checks are fairly expensive so if you don't need them make sure
3190 * handle_edge_cases is false.
3191 */
3192 void
3193 lp_build_log2_approx(struct lp_build_context *bld,
3194 LLVMValueRef x,
3195 LLVMValueRef *p_exp,
3196 LLVMValueRef *p_floor_log2,
3197 LLVMValueRef *p_log2,
3198 boolean handle_edge_cases)
3199 {
3200 LLVMBuilderRef builder = bld->gallivm->builder;
3201 const struct lp_type type = bld->type;
3202 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3203 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3204
3205 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3206 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3207 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3208
3209 LLVMValueRef i = NULL;
3210 LLVMValueRef y = NULL;
3211 LLVMValueRef z = NULL;
3212 LLVMValueRef exp = NULL;
3213 LLVMValueRef mant = NULL;
3214 LLVMValueRef logexp = NULL;
3215 LLVMValueRef logmant = NULL;
3216 LLVMValueRef res = NULL;
3217
3218 assert(lp_check_value(bld->type, x));
3219
3220 if(p_exp || p_floor_log2 || p_log2) {
3221 /* TODO: optimize the constant case */
3222 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3223 LLVMIsConstant(x)) {
3224 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3225 __FUNCTION__);
3226 }
3227
3228 assert(type.floating && type.width == 32);
3229
3230 /*
3231 * We don't explicitly handle denormalized numbers. They will yield a
3232 * result in the neighbourhood of -127, which appears to be adequate
3233 * enough.
3234 */
3235
3236 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3237
3238 /* exp = (float) exponent(x) */
3239 exp = LLVMBuildAnd(builder, i, expmask, "");
3240 }
3241
3242 if(p_floor_log2 || p_log2) {
3243 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3244 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3245 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3246 }
3247
3248 if (p_log2) {
3249 /* mant = 1 + (float) mantissa(x) */
3250 mant = LLVMBuildAnd(builder, i, mantmask, "");
3251 mant = LLVMBuildOr(builder, mant, one, "");
3252 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3253
3254 /* y = (mant - 1) / (mant + 1) */
3255 y = lp_build_div(bld,
3256 lp_build_sub(bld, mant, bld->one),
3257 lp_build_add(bld, mant, bld->one)
3258 );
3259
3260 /* z = y^2 */
3261 z = lp_build_mul(bld, y, y);
3262
3263 /* compute P(z) */
3264 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3265 ARRAY_SIZE(lp_build_log2_polynomial));
3266
3267 /* logmant = y * P(z) */
3268 logmant = lp_build_mul(bld, y, logmant);
3269
3270 res = lp_build_add(bld, logmant, logexp);
3271
3272 if (type.floating && handle_edge_cases) {
3273 LLVMValueRef negmask, infmask, zmask;
3274 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3275 lp_build_const_vec(bld->gallivm, type, 0.0f));
3276 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3277 lp_build_const_vec(bld->gallivm, type, 0.0f));
3278 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3279 lp_build_const_vec(bld->gallivm, type, INFINITY));
3280
3281 /* If x is equal to inf make sure we return inf */
3282 res = lp_build_select(bld, infmask,
3283 lp_build_const_vec(bld->gallivm, type, INFINITY),
3284 res);
3285 /* If x is equal to 0, return -inf */
3286 res = lp_build_select(bld, zmask,
3287 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3288 res);
3289 /* If x is nan or less than 0, return nan */
3290 res = lp_build_select(bld, negmask,
3291 lp_build_const_vec(bld->gallivm, type, NAN),
3292 res);
3293 }
3294 }
3295
3296 if (p_exp) {
3297 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3298 *p_exp = exp;
3299 }
3300
3301 if (p_floor_log2)
3302 *p_floor_log2 = logexp;
3303
3304 if (p_log2)
3305 *p_log2 = res;
3306 }
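
/*
 * Editor's note: the identity behind the mantissa computation above,
 * written out. With x = 2^e * m, m in [1, 2):
 *
 *    log2(x) = e + log2(m)
 *    log2(m) = (2/ln 2) * atanh(y) = y * P(y^2),   y = (m - 1)/(m + 1)
 *
 * where y^2 lies in [0, 1/9) and P is lp_build_log2_polynomial (note its
 * leading coefficient ~2.8854 == 2/ln 2).
 */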
3307
3308
3309 /*
3310 * log2 implementation which doesn't have special code to
3311 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3312 * the results for those cases are undefined.
3313 */
3314 LLVMValueRef
3315 lp_build_log2(struct lp_build_context *bld,
3316 LLVMValueRef x)
3317 {
3318 LLVMValueRef res;
3319 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3320 return res;
3321 }
3322
3323 /*
3324 * Version of log2 which handles all edge cases.
3325 * Look at documentation of lp_build_log2_approx for
3326 * description of the behavior for each of the edge cases.
3327 */
3328 LLVMValueRef
3329 lp_build_log2_safe(struct lp_build_context *bld,
3330 LLVMValueRef x)
3331 {
3332 LLVMValueRef res;
3333 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3334 return res;
3335 }
3336
3337
3338 /**
3339 * Faster (and less accurate) log2.
3340 *
3341 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3342 *
3343 * Piece-wise linear approximation, with exact results when x is a
3344 * power of two.
3345 *
3346 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3347 */
3348 LLVMValueRef
3349 lp_build_fast_log2(struct lp_build_context *bld,
3350 LLVMValueRef x)
3351 {
3352 LLVMBuilderRef builder = bld->gallivm->builder;
3353 LLVMValueRef ipart;
3354 LLVMValueRef fpart;
3355
3356 assert(lp_check_value(bld->type, x));
3357
3358 assert(bld->type.floating);
3359
3360 /* ipart = floor(log2(x)) - 1 */
3361 ipart = lp_build_extract_exponent(bld, x, -1);
3362 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3363
3364 /* fpart = x / 2**ipart */
3365 fpart = lp_build_extract_mantissa(bld, x);
3366
3367 /* ipart + fpart */
3368 return LLVMBuildFAdd(builder, ipart, fpart, "");
3369 }
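
/*
 * Editor's note: worked example of the approximation above, for
 * x = 12.0f:
 *
 *    ipart = floor(log2(12)) - 1 = 2
 *    fpart = 12 / 2^floor(log2(12)) = 12 / 8 = 1.5
 *    result = 3.5      (exact log2(12) ~= 3.585)
 *
 * For a power of two, e.g. x = 8.0f, fpart == 1.0 and the result is exact.
 */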
3370
3371
3372 /**
3373 * Fast implementation of iround(log2(x)).
3374 *
3375 * Not an approximation -- it should give accurate results all the time.
3376 */
3377 LLVMValueRef
3378 lp_build_ilog2(struct lp_build_context *bld,
3379 LLVMValueRef x)
3380 {
3381 LLVMBuilderRef builder = bld->gallivm->builder;
3382 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3383 LLVMValueRef ipart;
3384
3385 assert(bld->type.floating);
3386
3387 assert(lp_check_value(bld->type, x));
3388
3389 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3390 x = LLVMBuildFMul(builder, x, sqrt2, "");
3391
3392 /* ipart = floor(log2(x) + 0.5) */
3393 ipart = lp_build_extract_exponent(bld, x, 0);
3394
3395 return ipart;
3396 }
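
/*
 * Editor's note: why the sqrt(2) scaling gives round-to-nearest. Exponent
 * extraction alone yields floor(log2(x)); scaling x by 2^0.5 shifts log2
 * by 0.5, so
 *
 *    floor(log2(x * sqrt(2))) = floor(log2(x) + 0.5) = iround(log2(x))
 *
 * e.g. x = 5.0: log2(5) ~= 2.32, floor(2.32 + 0.5) = 2
 *      x = 6.0: log2(6) ~= 2.58, floor(2.58 + 0.5) = 3
 */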
3397
3398 LLVMValueRef
3399 lp_build_mod(struct lp_build_context *bld,
3400 LLVMValueRef x,
3401 LLVMValueRef y)
3402 {
3403 LLVMBuilderRef builder = bld->gallivm->builder;
3404 LLVMValueRef res;
3405 const struct lp_type type = bld->type;
3406
3407 assert(lp_check_value(type, x));
3408 assert(lp_check_value(type, y));
3409
3410 if (type.floating)
3411 res = LLVMBuildFRem(builder, x, y, "");
3412 else if (type.sign)
3413 res = LLVMBuildSRem(builder, x, y, "");
3414 else
3415 res = LLVMBuildURem(builder, x, y, "");
3416 return res;
3417 }
3418
3419
3420 /*
3421 * For floating inputs it creates and returns a mask
3422 * which is all 1's for channels which are NaN.
3423 * Channels inside x which are not NaN will be 0.
3424 */
3425 LLVMValueRef
3426 lp_build_isnan(struct lp_build_context *bld,
3427 LLVMValueRef x)
3428 {
3429 LLVMValueRef mask;
3430 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3431
3432 assert(bld->type.floating);
3433 assert(lp_check_value(bld->type, x));
3434
3435 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3436 "isnotnan");
3437 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3438 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3439 return mask;
3440 }
3441
3442 /* Returns all 1's for floating point values that are
3443 * finite, and returns all zeros for -inf,
3444 * +inf and NaN. */
3445 LLVMValueRef
3446 lp_build_isfinite(struct lp_build_context *bld,
3447 LLVMValueRef x)
3448 {
3449 LLVMBuilderRef builder = bld->gallivm->builder;
3450 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3451 struct lp_type int_type = lp_int_type(bld->type);
3452 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3453 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3454 0x7f800000);
3455
3456 if (!bld->type.floating) {
3457 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3458 }
3459 assert(bld->type.floating);
3460 assert(lp_check_value(bld->type, x));
3461 assert(bld->type.width == 32);
3462
3463 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3464 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3465 intx, infornan32);
3466 }
3467
3468 /*
3469 * Returns true if the number is nan or inf and false otherwise.
3470 * The input has to be a floating point vector.
3471 */
3472 LLVMValueRef
3473 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3474 const struct lp_type type,
3475 LLVMValueRef x)
3476 {
3477 LLVMBuilderRef builder = gallivm->builder;
3478 struct lp_type int_type = lp_int_type(type);
3479 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3480 0x7f800000);
3481 LLVMValueRef ret;
3482
3483 assert(type.floating);
3484
3485 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3486 ret = LLVMBuildAnd(builder, ret, const0, "");
3487 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3488 ret, const0);
3489
3490 return ret;
3491 }
3492
3493
3494 LLVMValueRef
3495 lp_build_fpstate_get(struct gallivm_state *gallivm)
3496 {
3497 if (util_cpu_caps.has_sse) {
3498 LLVMBuilderRef builder = gallivm->builder;
3499 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3500 gallivm,
3501 LLVMInt32TypeInContext(gallivm->context),
3502 "mxcsr_ptr");
3503 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3504 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3505 lp_build_intrinsic(builder,
3506 "llvm.x86.sse.stmxcsr",
3507 LLVMVoidTypeInContext(gallivm->context),
3508 &mxcsr_ptr8, 1, 0);
3509 return mxcsr_ptr;
3510 }
3511 return 0;
3512 }
3513
3514 void
3515 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3516 boolean zero)
3517 {
3518 if (util_cpu_caps.has_sse) {
3519 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3520 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3521
3522 LLVMBuilderRef builder = gallivm->builder;
3523 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3524 LLVMValueRef mxcsr =
3525 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3526
3527 if (util_cpu_caps.has_daz) {
3528 /* Enable denormals-are-zero mode */
3529 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3530 }
3531 if (zero) {
3532 mxcsr = LLVMBuildOr(builder, mxcsr,
3533 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3534 } else {
3535 mxcsr = LLVMBuildAnd(builder, mxcsr,
3536 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3537 }
3538
3539 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3540 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3541 }
3542 }
3543
3544 void
3545 lp_build_fpstate_set(struct gallivm_state *gallivm,
3546 LLVMValueRef mxcsr_ptr)
3547 {
3548 if (util_cpu_caps.has_sse) {
3549 LLVMBuilderRef builder = gallivm->builder;
3550 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3551 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3552 lp_build_intrinsic(builder,
3553 "llvm.x86.sse.ldmxcsr",
3554 LLVMVoidTypeInContext(gallivm->context),
3555 &mxcsr_ptr, 1, 0);
3556 }
3557 }
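
/*
 * Editor's note: a minimal usage sketch for the three MXCSR helpers above
 * (illustrative; this mirrors the typical save / set / restore pattern a
 * caller would emit around FP-heavy generated code):
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit the code that should run with FTZ/DAZ enabled ...
 *    lp_build_fpstate_set(gallivm, saved);
 *
 * FTZ is MXCSR bit 0x8000 and DAZ is 0x0040, so enabling both sets
 * 0x8040 == 32832, matching the comment in
 * lp_build_fpstate_set_denorms_zero().
 */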