gallivm: don't use integer min/max sse intrinsics with llvm >= 3.9
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31  * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85  * No checks for the special-case values a or b = 1 or 0 are done.
86  * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_sse2 && type.length >= 2) {
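      /*
       * Editorial note (assumption, tied to the commit subject above): from
       * LLVM 3.9 onwards the generic cmp+select fallback at the end of this
       * function is pattern-matched into the pmin/pmax instructions by the
       * x86 backend, so the explicit integer SSE intrinsics below are only
       * worth emitting for older LLVM versions.
       */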
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if (intrinsic) {
198       /* We need to handle NaNs for floating point numbers. If one of the
199        * inputs is NaN the other should be returned (required by both D3D10+
200        * and OpenCL).
201        * The SSE intrinsics return the second operand in case of NaN by
202        * default, so we need special code to handle those.
203        */
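      /*
       * Illustrative sketch (editorial addition, not in the original code):
       * minps/maxps compute "a OP b ? a : b", so a NaN in the second operand
       * is propagated while a NaN in the first is not:
       *
       *    minps(NaN, x) == x        minps(x, NaN) == NaN
       *
       * Hence GALLIVM_NAN_RETURN_OTHER only needs the isnan(b) select below,
       * and GALLIVM_NAN_RETURN_NAN only needs the isnan(a) select.
       */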
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
264
265 LLVMValueRef
266 lp_build_fmuladd(LLVMBuilderRef builder,
267 LLVMValueRef a,
268 LLVMValueRef b,
269 LLVMValueRef c)
270 {
271 LLVMTypeRef type = LLVMTypeOf(a);
272 assert(type == LLVMTypeOf(b));
273 assert(type == LLVMTypeOf(c));
274 if (HAVE_LLVM < 0x0304) {
275      /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
276       * not supported, and instead falls back to a C function.
277       */
278 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
279 }
280 char intrinsic[32];
281 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
282 LLVMValueRef args[] = { a, b, c };
283 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
284 }
285
286
287 /**
288 * Generate max(a, b)
289  * No checks for the special-case values a or b = 1 or 0 are done.
290  * NaNs are handled according to the behavior specified by the
291 * nan_behavior argument.
292 */
293 static LLVMValueRef
294 lp_build_max_simple(struct lp_build_context *bld,
295 LLVMValueRef a,
296 LLVMValueRef b,
297 enum gallivm_nan_behavior nan_behavior)
298 {
299 const struct lp_type type = bld->type;
300 const char *intrinsic = NULL;
301 unsigned intr_size = 0;
302 LLVMValueRef cond;
303
304 assert(lp_check_value(type, a));
305 assert(lp_check_value(type, b));
306
307 /* TODO: optimize the constant case */
308
309 if (type.floating && util_cpu_caps.has_sse) {
310 if (type.width == 32) {
311 if (type.length == 1) {
312 intrinsic = "llvm.x86.sse.max.ss";
313 intr_size = 128;
314 }
315 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
316 intrinsic = "llvm.x86.sse.max.ps";
317 intr_size = 128;
318 }
319 else {
320 intrinsic = "llvm.x86.avx.max.ps.256";
321 intr_size = 256;
322 }
323 }
324 if (type.width == 64 && util_cpu_caps.has_sse2) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse2.max.sd";
327 intr_size = 128;
328 }
329 else if (type.length == 2 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse2.max.pd";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.pd.256";
335 intr_size = 256;
336 }
337 }
338 }
339 else if (type.floating && util_cpu_caps.has_altivec) {
340 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
341 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
342 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
343 __FUNCTION__);
344 }
345      if (type.width == 32 && type.length == 4) {
346 intrinsic = "llvm.ppc.altivec.vmaxfp";
347 intr_size = 128;
348 }
349 } else if (HAVE_LLVM < 0x0309 &&
350 util_cpu_caps.has_sse2 && type.length >= 2) {
351 intr_size = 128;
352 if ((type.width == 8 || type.width == 16) &&
353 (type.width * type.length <= 64) &&
354 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
355 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
356 __FUNCTION__);
357 }
358 if (type.width == 8 && !type.sign) {
359 intrinsic = "llvm.x86.sse2.pmaxu.b";
360 intr_size = 128;
361 }
362 else if (type.width == 16 && type.sign) {
363 intrinsic = "llvm.x86.sse2.pmaxs.w";
364 }
365 if (util_cpu_caps.has_sse4_1) {
366 if (type.width == 8 && type.sign) {
367 intrinsic = "llvm.x86.sse41.pmaxsb";
368 }
369 if (type.width == 16 && !type.sign) {
370 intrinsic = "llvm.x86.sse41.pmaxuw";
371 }
372 if (type.width == 32 && !type.sign) {
373 intrinsic = "llvm.x86.sse41.pmaxud";
374 }
375 if (type.width == 32 && type.sign) {
376 intrinsic = "llvm.x86.sse41.pmaxsd";
377 }
378 }
379 } else if (util_cpu_caps.has_altivec) {
380 intr_size = 128;
381 if (type.width == 8) {
382 if (!type.sign) {
383 intrinsic = "llvm.ppc.altivec.vmaxub";
384 } else {
385 intrinsic = "llvm.ppc.altivec.vmaxsb";
386 }
387 } else if (type.width == 16) {
388 if (!type.sign) {
389 intrinsic = "llvm.ppc.altivec.vmaxuh";
390 } else {
391 intrinsic = "llvm.ppc.altivec.vmaxsh";
392 }
393 } else if (type.width == 32) {
394 if (!type.sign) {
395 intrinsic = "llvm.ppc.altivec.vmaxuw";
396 } else {
397 intrinsic = "llvm.ppc.altivec.vmaxsw";
398 }
399 }
400 }
401
402 if (intrinsic) {
403 if (util_cpu_caps.has_sse && type.floating &&
404 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
405 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
406 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
407 LLVMValueRef isnan, max;
408 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
409 type,
410 intr_size, a, b);
411 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
412 isnan = lp_build_isnan(bld, b);
413 return lp_build_select(bld, isnan, a, max);
414 } else {
415 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
416 isnan = lp_build_isnan(bld, a);
417 return lp_build_select(bld, isnan, a, max);
418 }
419 } else {
420 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
421 type,
422 intr_size, a, b);
423 }
424 }
425
426 if (type.floating) {
427 switch (nan_behavior) {
428 case GALLIVM_NAN_RETURN_NAN: {
429 LLVMValueRef isnan = lp_build_isnan(bld, b);
430 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
431 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
432 return lp_build_select(bld, cond, a, b);
433 }
434 break;
435 case GALLIVM_NAN_RETURN_OTHER: {
436 LLVMValueRef isnan = lp_build_isnan(bld, a);
437 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
438 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
439 return lp_build_select(bld, cond, a, b);
440 }
441 break;
442 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
443 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
444 return lp_build_select(bld, cond, a, b);
445 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
446 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
447 return lp_build_select(bld, cond, b, a);
448 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
449 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
450 return lp_build_select(bld, cond, a, b);
451 break;
452 default:
453 assert(0);
454 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
455 return lp_build_select(bld, cond, a, b);
456 }
457 } else {
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 return lp_build_select(bld, cond, a, b);
460 }
461 }
462
463
464 /**
465 * Generate 1 - a, or ~a depending on bld->type.
466 */
467 LLVMValueRef
468 lp_build_comp(struct lp_build_context *bld,
469 LLVMValueRef a)
470 {
471 LLVMBuilderRef builder = bld->gallivm->builder;
472 const struct lp_type type = bld->type;
473
474 assert(lp_check_value(type, a));
475
476 if(a == bld->one)
477 return bld->zero;
478 if(a == bld->zero)
479 return bld->one;
480
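   /*
    * Editorial note: for an unsigned normalized type, 1.0 is encoded as all
    * ones (e.g. 0xff for 8 bits), so 1 - a == 0xff - a == ~a; that is why
    * the branch below can use a single bitwise NOT.
    */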
481 if(type.norm && !type.floating && !type.fixed && !type.sign) {
482 if(LLVMIsConstant(a))
483 return LLVMConstNot(a);
484 else
485 return LLVMBuildNot(builder, a, "");
486 }
487
488 if(LLVMIsConstant(a))
489 if (type.floating)
490 return LLVMConstFSub(bld->one, a);
491 else
492 return LLVMConstSub(bld->one, a);
493 else
494 if (type.floating)
495 return LLVMBuildFSub(builder, bld->one, a, "");
496 else
497 return LLVMBuildSub(builder, bld->one, a, "");
498 }
499
500
501 /**
502 * Generate a + b
503 */
504 LLVMValueRef
505 lp_build_add(struct lp_build_context *bld,
506 LLVMValueRef a,
507 LLVMValueRef b)
508 {
509 LLVMBuilderRef builder = bld->gallivm->builder;
510 const struct lp_type type = bld->type;
511 LLVMValueRef res;
512
513 assert(lp_check_value(type, a));
514 assert(lp_check_value(type, b));
515
516 if(a == bld->zero)
517 return b;
518 if(b == bld->zero)
519 return a;
520 if(a == bld->undef || b == bld->undef)
521 return bld->undef;
522
523 if(bld->type.norm) {
524 const char *intrinsic = NULL;
525
526 if(a == bld->one || b == bld->one)
527 return bld->one;
528
529 if (type.width * type.length == 128 &&
530 !type.floating && !type.fixed) {
531 if(util_cpu_caps.has_sse2) {
532 if(type.width == 8)
533 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
534 if(type.width == 16)
535 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
536 } else if (util_cpu_caps.has_altivec) {
537 if(type.width == 8)
538 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
539 if(type.width == 16)
540 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
541 }
542 }
543
544 if (intrinsic)
545 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
546 }
547
548 if(type.norm && !type.floating && !type.fixed) {
549 if (type.sign) {
550 uint64_t sign = (uint64_t)1 << (type.width - 1);
551 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
552 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
553 /* a_clamp_max is the maximum a for positive b,
554 a_clamp_min is the minimum a for negative b. */
555 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
556 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
557 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
558 } else {
559 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
560 }
561 }
562
563 if(LLVMIsConstant(a) && LLVMIsConstant(b))
564 if (type.floating)
565 res = LLVMConstFAdd(a, b);
566 else
567 res = LLVMConstAdd(a, b);
568 else
569 if (type.floating)
570 res = LLVMBuildFAdd(builder, a, b, "");
571 else
572 res = LLVMBuildAdd(builder, a, b, "");
573
574 /* clamp to ceiling of 1.0 */
575 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
576 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
577
578 /* XXX clamp to floor of -1 or 0??? */
579
580 return res;
581 }
582
583
584 /** Return the scalar sum of the elements of a.
585 * Should avoid this operation whenever possible.
586  * This operation should be avoided whenever possible.
587 LLVMValueRef
588 lp_build_horizontal_add(struct lp_build_context *bld,
589 LLVMValueRef a)
590 {
591 LLVMBuilderRef builder = bld->gallivm->builder;
592 const struct lp_type type = bld->type;
593 LLVMValueRef index, res;
594 unsigned i, length;
595 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
596 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
597 LLVMValueRef vecres, elem2;
598
599 assert(lp_check_value(type, a));
600
601 if (type.length == 1) {
602 return a;
603 }
604
605 assert(!bld->type.norm);
606
607 /*
608    * For byte vectors we could do much better with psadbw.
609    * We use repeated shuffle/adds here. Note that with multiple vectors
610    * this can be done more efficiently, as outlined in the Intel
611    * optimization manual.
612 * Note: could cause data rearrangement if used with smaller element
613 * sizes.
614 */
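   /*
    * Editorial sketch of the reduction below for a 4-wide vector {a0,a1,a2,a3}:
    *   iteration 1: vec1 = {a0,a1}, vec2 = {a2,a3}, vecres = {a0+a2, a1+a3}
    *   epilogue:    res  = (a0+a2) + (a1+a3)
    */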
615
616 vecres = a;
617 length = type.length / 2;
618 while (length > 1) {
619 LLVMValueRef vec1, vec2;
620 for (i = 0; i < length; i++) {
621 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
622 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
623 }
624 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
625 LLVMConstVector(shuffles1, length), "");
626 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
627 LLVMConstVector(shuffles2, length), "");
628 if (type.floating) {
629 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
630 }
631 else {
632 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
633 }
634 length = length >> 1;
635 }
636
637 /* always have vector of size 2 here */
638 assert(length == 1);
639
640 index = lp_build_const_int32(bld->gallivm, 0);
641 res = LLVMBuildExtractElement(builder, vecres, index, "");
642 index = lp_build_const_int32(bld->gallivm, 1);
643 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
644
645 if (type.floating)
646 res = LLVMBuildFAdd(builder, res, elem2, "");
647 else
648 res = LLVMBuildAdd(builder, res, elem2, "");
649
650 return res;
651 }
652
653 /**
654 * Return the horizontal sums of 4 float vectors as a float4 vector.
655 * This uses the technique as outlined in Intel Optimization Manual.
656 */
657 static LLVMValueRef
658 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
659 LLVMValueRef src[4])
660 {
661 struct gallivm_state *gallivm = bld->gallivm;
662 LLVMBuilderRef builder = gallivm->builder;
663 LLVMValueRef shuffles[4];
664 LLVMValueRef tmp[4];
665 LLVMValueRef sumtmp[2], shuftmp[2];
666
667 /* lower half of regs */
668 shuffles[0] = lp_build_const_int32(gallivm, 0);
669 shuffles[1] = lp_build_const_int32(gallivm, 1);
670 shuffles[2] = lp_build_const_int32(gallivm, 4);
671 shuffles[3] = lp_build_const_int32(gallivm, 5);
672 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
673 LLVMConstVector(shuffles, 4), "");
674 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
675 LLVMConstVector(shuffles, 4), "");
676
677 /* upper half of regs */
678 shuffles[0] = lp_build_const_int32(gallivm, 2);
679 shuffles[1] = lp_build_const_int32(gallivm, 3);
680 shuffles[2] = lp_build_const_int32(gallivm, 6);
681 shuffles[3] = lp_build_const_int32(gallivm, 7);
682 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
683 LLVMConstVector(shuffles, 4), "");
684 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
685 LLVMConstVector(shuffles, 4), "");
686
687 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
688 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
689
690 shuffles[0] = lp_build_const_int32(gallivm, 0);
691 shuffles[1] = lp_build_const_int32(gallivm, 2);
692 shuffles[2] = lp_build_const_int32(gallivm, 4);
693 shuffles[3] = lp_build_const_int32(gallivm, 6);
694 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
695 LLVMConstVector(shuffles, 4), "");
696
697 shuffles[0] = lp_build_const_int32(gallivm, 1);
698 shuffles[1] = lp_build_const_int32(gallivm, 3);
699 shuffles[2] = lp_build_const_int32(gallivm, 5);
700 shuffles[3] = lp_build_const_int32(gallivm, 7);
701 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
702 LLVMConstVector(shuffles, 4), "");
703
704 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
705 }
706
707
708 /*
709 * partially horizontally add 2-4 float vectors with length nx4,
710 * i.e. only four adjacent values in each vector will be added,
711 * assuming values are really grouped in 4 which also determines
712 * output order.
713 *
714 * Return a vector of the same length as the initial vectors,
715 * with the excess elements (if any) being undefined.
716 * The element order is independent of number of input vectors.
717 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
718 * the output order thus will be
719  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
720 */
721 LLVMValueRef
722 lp_build_hadd_partial4(struct lp_build_context *bld,
723 LLVMValueRef vectors[],
724 unsigned num_vecs)
725 {
726 struct gallivm_state *gallivm = bld->gallivm;
727 LLVMBuilderRef builder = gallivm->builder;
728 LLVMValueRef ret_vec;
729 LLVMValueRef tmp[4];
730 const char *intrinsic = NULL;
731
732 assert(num_vecs >= 2 && num_vecs <= 4);
733 assert(bld->type.floating);
734
735 /* only use this with at least 2 vectors, as it is sort of expensive
736 * (depending on cpu) and we always need two horizontal adds anyway,
737 * so a shuffle/add approach might be better.
738 */
739
740 tmp[0] = vectors[0];
741 tmp[1] = vectors[1];
742
743 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
744 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
745
746 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
747 bld->type.length == 4) {
748 intrinsic = "llvm.x86.sse3.hadd.ps";
749 }
750 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
751 bld->type.length == 8) {
752 intrinsic = "llvm.x86.avx.hadd.ps.256";
753 }
754 if (intrinsic) {
755 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
756 lp_build_vec_type(gallivm, bld->type),
757 tmp[0], tmp[1]);
758 if (num_vecs > 2) {
759 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
760 lp_build_vec_type(gallivm, bld->type),
761 tmp[2], tmp[3]);
762 }
763 else {
764 tmp[1] = tmp[0];
765 }
766 return lp_build_intrinsic_binary(builder, intrinsic,
767 lp_build_vec_type(gallivm, bld->type),
768 tmp[0], tmp[1]);
769 }
770
771 if (bld->type.length == 4) {
772 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
773 }
774 else {
775 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
776 unsigned j;
777 unsigned num_iter = bld->type.length / 4;
778 struct lp_type parttype = bld->type;
779 parttype.length = 4;
780 for (j = 0; j < num_iter; j++) {
781 LLVMValueRef partsrc[4];
782 unsigned i;
783 for (i = 0; i < 4; i++) {
784 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
785 }
786 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
787 }
788 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
789 }
790 return ret_vec;
791 }
792
793 /**
794 * Generate a - b
795 */
796 LLVMValueRef
797 lp_build_sub(struct lp_build_context *bld,
798 LLVMValueRef a,
799 LLVMValueRef b)
800 {
801 LLVMBuilderRef builder = bld->gallivm->builder;
802 const struct lp_type type = bld->type;
803 LLVMValueRef res;
804
805 assert(lp_check_value(type, a));
806 assert(lp_check_value(type, b));
807
808 if(b == bld->zero)
809 return a;
810 if(a == bld->undef || b == bld->undef)
811 return bld->undef;
812 if(a == b)
813 return bld->zero;
814
815 if(bld->type.norm) {
816 const char *intrinsic = NULL;
817
818 if(b == bld->one)
819 return bld->zero;
820
821 if (type.width * type.length == 128 &&
822 !type.floating && !type.fixed) {
823 if (util_cpu_caps.has_sse2) {
824 if(type.width == 8)
825 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
826 if(type.width == 16)
827 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
828 } else if (util_cpu_caps.has_altivec) {
829 if(type.width == 8)
830 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
831 if(type.width == 16)
832 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
833 }
834 }
835
836 if (intrinsic)
837 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
838 }
839
840 if(type.norm && !type.floating && !type.fixed) {
841 if (type.sign) {
842 uint64_t sign = (uint64_t)1 << (type.width - 1);
843 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
844 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
845 /* a_clamp_max is the maximum a for negative b,
846 a_clamp_min is the minimum a for positive b. */
847 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
848 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
849 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
850 } else {
851 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
852 }
853 }
854
855 if(LLVMIsConstant(a) && LLVMIsConstant(b))
856 if (type.floating)
857 res = LLVMConstFSub(a, b);
858 else
859 res = LLVMConstSub(a, b);
860 else
861 if (type.floating)
862 res = LLVMBuildFSub(builder, a, b, "");
863 else
864 res = LLVMBuildSub(builder, a, b, "");
865
866 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
867 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
868
869 return res;
870 }
871
872
873
874 /**
875 * Normalized multiplication.
876 *
877 * There are several approaches for (using 8-bit normalized multiplication as
878 * an example):
879 *
880 * - alpha plus one
881 *
882 * makes the following approximation to the division (Sree)
883 *
884  *     a*b/255 ~= (a*(b + 1)) >> 8
885 *
886 * which is the fastest method that satisfies the following OpenGL criteria of
887 *
888 * 0*0 = 0 and 255*255 = 255
889 *
890 * - geometric series
891 *
892 * takes the geometric series approximation to the division
893 *
894 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
895 *
896 * in this case just the first two terms to fit in 16bit arithmetic
897 *
898 * t/255 ~= (t + (t >> 8)) >> 8
899 *
900  * note that just by itself it doesn't satisfy the OpenGL criteria, as
901  * 255*255 = 254, so the special case b = 255 must be accounted for, or
902  * roundoff must be used.
903 *
904 * - geometric series plus rounding
905 *
906 * when using a geometric series division instead of truncating the result
907 * use roundoff in the approximation (Jim Blinn)
908 *
909 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
910 *
911  * achieving exact results.
912 *
913 *
914 *
915 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
916 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
917 * @sa Michael Herf, The "double blend trick", May 2000,
918 * http://www.stereopsis.com/doubleblend.html
919 */
920 static LLVMValueRef
921 lp_build_mul_norm(struct gallivm_state *gallivm,
922 struct lp_type wide_type,
923 LLVMValueRef a, LLVMValueRef b)
924 {
925 LLVMBuilderRef builder = gallivm->builder;
926 struct lp_build_context bld;
927 unsigned n;
928 LLVMValueRef half;
929 LLVMValueRef ab;
930
931 assert(!wide_type.floating);
932 assert(lp_check_value(wide_type, a));
933 assert(lp_check_value(wide_type, b));
934
935 lp_build_context_init(&bld, gallivm, wide_type);
936
937 n = wide_type.width / 2;
938 if (wide_type.sign) {
939 --n;
940 }
941
942 /*
943 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
944 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
945 */
946
947 /*
948 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
949 */
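   /*
    * Worked example (editorial, unsigned 8-bit case: n = 8, half = 0x80):
    *   a = b = 255: ab = 65025; 65025 + (65025 >> 8) + 0x80 = 65407;
    *                65407 >> 8 = 255, so 255 * 255 stays exactly 255.
    *   a = 255, b = 0: (0 + 0 + 0x80) >> 8 = 0.
    */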
950
951 ab = LLVMBuildMul(builder, a, b, "");
952 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
953
954 /*
955 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
956 */
957
958 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
959 if (wide_type.sign) {
960 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
961 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
962 half = lp_build_select(&bld, sign, minus_half, half);
963 }
964 ab = LLVMBuildAdd(builder, ab, half, "");
965
966 /* Final division */
967 ab = lp_build_shr_imm(&bld, ab, n);
968
969 return ab;
970 }
971
972 /**
973 * Generate a * b
974 */
975 LLVMValueRef
976 lp_build_mul(struct lp_build_context *bld,
977 LLVMValueRef a,
978 LLVMValueRef b)
979 {
980 LLVMBuilderRef builder = bld->gallivm->builder;
981 const struct lp_type type = bld->type;
982 LLVMValueRef shift;
983 LLVMValueRef res;
984
985 assert(lp_check_value(type, a));
986 assert(lp_check_value(type, b));
987
988 if(a == bld->zero)
989 return bld->zero;
990 if(a == bld->one)
991 return b;
992 if(b == bld->zero)
993 return bld->zero;
994 if(b == bld->one)
995 return a;
996 if(a == bld->undef || b == bld->undef)
997 return bld->undef;
998
999 if (!type.floating && !type.fixed && type.norm) {
1000 struct lp_type wide_type = lp_wider_type(type);
1001 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1002
1003 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
1004 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
1005
1006 /* PMULLW, PSRLW, PADDW */
1007 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1008 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1009
1010 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
1011
1012 return ab;
1013 }
1014
1015 if(type.fixed)
1016 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1017 else
1018 shift = NULL;
1019
1020 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1021 if (type.floating)
1022 res = LLVMConstFMul(a, b);
1023 else
1024 res = LLVMConstMul(a, b);
1025 if(shift) {
1026 if(type.sign)
1027 res = LLVMConstAShr(res, shift);
1028 else
1029 res = LLVMConstLShr(res, shift);
1030 }
1031 }
1032 else {
1033 if (type.floating)
1034 res = LLVMBuildFMul(builder, a, b, "");
1035 else
1036 res = LLVMBuildMul(builder, a, b, "");
1037 if(shift) {
1038 if(type.sign)
1039 res = LLVMBuildAShr(builder, res, shift, "");
1040 else
1041 res = LLVMBuildLShr(builder, res, shift, "");
1042 }
1043 }
1044
1045 return res;
1046 }
1047
1048
1049 /* a * b + c */
1050 LLVMValueRef
1051 lp_build_mad(struct lp_build_context *bld,
1052 LLVMValueRef a,
1053 LLVMValueRef b,
1054 LLVMValueRef c)
1055 {
1056 const struct lp_type type = bld->type;
1057 if (type.floating) {
1058 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1059 } else {
1060 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1061 }
1062 }
1063
1064
1065 /**
1066 * Small vector x scale multiplication optimization.
1067 */
1068 LLVMValueRef
1069 lp_build_mul_imm(struct lp_build_context *bld,
1070 LLVMValueRef a,
1071 int b)
1072 {
1073 LLVMBuilderRef builder = bld->gallivm->builder;
1074 LLVMValueRef factor;
1075
1076 assert(lp_check_value(bld->type, a));
1077
1078 if(b == 0)
1079 return bld->zero;
1080
1081 if(b == 1)
1082 return a;
1083
1084 if(b == -1)
1085 return lp_build_negate(bld, a);
1086
1087 if(b == 2 && bld->type.floating)
1088 return lp_build_add(bld, a, a);
1089
1090 if(util_is_power_of_two(b)) {
1091 unsigned shift = ffs(b) - 1;
1092
1093 if(bld->type.floating) {
1094 #if 0
1095 /*
1096 * Power of two multiplication by directly manipulating the exponent.
1097 *
1098 * XXX: This might not be always faster, it will introduce a small error
1099 * for multiplication by zero, and it will produce wrong results
1100 * for Inf and NaN.
1101 */
1102 unsigned mantissa = lp_mantissa(bld->type);
1103 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1104 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1105 a = LLVMBuildAdd(builder, a, factor, "");
1106 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1107 return a;
1108 #endif
1109 }
1110 else {
1111 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1112 return LLVMBuildShl(builder, a, factor, "");
1113 }
1114 }
1115
1116 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1117 return lp_build_mul(bld, a, factor);
1118 }
1119
1120
1121 /**
1122 * Generate a / b
1123 */
1124 LLVMValueRef
1125 lp_build_div(struct lp_build_context *bld,
1126 LLVMValueRef a,
1127 LLVMValueRef b)
1128 {
1129 LLVMBuilderRef builder = bld->gallivm->builder;
1130 const struct lp_type type = bld->type;
1131
1132 assert(lp_check_value(type, a));
1133 assert(lp_check_value(type, b));
1134
1135 if(a == bld->zero)
1136 return bld->zero;
1137 if(a == bld->one && type.floating)
1138 return lp_build_rcp(bld, b);
1139 if(b == bld->zero)
1140 return bld->undef;
1141 if(b == bld->one)
1142 return a;
1143 if(a == bld->undef || b == bld->undef)
1144 return bld->undef;
1145
1146 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1147 if (type.floating)
1148 return LLVMConstFDiv(a, b);
1149 else if (type.sign)
1150 return LLVMConstSDiv(a, b);
1151 else
1152 return LLVMConstUDiv(a, b);
1153 }
1154
1155 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1156 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1157 type.floating)
1158 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1159
1160 if (type.floating)
1161 return LLVMBuildFDiv(builder, a, b, "");
1162 else if (type.sign)
1163 return LLVMBuildSDiv(builder, a, b, "");
1164 else
1165 return LLVMBuildUDiv(builder, a, b, "");
1166 }
1167
1168
1169 /**
1170 * Linear interpolation helper.
1171 *
1172  * @param flags  if LP_BLD_LERP_WIDE_NORMALIZED is set, we are interpolating
1173  *               normalized values encoded in integers twice as wide.
1174 *
1175 * @sa http://www.stereopsis.com/doubleblend.html
1176 */
1177 static inline LLVMValueRef
1178 lp_build_lerp_simple(struct lp_build_context *bld,
1179 LLVMValueRef x,
1180 LLVMValueRef v0,
1181 LLVMValueRef v1,
1182 unsigned flags)
1183 {
1184 unsigned half_width = bld->type.width/2;
1185 LLVMBuilderRef builder = bld->gallivm->builder;
1186 LLVMValueRef delta;
1187 LLVMValueRef res;
1188
1189 assert(lp_check_value(bld->type, x));
1190 assert(lp_check_value(bld->type, v0));
1191 assert(lp_check_value(bld->type, v1));
1192
1193 delta = lp_build_sub(bld, v1, v0);
1194
1195 if (bld->type.floating) {
1196 assert(flags == 0);
1197 return lp_build_mad(bld, x, delta, v0);
1198 }
1199
1200 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1201 if (!bld->type.sign) {
1202 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1203 /*
1204 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1205 * most-significant-bit to the lowest-significant-bit, so that
1206 * later we can just divide by 2**n instead of 2**n - 1.
1207 */
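            /*
             * Editorial example (half_width = 8): x = 255 becomes
             * 255 + (255 >> 7) = 256, so the shift by 8 below divides by 256
             * exactly and a full weight contributes the whole delta.
             */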
1208
1209 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1210 }
1211
1212 /* (x * delta) >> n */
1213 res = lp_build_mul(bld, x, delta);
1214 res = lp_build_shr_imm(bld, res, half_width);
1215 } else {
1216 /*
1217 * The rescaling trick above doesn't work for signed numbers, so
1218          * use the 2**n - 1 division approximation in lp_build_mul_norm
1219 * instead.
1220 */
1221 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1222 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1223 }
1224 } else {
1225 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1226 res = lp_build_mul(bld, x, delta);
1227 }
1228
1229 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1230 /*
1231 * At this point both res and v0 only use the lower half of the bits,
1232 * the rest is zero. Instead of add / mask, do add with half wide type.
1233 */
1234 struct lp_type narrow_type;
1235 struct lp_build_context narrow_bld;
1236
1237 memset(&narrow_type, 0, sizeof narrow_type);
1238 narrow_type.sign = bld->type.sign;
1239 narrow_type.width = bld->type.width/2;
1240 narrow_type.length = bld->type.length*2;
1241
1242 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1243 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1244 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1245 res = lp_build_add(&narrow_bld, v0, res);
1246 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1247 } else {
1248 res = lp_build_add(bld, v0, res);
1249
1250 if (bld->type.fixed) {
1251 /*
1252           * We need to mask out the high order bits when lerping 8-bit
1253           * normalized colors stored in 16 bits
1254           */
1255          /* XXX: This step is necessary for lerping 8-bit colors stored in
1256           * 16 bits, but it will be wrong for true fixed point use cases.
1257 * Basically we need a more powerful lp_type, capable of further
1258 * distinguishing the values interpretation from the value storage.
1259 */
1260 LLVMValueRef low_bits;
1261 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1262 res = LLVMBuildAnd(builder, res, low_bits, "");
1263 }
1264 }
1265
1266 return res;
1267 }
1268
1269
1270 /**
1271 * Linear interpolation.
1272 */
1273 LLVMValueRef
1274 lp_build_lerp(struct lp_build_context *bld,
1275 LLVMValueRef x,
1276 LLVMValueRef v0,
1277 LLVMValueRef v1,
1278 unsigned flags)
1279 {
1280 const struct lp_type type = bld->type;
1281 LLVMValueRef res;
1282
1283 assert(lp_check_value(type, x));
1284 assert(lp_check_value(type, v0));
1285 assert(lp_check_value(type, v1));
1286
1287 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1288
1289 if (type.norm) {
1290 struct lp_type wide_type;
1291 struct lp_build_context wide_bld;
1292 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1293
1294 assert(type.length >= 2);
1295
1296 /*
1297 * Create a wider integer type, enough to hold the
1298 * intermediate result of the multiplication.
1299 */
1300 memset(&wide_type, 0, sizeof wide_type);
1301 wide_type.sign = type.sign;
1302 wide_type.width = type.width*2;
1303 wide_type.length = type.length/2;
1304
1305 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1306
1307 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1308 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1309 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1310
1311 /*
1312 * Lerp both halves.
1313 */
1314
1315 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1316
1317 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1318 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1319
1320 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1321 } else {
1322 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1323 }
1324
1325 return res;
1326 }
1327
1328
1329 /**
1330 * Bilinear interpolation.
1331 *
1332  * Value indices are in v_{yx}.
1333 */
1334 LLVMValueRef
1335 lp_build_lerp_2d(struct lp_build_context *bld,
1336 LLVMValueRef x,
1337 LLVMValueRef y,
1338 LLVMValueRef v00,
1339 LLVMValueRef v01,
1340 LLVMValueRef v10,
1341 LLVMValueRef v11,
1342 unsigned flags)
1343 {
1344 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1345 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1346 return lp_build_lerp(bld, y, v0, v1, flags);
1347 }
1348
1349
1350 LLVMValueRef
1351 lp_build_lerp_3d(struct lp_build_context *bld,
1352 LLVMValueRef x,
1353 LLVMValueRef y,
1354 LLVMValueRef z,
1355 LLVMValueRef v000,
1356 LLVMValueRef v001,
1357 LLVMValueRef v010,
1358 LLVMValueRef v011,
1359 LLVMValueRef v100,
1360 LLVMValueRef v101,
1361 LLVMValueRef v110,
1362 LLVMValueRef v111,
1363 unsigned flags)
1364 {
1365 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1366 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1367 return lp_build_lerp(bld, z, v0, v1, flags);
1368 }
1369
1370
1371 /**
1372 * Generate min(a, b)
1373 * Do checks for special cases but not for nans.
1374 */
1375 LLVMValueRef
1376 lp_build_min(struct lp_build_context *bld,
1377 LLVMValueRef a,
1378 LLVMValueRef b)
1379 {
1380 assert(lp_check_value(bld->type, a));
1381 assert(lp_check_value(bld->type, b));
1382
1383 if(a == bld->undef || b == bld->undef)
1384 return bld->undef;
1385
1386 if(a == b)
1387 return a;
1388
1389 if (bld->type.norm) {
1390 if (!bld->type.sign) {
1391 if (a == bld->zero || b == bld->zero) {
1392 return bld->zero;
1393 }
1394 }
1395 if(a == bld->one)
1396 return b;
1397 if(b == bld->one)
1398 return a;
1399 }
1400
1401 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1402 }
1403
1404
1405 /**
1406 * Generate min(a, b)
1407  * NaNs are handled according to the behavior specified by the
1408 * nan_behavior argument.
1409 */
1410 LLVMValueRef
1411 lp_build_min_ext(struct lp_build_context *bld,
1412 LLVMValueRef a,
1413 LLVMValueRef b,
1414 enum gallivm_nan_behavior nan_behavior)
1415 {
1416 assert(lp_check_value(bld->type, a));
1417 assert(lp_check_value(bld->type, b));
1418
1419 if(a == bld->undef || b == bld->undef)
1420 return bld->undef;
1421
1422 if(a == b)
1423 return a;
1424
1425 if (bld->type.norm) {
1426 if (!bld->type.sign) {
1427 if (a == bld->zero || b == bld->zero) {
1428 return bld->zero;
1429 }
1430 }
1431 if(a == bld->one)
1432 return b;
1433 if(b == bld->one)
1434 return a;
1435 }
1436
1437 return lp_build_min_simple(bld, a, b, nan_behavior);
1438 }
1439
1440 /**
1441 * Generate max(a, b)
1442 * Do checks for special cases, but NaN behavior is undefined.
1443 */
1444 LLVMValueRef
1445 lp_build_max(struct lp_build_context *bld,
1446 LLVMValueRef a,
1447 LLVMValueRef b)
1448 {
1449 assert(lp_check_value(bld->type, a));
1450 assert(lp_check_value(bld->type, b));
1451
1452 if(a == bld->undef || b == bld->undef)
1453 return bld->undef;
1454
1455 if(a == b)
1456 return a;
1457
1458 if(bld->type.norm) {
1459 if(a == bld->one || b == bld->one)
1460 return bld->one;
1461 if (!bld->type.sign) {
1462 if (a == bld->zero) {
1463 return b;
1464 }
1465 if (b == bld->zero) {
1466 return a;
1467 }
1468 }
1469 }
1470
1471 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1472 }
1473
1474
1475 /**
1476 * Generate max(a, b)
1477 * Checks for special cases.
1478  * NaNs are handled according to the behavior specified by the
1479 * nan_behavior argument.
1480 */
1481 LLVMValueRef
1482 lp_build_max_ext(struct lp_build_context *bld,
1483 LLVMValueRef a,
1484 LLVMValueRef b,
1485 enum gallivm_nan_behavior nan_behavior)
1486 {
1487 assert(lp_check_value(bld->type, a));
1488 assert(lp_check_value(bld->type, b));
1489
1490 if(a == bld->undef || b == bld->undef)
1491 return bld->undef;
1492
1493 if(a == b)
1494 return a;
1495
1496 if(bld->type.norm) {
1497 if(a == bld->one || b == bld->one)
1498 return bld->one;
1499 if (!bld->type.sign) {
1500 if (a == bld->zero) {
1501 return b;
1502 }
1503 if (b == bld->zero) {
1504 return a;
1505 }
1506 }
1507 }
1508
1509 return lp_build_max_simple(bld, a, b, nan_behavior);
1510 }
1511
1512 /**
1513 * Generate clamp(a, min, max)
1514 * NaN behavior (for any of a, min, max) is undefined.
1515 * Do checks for special cases.
1516 */
1517 LLVMValueRef
1518 lp_build_clamp(struct lp_build_context *bld,
1519 LLVMValueRef a,
1520 LLVMValueRef min,
1521 LLVMValueRef max)
1522 {
1523 assert(lp_check_value(bld->type, a));
1524 assert(lp_check_value(bld->type, min));
1525 assert(lp_check_value(bld->type, max));
1526
1527 a = lp_build_min(bld, a, max);
1528 a = lp_build_max(bld, a, min);
1529 return a;
1530 }
1531
1532
1533 /**
1534 * Generate clamp(a, 0, 1)
1535 * A NaN will get converted to zero.
1536 */
1537 LLVMValueRef
1538 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1539 LLVMValueRef a)
1540 {
1541 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1542 a = lp_build_min(bld, a, bld->one);
1543 return a;
1544 }
1545
1546
1547 /**
1548 * Generate abs(a)
1549 */
1550 LLVMValueRef
1551 lp_build_abs(struct lp_build_context *bld,
1552 LLVMValueRef a)
1553 {
1554 LLVMBuilderRef builder = bld->gallivm->builder;
1555 const struct lp_type type = bld->type;
1556 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1557
1558 assert(lp_check_value(type, a));
1559
1560 if(!type.sign)
1561 return a;
1562
1563 if(type.floating) {
1564 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1565 /* Workaround llvm.org/PR27332 */
1566 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1567 unsigned long long absMask = ~(1ULL << (type.width - 1));
1568 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1569 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1570 a = LLVMBuildAnd(builder, a, mask, "");
1571 a = LLVMBuildBitCast(builder, a, vec_type, "");
1572 return a;
1573 } else {
1574 char intrinsic[32];
1575 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1576 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1577 }
1578 }
1579
1580 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1581 switch(type.width) {
1582 case 8:
1583 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1584 case 16:
1585 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1586 case 32:
1587 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1588 }
1589 }
1590 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1591 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1592 (type.width == 8 || type.width == 16 || type.width == 32)) {
1593 debug_printf("%s: inefficient code, should split vectors manually\n",
1594 __FUNCTION__);
1595 }
1596
1597 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1598 }
1599
1600
1601 LLVMValueRef
1602 lp_build_negate(struct lp_build_context *bld,
1603 LLVMValueRef a)
1604 {
1605 LLVMBuilderRef builder = bld->gallivm->builder;
1606
1607 assert(lp_check_value(bld->type, a));
1608
1609 if (bld->type.floating)
1610 a = LLVMBuildFNeg(builder, a, "");
1611 else
1612 a = LLVMBuildNeg(builder, a, "");
1613
1614 return a;
1615 }
1616
1617
1618 /** Return -1, 0 or +1 depending on the sign of a */
1619 LLVMValueRef
1620 lp_build_sgn(struct lp_build_context *bld,
1621 LLVMValueRef a)
1622 {
1623 LLVMBuilderRef builder = bld->gallivm->builder;
1624 const struct lp_type type = bld->type;
1625 LLVMValueRef cond;
1626 LLVMValueRef res;
1627
1628 assert(lp_check_value(type, a));
1629
1630 /* Handle non-zero case */
1631 if(!type.sign) {
1632 /* if not zero then sign must be positive */
1633 res = bld->one;
1634 }
1635 else if(type.floating) {
1636 LLVMTypeRef vec_type;
1637 LLVMTypeRef int_type;
1638 LLVMValueRef mask;
1639 LLVMValueRef sign;
1640 LLVMValueRef one;
1641 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1642
1643 int_type = lp_build_int_vec_type(bld->gallivm, type);
1644 vec_type = lp_build_vec_type(bld->gallivm, type);
1645 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1646
1647      /* Take the sign bit and OR it with the constant +1.0 */
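      /* Editorial example (32-bit float): a = -3.5 has sign bit 0x80000000;
       * OR-ing it with the bits of +1.0 (0x3f800000) gives 0xbf800000 == -1.0. */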
1648 sign = LLVMBuildBitCast(builder, a, int_type, "");
1649 sign = LLVMBuildAnd(builder, sign, mask, "");
1650 one = LLVMConstBitCast(bld->one, int_type);
1651 res = LLVMBuildOr(builder, sign, one, "");
1652 res = LLVMBuildBitCast(builder, res, vec_type, "");
1653 }
1654 else
1655 {
1656 /* signed int/norm/fixed point */
1657 /* could use psign with sse3 and appropriate vectors here */
1658 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1659 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1660 res = lp_build_select(bld, cond, bld->one, minus_one);
1661 }
1662
1663 /* Handle zero */
1664 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1665 res = lp_build_select(bld, cond, bld->zero, res);
1666
1667 return res;
1668 }
1669
1670
1671 /**
1672 * Set the sign of float vector 'a' according to 'sign'.
1673 * If sign==0, return abs(a).
1674 * If sign==1, return -abs(a);
1675 * Other values for sign produce undefined results.
1676 */
1677 LLVMValueRef
1678 lp_build_set_sign(struct lp_build_context *bld,
1679 LLVMValueRef a, LLVMValueRef sign)
1680 {
1681 LLVMBuilderRef builder = bld->gallivm->builder;
1682 const struct lp_type type = bld->type;
1683 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1684 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1685 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1686 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1687 ~((unsigned long long) 1 << (type.width - 1)));
1688 LLVMValueRef val, res;
1689
1690 assert(type.floating);
1691 assert(lp_check_value(type, a));
1692
1693 /* val = reinterpret_cast<int>(a) */
1694 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1695 /* val = val & mask */
1696 val = LLVMBuildAnd(builder, val, mask, "");
1697 /* sign = sign << shift */
1698 sign = LLVMBuildShl(builder, sign, shift, "");
1699 /* res = val | sign */
1700 res = LLVMBuildOr(builder, val, sign, "");
1701 /* res = reinterpret_cast<float>(res) */
1702 res = LLVMBuildBitCast(builder, res, vec_type, "");
1703
1704 return res;
1705 }
1706
1707
1708 /**
1709 * Convert vector of (or scalar) int to vector of (or scalar) float.
1710 */
1711 LLVMValueRef
1712 lp_build_int_to_float(struct lp_build_context *bld,
1713 LLVMValueRef a)
1714 {
1715 LLVMBuilderRef builder = bld->gallivm->builder;
1716 const struct lp_type type = bld->type;
1717 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1718
1719 assert(type.floating);
1720
1721 return LLVMBuildSIToFP(builder, a, vec_type, "");
1722 }
1723
1724 static boolean
1725 arch_rounding_available(const struct lp_type type)
1726 {
1727 if ((util_cpu_caps.has_sse4_1 &&
1728 (type.length == 1 || type.width*type.length == 128)) ||
1729 (util_cpu_caps.has_avx && type.width*type.length == 256))
1730 return TRUE;
1731 else if ((util_cpu_caps.has_altivec &&
1732 (type.width == 32 && type.length == 4)))
1733 return TRUE;
1734
1735 return FALSE;
1736 }
1737
1738 enum lp_build_round_mode
1739 {
1740 LP_BUILD_ROUND_NEAREST = 0,
1741 LP_BUILD_ROUND_FLOOR = 1,
1742 LP_BUILD_ROUND_CEIL = 2,
1743 LP_BUILD_ROUND_TRUNCATE = 3
1744 };
1745
1746 static inline LLVMValueRef
1747 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1748 LLVMValueRef a)
1749 {
1750 LLVMBuilderRef builder = bld->gallivm->builder;
1751 const struct lp_type type = bld->type;
1752 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1753 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1754 const char *intrinsic;
1755 LLVMValueRef res;
1756
1757 assert(type.floating);
1758 /* using the double precision conversions is a bit more complicated */
1759 assert(type.width == 32);
1760
1761 assert(lp_check_value(type, a));
1762 assert(util_cpu_caps.has_sse2);
1763
1764 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1765 if (type.length == 1) {
1766 LLVMTypeRef vec_type;
1767 LLVMValueRef undef;
1768 LLVMValueRef arg;
1769 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1770
1771 vec_type = LLVMVectorType(bld->elem_type, 4);
1772
1773 intrinsic = "llvm.x86.sse.cvtss2si";
1774
1775 undef = LLVMGetUndef(vec_type);
1776
1777 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1778
1779 res = lp_build_intrinsic_unary(builder, intrinsic,
1780 ret_type, arg);
1781 }
1782 else {
1783 if (type.width* type.length == 128) {
1784 intrinsic = "llvm.x86.sse2.cvtps2dq";
1785 }
1786 else {
1787 assert(type.width*type.length == 256);
1788 assert(util_cpu_caps.has_avx);
1789
1790 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1791 }
1792 res = lp_build_intrinsic_unary(builder, intrinsic,
1793 ret_type, a);
1794 }
1795
1796 return res;
1797 }
1798
1799
1800 /*
1801 */
1802 static inline LLVMValueRef
1803 lp_build_round_altivec(struct lp_build_context *bld,
1804 LLVMValueRef a,
1805 enum lp_build_round_mode mode)
1806 {
1807 LLVMBuilderRef builder = bld->gallivm->builder;
1808 const struct lp_type type = bld->type;
1809 const char *intrinsic = NULL;
1810
1811 assert(type.floating);
1812
1813 assert(lp_check_value(type, a));
1814 assert(util_cpu_caps.has_altivec);
1815
1816 (void)type;
1817
1818 switch (mode) {
1819 case LP_BUILD_ROUND_NEAREST:
1820 intrinsic = "llvm.ppc.altivec.vrfin";
1821 break;
1822 case LP_BUILD_ROUND_FLOOR:
1823 intrinsic = "llvm.ppc.altivec.vrfim";
1824 break;
1825 case LP_BUILD_ROUND_CEIL:
1826 intrinsic = "llvm.ppc.altivec.vrfip";
1827 break;
1828 case LP_BUILD_ROUND_TRUNCATE:
1829 intrinsic = "llvm.ppc.altivec.vrfiz";
1830 break;
1831 }
1832
1833 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1834 }
1835
1836 static inline LLVMValueRef
1837 lp_build_round_arch(struct lp_build_context *bld,
1838 LLVMValueRef a,
1839 enum lp_build_round_mode mode)
1840 {
1841 if (util_cpu_caps.has_sse4_1) {
1842 LLVMBuilderRef builder = bld->gallivm->builder;
1843 const struct lp_type type = bld->type;
1844 const char *intrinsic_root;
1845 char intrinsic[32];
1846
1847 assert(type.floating);
1848 assert(lp_check_value(type, a));
1849 (void)type;
1850
1851 switch (mode) {
1852 case LP_BUILD_ROUND_NEAREST:
1853 intrinsic_root = "llvm.nearbyint";
1854 break;
1855 case LP_BUILD_ROUND_FLOOR:
1856 intrinsic_root = "llvm.floor";
1857 break;
1858 case LP_BUILD_ROUND_CEIL:
1859 intrinsic_root = "llvm.ceil";
1860 break;
1861 case LP_BUILD_ROUND_TRUNCATE:
1862 intrinsic_root = "llvm.trunc";
1863 break;
1864 }
1865
1866 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1867 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1868 }
1869 else /* (util_cpu_caps.has_altivec) */
1870 return lp_build_round_altivec(bld, a, mode);
1871 }
1872
1873 /**
1874 * Return the integer part of a float (vector) value (== round toward zero).
1875 * The returned value is a float (vector).
1876 * Ex: trunc(-1.5) = -1.0
1877 */
1878 LLVMValueRef
1879 lp_build_trunc(struct lp_build_context *bld,
1880 LLVMValueRef a)
1881 {
1882 LLVMBuilderRef builder = bld->gallivm->builder;
1883 const struct lp_type type = bld->type;
1884
1885 assert(type.floating);
1886 assert(lp_check_value(type, a));
1887
1888 if (arch_rounding_available(type)) {
1889 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1890 }
1891 else {
1892 const struct lp_type type = bld->type;
1893 struct lp_type inttype;
1894 struct lp_build_context intbld;
1895 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1896 LLVMValueRef trunc, res, anosign, mask;
1897 LLVMTypeRef int_vec_type = bld->int_vec_type;
1898 LLVMTypeRef vec_type = bld->vec_type;
1899
1900 assert(type.width == 32); /* might want to handle doubles at some point */
1901
1902 inttype = type;
1903 inttype.floating = 0;
1904 lp_build_context_init(&intbld, bld->gallivm, inttype);
1905
1906 /* round by truncation */
1907 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1908 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1909
1910 /* mask out sign bit */
1911 anosign = lp_build_abs(bld, a);
1912 /*
1913 * mask out all values if anosign > 2^24
1914 * This should work both for large ints (all rounding is no-op for them
1915 * because such floats are always exact) as well as special cases like
1916 * NaNs, Infs (taking advantage of the fact they use max exponent).
1917 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1918 */
1919 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1920 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1921 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1922 return lp_build_select(bld, mask, a, res);
1923 }
1924 }
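/*
 * Illustrative scalar sketch of the trunc fallback above (not part of the
 * build; assumes <stdint.h>/<string.h>). The > 2^24 guard is done on the
 * sign-masked bit pattern, so NaN/Inf (max exponent) also select 'a'.
 *
 *    float ref_trunc(float a)
 *    {
 *       int32_t abs_bits;
 *       memcpy(&abs_bits, &a, sizeof abs_bits);
 *       abs_bits &= 0x7fffffff;            /* |a| as raw bits */
 *       if (abs_bits > 0x4b800000)         /* |a| > 2^24, or NaN/Inf */
 *          return a;                       /* already integral / special */
 *       return (float)(int32_t)a;          /* round by truncation */
 *    }
 */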
1925
1926
1927 /**
1928 * Return float (vector) rounded to nearest integer (vector). The returned
1929 * value is a float (vector).
1930 * Ex: round(0.9) = 1.0
1931 * Ex: round(-1.5) = -2.0
1932 */
1933 LLVMValueRef
1934 lp_build_round(struct lp_build_context *bld,
1935 LLVMValueRef a)
1936 {
1937 LLVMBuilderRef builder = bld->gallivm->builder;
1938 const struct lp_type type = bld->type;
1939
1940 assert(type.floating);
1941 assert(lp_check_value(type, a));
1942
1943 if (arch_rounding_available(type)) {
1944 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1945 }
1946 else {
1947 const struct lp_type type = bld->type;
1948 struct lp_type inttype;
1949 struct lp_build_context intbld;
1950 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1951 LLVMValueRef res, anosign, mask;
1952 LLVMTypeRef int_vec_type = bld->int_vec_type;
1953 LLVMTypeRef vec_type = bld->vec_type;
1954
1955 assert(type.width == 32); /* might want to handle doubles at some point */
1956
1957 inttype = type;
1958 inttype.floating = 0;
1959 lp_build_context_init(&intbld, bld->gallivm, inttype);
1960
1961 res = lp_build_iround(bld, a);
1962 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1963
1964 /* mask out sign bit */
1965 anosign = lp_build_abs(bld, a);
1966 /*
1967 * mask out all values if anosign > 2^24
1968 * This should work both for large ints (all rounding is no-op for them
1969 * because such floats are always exact) as well as special cases like
1970 * NaNs, Infs (taking advantage of the fact they use max exponent).
1971 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
1972 */
1973 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1974 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1975 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1976 return lp_build_select(bld, mask, a, res);
1977 }
1978 }
1979
1980
1981 /**
1982 * Return floor of float (vector), result is a float (vector)
1983 * Ex: floor(1.1) = 1.0
1984 * Ex: floor(-1.1) = -2.0
1985 */
1986 LLVMValueRef
1987 lp_build_floor(struct lp_build_context *bld,
1988 LLVMValueRef a)
1989 {
1990 LLVMBuilderRef builder = bld->gallivm->builder;
1991 const struct lp_type type = bld->type;
1992
1993 assert(type.floating);
1994 assert(lp_check_value(type, a));
1995
1996 if (arch_rounding_available(type)) {
1997 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1998 }
1999 else {
2000 const struct lp_type type = bld->type;
2001 struct lp_type inttype;
2002 struct lp_build_context intbld;
2003 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2004 LLVMValueRef trunc, res, anosign, mask;
2005 LLVMTypeRef int_vec_type = bld->int_vec_type;
2006 LLVMTypeRef vec_type = bld->vec_type;
2007
2008 if (type.width != 32) {
2009 char intrinsic[32];
2010 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2011 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2012 }
2013
2014 assert(type.width == 32); /* might want to handle doubles at some point */
2015
2016 inttype = type;
2017 inttype.floating = 0;
2018 lp_build_context_init(&intbld, bld->gallivm, inttype);
2019
2020 /* round by truncation */
2021 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2022 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2023
2024 if (type.sign) {
2025 LLVMValueRef tmp;
2026
2027 /*
2028 * fix values if rounding is wrong (for non-special cases)
2029 * - this is the case if trunc > a
2030 */
2031 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2032 /* tmp = trunc > a ? 1.0 : 0.0 */
2033 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2034 tmp = lp_build_and(&intbld, mask, tmp);
2035 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2036 res = lp_build_sub(bld, res, tmp);
2037 }
2038
2039 /* mask out sign bit */
2040 anosign = lp_build_abs(bld, a);
2041 /*
2042 * mask out all values if anosign > 2^24
2043 * This should work both for large ints (all rounding is no-op for them
2044 * because such floats are always exact) as well as special cases like
2045 * NaNs, Infs (taking advantage of the fact they use max exponent).
2046 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2047 */
2048 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2049 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2050 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2051 return lp_build_select(bld, mask, a, res);
2052 }
2053 }
2054
2055
2056 /**
2057 * Return ceiling of float (vector), returning float (vector).
2058 * Ex: ceil( 1.1) = 2.0
2059 * Ex: ceil(-1.1) = -1.0
2060 */
2061 LLVMValueRef
2062 lp_build_ceil(struct lp_build_context *bld,
2063 LLVMValueRef a)
2064 {
2065 LLVMBuilderRef builder = bld->gallivm->builder;
2066 const struct lp_type type = bld->type;
2067
2068 assert(type.floating);
2069 assert(lp_check_value(type, a));
2070
2071 if (arch_rounding_available(type)) {
2072 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2073 }
2074 else {
2075 const struct lp_type type = bld->type;
2076 struct lp_type inttype;
2077 struct lp_build_context intbld;
2078 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2079 LLVMValueRef trunc, res, anosign, mask, tmp;
2080 LLVMTypeRef int_vec_type = bld->int_vec_type;
2081 LLVMTypeRef vec_type = bld->vec_type;
2082
2083 if (type.width != 32) {
2084 char intrinsic[32];
2085 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2086 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2087 }
2088
2089 assert(type.width == 32); /* might want to handle doubles at some point */
2090
2091 inttype = type;
2092 inttype.floating = 0;
2093 lp_build_context_init(&intbld, bld->gallivm, inttype);
2094
2095 /* round by truncation */
2096 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2097 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2098
2099 /*
2100 * fix values if rounding is wrong (for non-special cases)
2101 * - this is the case if trunc < a
2102 */
2103 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2104 /* tmp = trunc < a ? 1.0 : 0.0 */
2105 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2106 tmp = lp_build_and(&intbld, mask, tmp);
2107 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2108 res = lp_build_add(bld, trunc, tmp);
2109
2110 /* mask out sign bit */
2111 anosign = lp_build_abs(bld, a);
2112 /*
2113 * mask out all values if anosign > 2^24
2114 * This should work both for large ints (all rounding is no-op for them
2115 * because such floats are always exact) as well as special cases like
2116 * NaNs, Infs (taking advantage of the fact they use max exponent).
2117 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2118 */
2119 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2120 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2121 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2122 return lp_build_select(bld, mask, a, res);
2123 }
2124 }
2125
2126
2127 /**
2128 * Return fractional part of 'a' computed as a - floor(a)
2129 * Typically used in texture coord arithmetic.
2130 */
2131 LLVMValueRef
2132 lp_build_fract(struct lp_build_context *bld,
2133 LLVMValueRef a)
2134 {
2135 assert(bld->type.floating);
2136 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2137 }
2138
2139
2140 /**
2141 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2142 * against 0.99999(9). (Will also return that value for NaNs.)
2143 */
2144 static inline LLVMValueRef
2145 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2146 {
2147 LLVMValueRef max;
2148
2149 /* this is the largest number smaller than 1.0 representable as float */
2150 max = lp_build_const_vec(bld->gallivm, bld->type,
2151 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2152 return lp_build_min_ext(bld, fract, max,
2153 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2154 }
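/*
 * Worked example of the constant above (assuming 32-bit floats, where
 * lp_mantissa() is 23): max = 1.0 - 1/2^24 = 0.99999994..., bit pattern
 * 0x3f7fffff, i.e. the largest float strictly below 1.0, so the min()
 * can never round the fractional part back up to 1.0.
 */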
2155
2156
2157 /**
2158 * Same as lp_build_fract, but guarantees that the result is always smaller
2159 * than one. Will also return the smaller-than-one value for infs, NaNs.
2160 */
2161 LLVMValueRef
2162 lp_build_fract_safe(struct lp_build_context *bld,
2163 LLVMValueRef a)
2164 {
2165 return clamp_fract(bld, lp_build_fract(bld, a));
2166 }
2167
2168
2169 /**
2170 * Return the integer part of a float (vector) value (== round toward zero).
2171 * The returned value is an integer (vector).
2172 * Ex: itrunc(-1.5) = -1
2173 */
2174 LLVMValueRef
2175 lp_build_itrunc(struct lp_build_context *bld,
2176 LLVMValueRef a)
2177 {
2178 LLVMBuilderRef builder = bld->gallivm->builder;
2179 const struct lp_type type = bld->type;
2180 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2181
2182 assert(type.floating);
2183 assert(lp_check_value(type, a));
2184
2185 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2186 }
2187
2188
2189 /**
2190 * Return float (vector) rounded to nearest integer (vector). The returned
2191 * value is an integer (vector).
2192 * Ex: iround(0.9) = 1
2193 * Ex: iround(-1.5) = -2
2194 */
2195 LLVMValueRef
2196 lp_build_iround(struct lp_build_context *bld,
2197 LLVMValueRef a)
2198 {
2199 LLVMBuilderRef builder = bld->gallivm->builder;
2200 const struct lp_type type = bld->type;
2201 LLVMTypeRef int_vec_type = bld->int_vec_type;
2202 LLVMValueRef res;
2203
2204 assert(type.floating);
2205
2206 assert(lp_check_value(type, a));
2207
2208 if ((util_cpu_caps.has_sse2 &&
2209 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2210 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2211 return lp_build_iround_nearest_sse2(bld, a);
2212 }
2213 if (arch_rounding_available(type)) {
2214 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2215 }
2216 else {
2217 LLVMValueRef half;
2218
2219 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2220
2221 if (type.sign) {
2222 LLVMTypeRef vec_type = bld->vec_type;
2223 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2224 (unsigned long long)1 << (type.width - 1));
2225 LLVMValueRef sign;
2226
2227 /* get sign bit */
2228 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2229 sign = LLVMBuildAnd(builder, sign, mask, "");
2230
2231 /* sign * 0.5 */
2232 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2233 half = LLVMBuildOr(builder, sign, half, "");
2234 half = LLVMBuildBitCast(builder, half, vec_type, "");
2235 }
2236
2237 res = LLVMBuildFAdd(builder, a, half, "");
2238 }
2239
2240 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2241
2242 return res;
2243 }
2244
2245
2246 /**
2247 * Return floor of float (vector), result is an int (vector)
2248 * Ex: ifloor(1.1) = 1
2249 * Ex: ifloor(-1.1) = -2
2250 */
2251 LLVMValueRef
2252 lp_build_ifloor(struct lp_build_context *bld,
2253 LLVMValueRef a)
2254 {
2255 LLVMBuilderRef builder = bld->gallivm->builder;
2256 const struct lp_type type = bld->type;
2257 LLVMTypeRef int_vec_type = bld->int_vec_type;
2258 LLVMValueRef res;
2259
2260 assert(type.floating);
2261 assert(lp_check_value(type, a));
2262
2263 res = a;
2264 if (type.sign) {
2265 if (arch_rounding_available(type)) {
2266 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2267 }
2268 else {
2269 struct lp_type inttype;
2270 struct lp_build_context intbld;
2271 LLVMValueRef trunc, itrunc, mask;
2272
2273 assert(type.floating);
2274 assert(lp_check_value(type, a));
2275
2276 inttype = type;
2277 inttype.floating = 0;
2278 lp_build_context_init(&intbld, bld->gallivm, inttype);
2279
2280 /* round by truncation */
2281 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2282 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2283
2284 /*
2285 * fix values if rounding is wrong (for non-special cases)
2286 * - this is the case if trunc > a
2287 * The results of doing this with NaNs, very large values etc.
2288 * are undefined but this seems to be the case anyway.
2289 */
2290 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2291 /* cheapie minus one with mask since the mask is minus one / zero */
2292 return lp_build_add(&intbld, itrunc, mask);
2293 }
2294 }
2295
2296 /* convert to int; truncation is exact here (already rounded or non-negative) */
2297 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2298
2299 return res;
2300 }
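/*
 * Scalar sketch of the mask trick used in the non-arch-rounding path above
 * (illustrative only, not part of the build): the compare result is all
 * ones (-1 as an integer) exactly where trunc > a, so adding it to the
 * truncated value gives floor() without a branch.
 *
 *    int32_t ref_ifloor(float a)
 *    {
 *       int32_t itrunc = (int32_t)a;                  /* round toward zero */
 *       int32_t mask   = ((float)itrunc > a) ? -1 : 0;
 *       return itrunc + mask;                         /* subtract 1 where needed */
 *    }
 */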
2301
2302
2303 /**
2304 * Return ceiling of float (vector), returning int (vector).
2305 * Ex: iceil( 1.1) = 2
2306 * Ex: iceil(-1.1) = -1
2307 */
2308 LLVMValueRef
2309 lp_build_iceil(struct lp_build_context *bld,
2310 LLVMValueRef a)
2311 {
2312 LLVMBuilderRef builder = bld->gallivm->builder;
2313 const struct lp_type type = bld->type;
2314 LLVMTypeRef int_vec_type = bld->int_vec_type;
2315 LLVMValueRef res;
2316
2317 assert(type.floating);
2318 assert(lp_check_value(type, a));
2319
2320 if (arch_rounding_available(type)) {
2321 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2322 }
2323 else {
2324 struct lp_type inttype;
2325 struct lp_build_context intbld;
2326 LLVMValueRef trunc, itrunc, mask;
2327
2328 assert(type.floating);
2329 assert(lp_check_value(type, a));
2330
2331 inttype = type;
2332 inttype.floating = 0;
2333 lp_build_context_init(&intbld, bld->gallivm, inttype);
2334
2335 /* round by truncation */
2336 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2337 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2338
2339 /*
2340 * fix values if rounding is wrong (for non-special cases)
2341 * - this is the case if trunc < a
2342 * The results of doing this with NaNs, very large values etc.
2343 * are undefined but this seems to be the case anyway.
2344 */
2345 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2346 /* cheapie plus one with mask since the mask is minus one / zero */
2347 return lp_build_sub(&intbld, itrunc, mask);
2348 }
2349
2350 /* convert to int; res is already rounded so truncation is exact */
2351 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2352
2353 return res;
2354 }
2355
2356
2357 /**
2358 * Combined ifloor() & fract().
2359 *
2360 * Preferred to calling the functions separately, as it will ensure that the
2361 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2362 */
2363 void
2364 lp_build_ifloor_fract(struct lp_build_context *bld,
2365 LLVMValueRef a,
2366 LLVMValueRef *out_ipart,
2367 LLVMValueRef *out_fpart)
2368 {
2369 LLVMBuilderRef builder = bld->gallivm->builder;
2370 const struct lp_type type = bld->type;
2371 LLVMValueRef ipart;
2372
2373 assert(type.floating);
2374 assert(lp_check_value(type, a));
2375
2376 if (arch_rounding_available(type)) {
2377 /*
2378 * floor() is easier.
2379 */
2380
2381 ipart = lp_build_floor(bld, a);
2382 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2383 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2384 }
2385 else {
2386 /*
2387 * ifloor() is easier.
2388 */
2389
2390 *out_ipart = lp_build_ifloor(bld, a);
2391 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2392 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2393 }
2394 }
2395
2396
2397 /**
2398 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2399 * always smaller than one.
2400 */
2401 void
2402 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2403 LLVMValueRef a,
2404 LLVMValueRef *out_ipart,
2405 LLVMValueRef *out_fpart)
2406 {
2407 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2408 *out_fpart = clamp_fract(bld, *out_fpart);
2409 }
2410
2411
2412 LLVMValueRef
2413 lp_build_sqrt(struct lp_build_context *bld,
2414 LLVMValueRef a)
2415 {
2416 LLVMBuilderRef builder = bld->gallivm->builder;
2417 const struct lp_type type = bld->type;
2418 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2419 char intrinsic[32];
2420
2421 assert(lp_check_value(type, a));
2422
2423 assert(type.floating);
2424 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2425
2426 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2427 }
2428
2429
2430 /**
2431 * Do one Newton-Raphson step to improve the reciprocal's precision:
2432 *
2433 * x_{i+1} = x_i * (2 - a * x_i)
2434 *
2435 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2436 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2437 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2438 * halo. It would be necessary to clamp the argument to prevent this.
2439 *
2440 * See also:
2441 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2442 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2443 */
2444 static inline LLVMValueRef
2445 lp_build_rcp_refine(struct lp_build_context *bld,
2446 LLVMValueRef a,
2447 LLVMValueRef rcp_a)
2448 {
2449 LLVMBuilderRef builder = bld->gallivm->builder;
2450 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2451 LLVMValueRef res;
2452
2453 res = LLVMBuildFMul(builder, a, rcp_a, "");
2454 res = LLVMBuildFSub(builder, two, res, "");
2455 res = LLVMBuildFMul(builder, rcp_a, res, "");
2456
2457 return res;
2458 }
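/*
 * Worked example (illustrative numbers): for a = 3.0 and a coarse estimate
 * x0 = 0.34, one step gives x1 = 0.34 * (2 - 3.0*0.34) = 0.34 * 0.98 =
 * 0.3332, roughly doubling the number of correct bits toward 1/3.
 */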
2459
2460
2461 LLVMValueRef
2462 lp_build_rcp(struct lp_build_context *bld,
2463 LLVMValueRef a)
2464 {
2465 LLVMBuilderRef builder = bld->gallivm->builder;
2466 const struct lp_type type = bld->type;
2467
2468 assert(lp_check_value(type, a));
2469
2470 if(a == bld->zero)
2471 return bld->undef;
2472 if(a == bld->one)
2473 return bld->one;
2474 if(a == bld->undef)
2475 return bld->undef;
2476
2477 assert(type.floating);
2478
2479 if(LLVMIsConstant(a))
2480 return LLVMConstFDiv(bld->one, a);
2481
2482 /*
2483 * We don't use RCPPS because:
2484 * - it only has 10 bits of precision
2485 * - it doesn't even get the reciprocal of 1.0 exactly
2486 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2487 * - for recent processors the benefit over DIVPS is marginal and case
2488 * dependent
2489 *
2490 * We could still use it on certain processors if benchmarks show that the
2491 * RCPPS plus the necessary workarounds are still preferable to DIVPS; or for
2492 * particular uses that require fewer workarounds.
2493 */
2494
2495 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2496 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2497 const unsigned num_iterations = 0;
2498 LLVMValueRef res;
2499 unsigned i;
2500 const char *intrinsic = NULL;
2501
2502 if (type.length == 4) {
2503 intrinsic = "llvm.x86.sse.rcp.ps";
2504 }
2505 else {
2506 intrinsic = "llvm.x86.avx.rcp.ps.256";
2507 }
2508
2509 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2510
2511 for (i = 0; i < num_iterations; ++i) {
2512 res = lp_build_rcp_refine(bld, a, res);
2513 }
2514
2515 return res;
2516 }
2517
2518 return LLVMBuildFDiv(builder, bld->one, a, "");
2519 }
2520
2521
2522 /**
2523 * Do one Newton-Raphson step to improve rsqrt precision:
2524 *
2525 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2526 *
2527 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2528 */
2529 static inline LLVMValueRef
2530 lp_build_rsqrt_refine(struct lp_build_context *bld,
2531 LLVMValueRef a,
2532 LLVMValueRef rsqrt_a)
2533 {
2534 LLVMBuilderRef builder = bld->gallivm->builder;
2535 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2536 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2537 LLVMValueRef res;
2538
2539 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2540 res = LLVMBuildFMul(builder, a, res, "");
2541 res = LLVMBuildFSub(builder, three, res, "");
2542 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2543 res = LLVMBuildFMul(builder, half, res, "");
2544
2545 return res;
2546 }
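/*
 * Worked example (illustrative numbers): for a = 4.0 and a coarse estimate
 * x0 = 0.51, one step gives x1 = 0.5 * 0.51 * (3 - 4.0*0.51*0.51) =
 * 0.5 * 0.51 * 1.9596 = 0.4997, converging quadratically toward 0.5.
 */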
2547
2548
2549 /**
2550 * Generate 1/sqrt(a).
2551 * Result is undefined for values < 0, infinity for +0.
2552 */
2553 LLVMValueRef
2554 lp_build_rsqrt(struct lp_build_context *bld,
2555 LLVMValueRef a)
2556 {
2557 const struct lp_type type = bld->type;
2558
2559 assert(lp_check_value(type, a));
2560
2561 assert(type.floating);
2562
2563 /*
2564 * This should be faster but all denormals will end up as infinity.
2565 */
2566 if (0 && lp_build_fast_rsqrt_available(type)) {
2567 const unsigned num_iterations = 1;
2568 LLVMValueRef res;
2569 unsigned i;
2570
2571 /* rsqrt(1.0) != 1.0 here */
2572 res = lp_build_fast_rsqrt(bld, a);
2573
2574 if (num_iterations) {
2575 /*
2576 * Newton-Raphson will result in NaN instead of infinity for zero,
2577 * and NaN instead of zero for infinity.
2578 * Also, need to ensure rsqrt(1.0) == 1.0.
2579 * All numbers smaller than FLT_MIN will result in +infinity
2580 * (rsqrtps treats all denormals as zero).
2581 */
2582 LLVMValueRef cmp;
2583 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2584 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2585
2586 for (i = 0; i < num_iterations; ++i) {
2587 res = lp_build_rsqrt_refine(bld, a, res);
2588 }
2589 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2590 res = lp_build_select(bld, cmp, inf, res);
2591 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2592 res = lp_build_select(bld, cmp, bld->zero, res);
2593 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2594 res = lp_build_select(bld, cmp, bld->one, res);
2595 }
2596
2597 return res;
2598 }
2599
2600 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2601 }
2602
2603 /**
2604 * If there's a fast (inaccurate) rsqrt instruction available
2605 * (the caller may want to avoid calling rsqrt_fast if it's not available,
2606 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2607 * unavailable it would expand to sqrt/div/mul, so it's obviously
2608 * much better to just call sqrt, skipping both the div and the mul).
2609 */
2610 boolean
2611 lp_build_fast_rsqrt_available(struct lp_type type)
2612 {
2613 assert(type.floating);
2614
2615 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2616 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2617 return true;
2618 }
2619 return false;
2620 }
2621
2622
2623 /**
2624 * Generate 1/sqrt(a).
2625 * Result is undefined for values < 0, infinity for +0.
2626 * Precision is limited, only ~10 bits guaranteed
2627 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2628 */
2629 LLVMValueRef
2630 lp_build_fast_rsqrt(struct lp_build_context *bld,
2631 LLVMValueRef a)
2632 {
2633 LLVMBuilderRef builder = bld->gallivm->builder;
2634 const struct lp_type type = bld->type;
2635
2636 assert(lp_check_value(type, a));
2637
2638 if (lp_build_fast_rsqrt_available(type)) {
2639 const char *intrinsic = NULL;
2640
2641 if (type.length == 4) {
2642 intrinsic = "llvm.x86.sse.rsqrt.ps";
2643 }
2644 else {
2645 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2646 }
2647 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2648 }
2649 else {
2650 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2651 }
2652 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2653 }
2654
2655
2656 /**
2657 * Generate sin(a) or cos(a) using polynomial approximation.
2658 * TODO: it might be worth recognizing sin and cos with the same source
2659 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2660 * would be way cheaper than calculating (nearly) everything twice...
2661 * Not sure it's common enough to be worth bothering with, however; the scs
2662 * opcode could also benefit from calculating both, though.
2663 */
2664 static LLVMValueRef
2665 lp_build_sin_or_cos(struct lp_build_context *bld,
2666 LLVMValueRef a,
2667 boolean cos)
2668 {
2669 struct gallivm_state *gallivm = bld->gallivm;
2670 LLVMBuilderRef b = gallivm->builder;
2671 struct lp_type int_type = lp_int_type(bld->type);
2672
2673 /*
2674 * take the absolute value,
2675 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2676 */
2677
2678 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2679 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2680
2681 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2682 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2683
2684 /*
2685 * scale by 4/Pi
2686 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2687 */
2688
2689 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2690 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2691
2692 /*
2693 * store the integer part of y in mm0
2694 * emm2 = _mm_cvttps_epi32(y);
2695 */
2696
2697 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2698
2699 /*
2700 * j=(j+1) & (~1) (see the cephes sources)
2701 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2702 */
2703
2704 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2705 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2706 /*
2707 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2708 */
2709 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2710 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2711
2712 /*
2713 * y = _mm_cvtepi32_ps(emm2);
2714 */
2715 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2716
2717 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2718 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2719 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2720 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2721
2722 /*
2723 * Argument used for poly selection and sign bit determination
2724 * is different for sin vs. cos.
2725 */
2726 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2727 emm2_and;
2728
2729 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2730 LLVMBuildNot(b, emm2_2, ""), ""),
2731 const_29, "sign_bit") :
2732 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2733 LLVMBuildShl(b, emm2_add,
2734 const_29, ""), ""),
2735 sign_mask, "sign_bit");
2736
2737 /*
2738 * get the polynomial selection mask
2739 * there is one polynomial for 0 <= x <= Pi/4
2740 * and another one for Pi/4 < x <= Pi/2
2741 * Both branches will be computed.
2742 *
2743 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2744 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2745 */
2746
2747 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2748 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2749 int_type, PIPE_FUNC_EQUAL,
2750 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2751
2752 /*
2753 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2754 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2755 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2756 */
2757 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2758 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2759 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2760
2761 /*
2762 * The magic pass: "Extended precision modular arithmetic"
2763 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2764 */
2765 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2766 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2767 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2768
2769 /*
2770 * Evaluate the first polynomial (0 <= x <= Pi/4)
2771 *
2772 * z = _mm_mul_ps(x,x);
2773 */
2774 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2775
2776 /*
2777 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2778 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2779 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2780 */
2781 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2782 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2783 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2784
2785 /*
2786 * y = *(v4sf*)_ps_coscof_p0;
2787 * y = _mm_mul_ps(y, z);
2788 */
2789 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2790 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2791 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2792 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2793
2794
2795 /*
2796 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2797 * y = _mm_sub_ps(y, tmp);
2798 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2799 */
2800 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2801 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2802 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2803 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2804 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2805
2806 /*
2807 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2808 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2809 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2810 */
2811 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2812 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2813 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2814
2815 /*
2816 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2817 *
2818 * y2 = *(v4sf*)_ps_sincof_p0;
2819 * y2 = _mm_mul_ps(y2, z);
2820 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2821 * y2 = _mm_mul_ps(y2, z);
2822 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2823 * y2 = _mm_mul_ps(y2, z);
2824 * y2 = _mm_mul_ps(y2, x);
2825 * y2 = _mm_add_ps(y2, x);
2826 */
2827
2828 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2829 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2830 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2831 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2832
2833 /*
2834 * select the correct result from the two polynomials
2835 * xmm3 = poly_mask;
2836 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2837 * y = _mm_andnot_ps(xmm3, y);
2838 * y = _mm_or_ps(y,y2);
2839 */
2840 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2841 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2842 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2843 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2844 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2845 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2846
2847 /*
2848 * update the sign
2849 * y = _mm_xor_ps(y, sign_bit);
2850 */
2851 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2852 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2853
2854 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2855
2856 /* clamp output to be within [-1, 1] */
2857 y_result = lp_build_clamp(bld, y_result,
2858 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2859 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2860 /* If a is -inf, inf or NaN then return NaN */
2861 y_result = lp_build_select(bld, isfinite, y_result,
2862 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2863 return y_result;
2864 }
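/*
 * Worked example of the range reduction above for sin(3.0) (illustrative):
 * y = 3.0 * 4/Pi = 3.82, so j = (3 + 1) & ~1 = 4 quadrants; the reduced
 * argument is x = 3.0 - 4*(Pi/4) = -0.1416. Quadrant bit 2 is clear, so the
 * sine polynomial is selected, giving ~ -0.1411, and the sign bit derived
 * from j flips it to +0.1411 = sin(3.0).
 */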
2865
2866
2867 /**
2868 * Generate sin(a)
2869 */
2870 LLVMValueRef
2871 lp_build_sin(struct lp_build_context *bld,
2872 LLVMValueRef a)
2873 {
2874 return lp_build_sin_or_cos(bld, a, FALSE);
2875 }
2876
2877
2878 /**
2879 * Generate cos(a)
2880 */
2881 LLVMValueRef
2882 lp_build_cos(struct lp_build_context *bld,
2883 LLVMValueRef a)
2884 {
2885 return lp_build_sin_or_cos(bld, a, TRUE);
2886 }
2887
2888
2889 /**
2890 * Generate pow(x, y)
2891 */
2892 LLVMValueRef
2893 lp_build_pow(struct lp_build_context *bld,
2894 LLVMValueRef x,
2895 LLVMValueRef y)
2896 {
2897 /* TODO: optimize the constant case */
2898 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2899 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2900 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2901 __FUNCTION__);
2902 }
2903
2904 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2905 }
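/*
 * Note: since this goes through log2, the result is only well defined for
 * x > 0 (lp_build_log2 leaves 0, negative and non-finite inputs undefined).
 * E.g. pow(2.0, 10.0) becomes exp2(10.0 * log2(2.0)) = exp2(10.0) = 1024.0.
 */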
2906
2907
2908 /**
2909 * Generate exp(x)
2910 */
2911 LLVMValueRef
2912 lp_build_exp(struct lp_build_context *bld,
2913 LLVMValueRef x)
2914 {
2915 /* log2(e) = 1/log(2) */
2916 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2917 1.4426950408889634);
2918
2919 assert(lp_check_value(bld->type, x));
2920
2921 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2922 }
2923
2924
2925 /**
2926 * Generate log(x)
2927 * Behavior is undefined with infs, 0s and nans
2928 */
2929 LLVMValueRef
2930 lp_build_log(struct lp_build_context *bld,
2931 LLVMValueRef x)
2932 {
2933 /* log(2) */
2934 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2935 0.69314718055994529);
2936
2937 assert(lp_check_value(bld->type, x));
2938
2939 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2940 }
2941
2942 /**
2943 * Generate log(x) that handles edge cases (infs, 0s and nans)
2944 */
2945 LLVMValueRef
2946 lp_build_log_safe(struct lp_build_context *bld,
2947 LLVMValueRef x)
2948 {
2949 /* log(2) */
2950 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2951 0.69314718055994529);
2952
2953 assert(lp_check_value(bld->type, x));
2954
2955 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2956 }
2957
2958
2959 /**
2960 * Generate polynomial.
2961 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2962 */
2963 LLVMValueRef
2964 lp_build_polynomial(struct lp_build_context *bld,
2965 LLVMValueRef x,
2966 const double *coeffs,
2967 unsigned num_coeffs)
2968 {
2969 const struct lp_type type = bld->type;
2970 LLVMValueRef even = NULL, odd = NULL;
2971 LLVMValueRef x2;
2972 unsigned i;
2973
2974 assert(lp_check_value(bld->type, x));
2975
2976 /* TODO: optimize the constant case */
2977 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2978 LLVMIsConstant(x)) {
2979 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2980 __FUNCTION__);
2981 }
2982
2983 /*
2984 * Calculate odd and even terms separately to decrease data dependency
2985 * Ex:
2986 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2987 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2988 */
2989 x2 = lp_build_mul(bld, x, x);
2990
2991 for (i = num_coeffs; i--; ) {
2992 LLVMValueRef coeff;
2993
2994 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2995
2996 if (i % 2 == 0) {
2997 if (even)
2998 even = lp_build_mad(bld, x2, even, coeff);
2999 else
3000 even = coeff;
3001 } else {
3002 if (odd)
3003 odd = lp_build_mad(bld, x2, odd, coeff);
3004 else
3005 odd = coeff;
3006 }
3007 }
3008
3009 if (odd)
3010 return lp_build_mad(bld, odd, x, even);
3011 else if (even)
3012 return even;
3013 else
3014 return bld->undef;
3015 }
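/*
 * Scalar sketch of the even/odd split above (illustrative only): both halves
 * are Horner evaluations in x^2 with independent dependency chains,
 * recombined at the end.
 *
 *    double ref_polynomial(double x, const double *c, unsigned n)
 *    {
 *       double x2 = x * x, even = 0.0, odd = 0.0;
 *       for (int i = (int)n - 1; i >= 0; i--) {
 *          if (i % 2 == 0)
 *             even = even * x2 + c[i];
 *          else
 *             odd = odd * x2 + c[i];
 *       }
 *       return odd * x + even;
 *    }
 */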
3016
3017
3018 /**
3019 * Minimax polynomial fit of 2**x, in range [0, 1[
3020 */
3021 const double lp_build_exp2_polynomial[] = {
3022 #if EXP_POLY_DEGREE == 5
3023 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3024 0.693153073200168932794,
3025 0.240153617044375388211,
3026 0.0558263180532956664775,
3027 0.00898934009049466391101,
3028 0.00187757667519147912699
3029 #elif EXP_POLY_DEGREE == 4
3030 1.00000259337069434683,
3031 0.693003834469974940458,
3032 0.24144275689150793076,
3033 0.0520114606103070150235,
3034 0.0135341679161270268764
3035 #elif EXP_POLY_DEGREE == 3
3036 0.999925218562710312959,
3037 0.695833540494823811697,
3038 0.226067155427249155588,
3039 0.0780245226406372992967
3040 #elif EXP_POLY_DEGREE == 2
3041 1.00172476321474503578,
3042 0.657636275736077639316,
3043 0.33718943461968720704
3044 #else
3045 #error
3046 #endif
3047 };
3048
3049
3050 LLVMValueRef
3051 lp_build_exp2(struct lp_build_context *bld,
3052 LLVMValueRef x)
3053 {
3054 LLVMBuilderRef builder = bld->gallivm->builder;
3055 const struct lp_type type = bld->type;
3056 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3057 LLVMValueRef ipart = NULL;
3058 LLVMValueRef fpart = NULL;
3059 LLVMValueRef expipart = NULL;
3060 LLVMValueRef expfpart = NULL;
3061 LLVMValueRef res = NULL;
3062
3063 assert(lp_check_value(bld->type, x));
3064
3065 /* TODO: optimize the constant case */
3066 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3067 LLVMIsConstant(x)) {
3068 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3069 __FUNCTION__);
3070 }
3071
3072 assert(type.floating && type.width == 32);
3073
3074 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3075 * the result is INF and if it's smaller than -126.9 the result is 0 */
3076 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3077 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3078 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3079 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3080
3081 /* ipart = floor(x) */
3082 /* fpart = x - ipart */
3083 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3084
3085 /* expipart = (float) (1 << ipart) */
3086 expipart = LLVMBuildAdd(builder, ipart,
3087 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3088 expipart = LLVMBuildShl(builder, expipart,
3089 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3090 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3091
3092 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3093 ARRAY_SIZE(lp_build_exp2_polynomial));
3094
3095 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3096
3097 return res;
3098 }
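/*
 * Scalar sketch of the split above (illustrative only; ref_ifloor is the
 * sketch near lp_build_ifloor, poly() stands for the minimax polynomial
 * lp_build_exp2_polynomial, and <string.h> is assumed):
 * 2^x = 2^ipart * 2^fpart, where 2^ipart is built directly in the float
 * exponent field and fpart is in [0, 1).
 *
 *    float ref_exp2(float x)
 *    {
 *       int   ipart = ref_ifloor(x);
 *       float fpart = x - (float)ipart;
 *       int32_t bits = (ipart + 127) << 23;    /* (float)(1 << ipart) */
 *       float expipart;
 *       memcpy(&expipart, &bits, sizeof expipart);
 *       return expipart * poly(fpart);         /* poly(fpart) ~= 2^fpart */
 *    }
 */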
3099
3100
3101
3102 /**
3103 * Extract the exponent of an IEEE-754 floating point value.
3104 *
3105 * Optionally apply an integer bias.
3106 *
3107 * Result is an integer value with
3108 *
3109 * ifloor(log2(x)) + bias
3110 */
3111 LLVMValueRef
3112 lp_build_extract_exponent(struct lp_build_context *bld,
3113 LLVMValueRef x,
3114 int bias)
3115 {
3116 LLVMBuilderRef builder = bld->gallivm->builder;
3117 const struct lp_type type = bld->type;
3118 unsigned mantissa = lp_mantissa(type);
3119 LLVMValueRef res;
3120
3121 assert(type.floating);
3122
3123 assert(lp_check_value(bld->type, x));
3124
3125 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3126
3127 res = LLVMBuildLShr(builder, x,
3128 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3129 res = LLVMBuildAnd(builder, res,
3130 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3131 res = LLVMBuildSub(builder, res,
3132 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3133
3134 return res;
3135 }
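/*
 * Worked example (illustrative): for x = 6.0 (bits 0x40c00000) the biased
 * exponent field is 129, so with bias = 0 this returns 129 - 127 = 2,
 * i.e. ifloor(log2(6.0)) = floor(2.585) = 2.
 */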
3136
3137
3138 /**
3139 * Extract the mantissa of the a floating.
3140 *
3141 * Result is a floating point value with
3142 *
3143 * x / floor(log2(x))
3144 */
3145 LLVMValueRef
3146 lp_build_extract_mantissa(struct lp_build_context *bld,
3147 LLVMValueRef x)
3148 {
3149 LLVMBuilderRef builder = bld->gallivm->builder;
3150 const struct lp_type type = bld->type;
3151 unsigned mantissa = lp_mantissa(type);
3152 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3153 (1ULL << mantissa) - 1);
3154 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3155 LLVMValueRef res;
3156
3157 assert(lp_check_value(bld->type, x));
3158
3159 assert(type.floating);
3160
3161 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3162
3163 /* res = x / 2**ipart */
3164 res = LLVMBuildAnd(builder, x, mantmask, "");
3165 res = LLVMBuildOr(builder, res, one, "");
3166 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3167
3168 return res;
3169 }
3170
3171
3172
3173 /**
3174 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3175 * These coefficients can be generated with
3176 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3177 */
3178 const double lp_build_log2_polynomial[] = {
3179 #if LOG_POLY_DEGREE == 5
3180 2.88539008148777786488L,
3181 0.961796878841293367824L,
3182 0.577058946784739859012L,
3183 0.412914355135828735411L,
3184 0.308591899232910175289L,
3185 0.352376952300281371868L,
3186 #elif LOG_POLY_DEGREE == 4
3187 2.88539009343309178325L,
3188 0.961791550404184197881L,
3189 0.577440339438736392009L,
3190 0.403343858251329912514L,
3191 0.406718052498846252698L,
3192 #elif LOG_POLY_DEGREE == 3
3193 2.88538959748872753838L,
3194 0.961932915889597772928L,
3195 0.571118517972136195241L,
3196 0.493997535084709500285L,
3197 #else
3198 #error
3199 #endif
3200 };
3201
3202 /**
3203 * See http://www.devmaster.net/forums/showthread.php?p=43580
3204 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3205 * http://www.nezumi.demon.co.uk/consult/logx.htm
3206 *
3207 * If handle_edge_cases is true the function will perform computations
3208 * to match the required D3D10+ behavior for each of the edge cases.
3209 * That means that if input is:
3210 * - less than zero (to and including -inf) then NaN will be returned
3211 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3212 * - +infinity, then +infinity will be returned
3213 * - NaN, then NaN will be returned
3214 *
3215 * Those checks are fairly expensive so if you don't need them make sure
3216 * handle_edge_cases is false.
3217 */
3218 void
3219 lp_build_log2_approx(struct lp_build_context *bld,
3220 LLVMValueRef x,
3221 LLVMValueRef *p_exp,
3222 LLVMValueRef *p_floor_log2,
3223 LLVMValueRef *p_log2,
3224 boolean handle_edge_cases)
3225 {
3226 LLVMBuilderRef builder = bld->gallivm->builder;
3227 const struct lp_type type = bld->type;
3228 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3229 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3230
3231 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3232 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3233 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3234
3235 LLVMValueRef i = NULL;
3236 LLVMValueRef y = NULL;
3237 LLVMValueRef z = NULL;
3238 LLVMValueRef exp = NULL;
3239 LLVMValueRef mant = NULL;
3240 LLVMValueRef logexp = NULL;
3241 LLVMValueRef p_z = NULL;
3242 LLVMValueRef res = NULL;
3243
3244 assert(lp_check_value(bld->type, x));
3245
3246 if(p_exp || p_floor_log2 || p_log2) {
3247 /* TODO: optimize the constant case */
3248 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3249 LLVMIsConstant(x)) {
3250 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3251 __FUNCTION__);
3252 }
3253
3254 assert(type.floating && type.width == 32);
3255
3256 /*
3257 * We don't explicitly handle denormalized numbers. They will yield a
3258 * result in the neighbourhood of -127, which appears to be adequate.
3260 */
3261
3262 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3263
3264 /* exp = (float) exponent(x) */
3265 exp = LLVMBuildAnd(builder, i, expmask, "");
3266 }
3267
3268 if(p_floor_log2 || p_log2) {
3269 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3270 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3271 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3272 }
3273
3274 if (p_log2) {
3275 /* mant = 1 + (float) mantissa(x) */
3276 mant = LLVMBuildAnd(builder, i, mantmask, "");
3277 mant = LLVMBuildOr(builder, mant, one, "");
3278 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3279
3280 /* y = (mant - 1) / (mant + 1) */
3281 y = lp_build_div(bld,
3282 lp_build_sub(bld, mant, bld->one),
3283 lp_build_add(bld, mant, bld->one)
3284 );
3285
3286 /* z = y^2 */
3287 z = lp_build_mul(bld, y, y);
3288
3289 /* compute P(z) */
3290 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3291 ARRAY_SIZE(lp_build_log2_polynomial));
3292
3293 /* y * P(z) + logexp */
3294 res = lp_build_mad(bld, y, p_z, logexp);
3295
3296 if (type.floating && handle_edge_cases) {
3297 LLVMValueRef negmask, infmask, zmask;
3298 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3299 lp_build_const_vec(bld->gallivm, type, 0.0f));
3300 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3301 lp_build_const_vec(bld->gallivm, type, 0.0f));
3302 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3303 lp_build_const_vec(bld->gallivm, type, INFINITY));
3304
3305 /* If x is equal to inf make sure we return inf */
3306 res = lp_build_select(bld, infmask,
3307 lp_build_const_vec(bld->gallivm, type, INFINITY),
3308 res);
3309 /* If x is equal to 0, return -inf */
3310 res = lp_build_select(bld, zmask,
3311 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3312 res);
3313 /* If x is nan or less than 0, return nan */
3314 res = lp_build_select(bld, negmask,
3315 lp_build_const_vec(bld->gallivm, type, NAN),
3316 res);
3317 }
3318 }
3319
3320 if (p_exp) {
3321 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3322 *p_exp = exp;
3323 }
3324
3325 if (p_floor_log2)
3326 *p_floor_log2 = logexp;
3327
3328 if (p_log2)
3329 *p_log2 = res;
3330 }
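/*
 * Scalar sketch of the decomposition above (illustrative only; ref_polynomial
 * is the sketch next to lp_build_polynomial, <string.h> assumed): with
 * x = 2^e * m and m in [1, 2), log2(x) = e + y * P(y^2), y = (m - 1)/(m + 1).
 *
 *    float ref_log2(float x)
 *    {
 *       int32_t i, mbits;
 *       float m, y;
 *       memcpy(&i, &x, sizeof i);
 *       float e = (float)(((i >> 23) & 0xff) - 127);     /* floor(log2(x)) */
 *       mbits = (i & 0x007fffff) | 0x3f800000;           /* mantissa as 1.m */
 *       memcpy(&m, &mbits, sizeof m);
 *       y = (m - 1.0f) / (m + 1.0f);
 *       return y * ref_polynomial(y * y, lp_build_log2_polynomial,
 *                                 ARRAY_SIZE(lp_build_log2_polynomial)) + e;
 *    }
 */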
3331
3332
3333 /*
3334 * log2 implementation which doesn't have special code to
3335 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3336 * the results for those cases are undefined.
3337 */
3338 LLVMValueRef
3339 lp_build_log2(struct lp_build_context *bld,
3340 LLVMValueRef x)
3341 {
3342 LLVMValueRef res;
3343 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3344 return res;
3345 }
3346
3347 /*
3348 * Version of log2 which handles all edge cases.
3349 * Look at documentation of lp_build_log2_approx for
3350 * description of the behavior for each of the edge cases.
3351 */
3352 LLVMValueRef
3353 lp_build_log2_safe(struct lp_build_context *bld,
3354 LLVMValueRef x)
3355 {
3356 LLVMValueRef res;
3357 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3358 return res;
3359 }
3360
3361
3362 /**
3363 * Faster (and less accurate) log2.
3364 *
3365 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3366 *
3367 * Piece-wise linear approximation, with exact results when x is a
3368 * power of two.
3369 *
3370 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3371 */
3372 LLVMValueRef
3373 lp_build_fast_log2(struct lp_build_context *bld,
3374 LLVMValueRef x)
3375 {
3376 LLVMBuilderRef builder = bld->gallivm->builder;
3377 LLVMValueRef ipart;
3378 LLVMValueRef fpart;
3379
3380 assert(lp_check_value(bld->type, x));
3381
3382 assert(bld->type.floating);
3383
3384 /* ipart = floor(log2(x)) - 1 */
3385 ipart = lp_build_extract_exponent(bld, x, -1);
3386 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3387
3388 /* fpart = x / 2**ipart */
3389 fpart = lp_build_extract_mantissa(bld, x);
3390
3391 /* ipart + fpart */
3392 return LLVMBuildFAdd(builder, ipart, fpart, "");
3393 }
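/*
 * Worked example (illustrative): for x = 6.0, ipart = floor(log2(6)) - 1 = 1
 * and fpart = 6/4 = 1.5, giving 2.5 vs. the exact 2.585; for a power of two
 * such as x = 8.0 it is exact: ipart = 2, fpart = 1.0, result 3.0.
 */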
3394
3395
3396 /**
3397 * Fast implementation of iround(log2(x)).
3398 *
3399 * Not an approximation -- it should give accurate results all the time.
3400 */
3401 LLVMValueRef
3402 lp_build_ilog2(struct lp_build_context *bld,
3403 LLVMValueRef x)
3404 {
3405 LLVMBuilderRef builder = bld->gallivm->builder;
3406 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3407 LLVMValueRef ipart;
3408
3409 assert(bld->type.floating);
3410
3411 assert(lp_check_value(bld->type, x));
3412
3413 /* multiply x by 2^0.5, i.e., add 0.5 to log2(x) */
3414 x = LLVMBuildFMul(builder, x, sqrt2, "");
3415
3416 /* ipart = floor(log2(x) + 0.5) */
3417 ipart = lp_build_extract_exponent(bld, x, 0);
3418
3419 return ipart;
3420 }
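/*
 * Worked example (illustrative): scaling by sqrt(2) turns the floor of the
 * exponent into round-to-nearest: for x = 6.0, log2(6*sqrt(2)) = 3.085, so
 * the extracted exponent is 3 = iround(2.585); for x = 5.0 it is
 * floor(2.822) = 2 = iround(2.322).
 */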
3421
3422 LLVMValueRef
3423 lp_build_mod(struct lp_build_context *bld,
3424 LLVMValueRef x,
3425 LLVMValueRef y)
3426 {
3427 LLVMBuilderRef builder = bld->gallivm->builder;
3428 LLVMValueRef res;
3429 const struct lp_type type = bld->type;
3430
3431 assert(lp_check_value(type, x));
3432 assert(lp_check_value(type, y));
3433
3434 if (type.floating)
3435 res = LLVMBuildFRem(builder, x, y, "");
3436 else if (type.sign)
3437 res = LLVMBuildSRem(builder, x, y, "");
3438 else
3439 res = LLVMBuildURem(builder, x, y, "");
3440 return res;
3441 }
3442
3443
3444 /*
3445 * For floating inputs it creates and returns a mask
3446 * which is all 1's for channels which are NaN.
3447 * Channels inside x which are not NaN will be 0.
3448 */
3449 LLVMValueRef
3450 lp_build_isnan(struct lp_build_context *bld,
3451 LLVMValueRef x)
3452 {
3453 LLVMValueRef mask;
3454 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3455
3456 assert(bld->type.floating);
3457 assert(lp_check_value(bld->type, x));
3458
3459 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3460 "isnotnan");
3461 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3462 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3463 return mask;
3464 }
3465
3466 /* Returns all 1's for floating point numbers that are
3467 * finite, and returns all zeros for -inf,
3468 * inf and NaNs. */
3469 LLVMValueRef
3470 lp_build_isfinite(struct lp_build_context *bld,
3471 LLVMValueRef x)
3472 {
3473 LLVMBuilderRef builder = bld->gallivm->builder;
3474 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3475 struct lp_type int_type = lp_int_type(bld->type);
3476 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3477 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3478 0x7f800000);
3479
3480 if (!bld->type.floating) {
3481 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3482 }
3483 assert(bld->type.floating);
3484 assert(lp_check_value(bld->type, x));
3485 assert(bld->type.width == 32);
3486
3487 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3488 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3489 intx, infornan32);
3490 }
3491
3492 /*
3493 * Returns true if the number is nan or inf and false otherwise.
3494 * The input has to be a floating point vector.
3495 */
3496 LLVMValueRef
3497 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3498 const struct lp_type type,
3499 LLVMValueRef x)
3500 {
3501 LLVMBuilderRef builder = gallivm->builder;
3502 struct lp_type int_type = lp_int_type(type);
3503 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3504 0x7f800000);
3505 LLVMValueRef ret;
3506
3507 assert(type.floating);
3508
3509 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3510 ret = LLVMBuildAnd(builder, ret, const0, "");
3511 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3512 ret, const0);
3513
3514 return ret;
3515 }
3516
3517
3518 LLVMValueRef
3519 lp_build_fpstate_get(struct gallivm_state *gallivm)
3520 {
3521 if (util_cpu_caps.has_sse) {
3522 LLVMBuilderRef builder = gallivm->builder;
3523 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3524 gallivm,
3525 LLVMInt32TypeInContext(gallivm->context),
3526 "mxcsr_ptr");
3527 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3528 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3529 lp_build_intrinsic(builder,
3530 "llvm.x86.sse.stmxcsr",
3531 LLVMVoidTypeInContext(gallivm->context),
3532 &mxcsr_ptr8, 1, 0);
3533 return mxcsr_ptr;
3534 }
3535 return 0;
3536 }
3537
3538 void
3539 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3540 boolean zero)
3541 {
3542 if (util_cpu_caps.has_sse) {
3543 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3544 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3545
3546 LLVMBuilderRef builder = gallivm->builder;
3547 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3548 LLVMValueRef mxcsr =
3549 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3550
3551 if (util_cpu_caps.has_daz) {
3552 /* Enable denormals are zero mode */
3553 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3554 }
3555 if (zero) {
3556 mxcsr = LLVMBuildOr(builder, mxcsr,
3557 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3558 } else {
3559 mxcsr = LLVMBuildAnd(builder, mxcsr,
3560 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3561 }
3562
3563 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3564 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3565 }
3566 }
3567
3568 void
3569 lp_build_fpstate_set(struct gallivm_state *gallivm,
3570 LLVMValueRef mxcsr_ptr)
3571 {
3572 if (util_cpu_caps.has_sse) {
3573 LLVMBuilderRef builder = gallivm->builder;
3574 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3575 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3576 lp_build_intrinsic(builder,
3577 "llvm.x86.sse.ldmxcsr",
3578 LLVMVoidTypeInContext(gallivm->context),
3579 &mxcsr_ptr, 1, 0);
3580 }
3581 }
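/*
 * Typical usage sketch of the MXCSR helpers above (illustrative): save the
 * current state, force denormals to zero while emitting FP-heavy code, then
 * restore it.
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code ...
 *    lp_build_fpstate_set(gallivm, saved);
 */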