gallivm: Fix saturated signed psub/padd intrinsics on llvm 8
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87 * No checks are done for the special-case values of a or b being 0 or 1.
88 * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_cpu_caps.has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_cpu_caps.has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_cpu_caps.has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_cpu_caps.has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
139 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 __FUNCTION__);
142 }
143 if (type.width == 32 && type.length == 4) {
144 intrinsic = "llvm.ppc.altivec.vminfp";
145 intr_size = 128;
146 }
147 } else if (util_cpu_caps.has_altivec) {
148 intr_size = 128;
149 if (type.width == 8) {
150 if (!type.sign) {
151 intrinsic = "llvm.ppc.altivec.vminub";
152 } else {
153 intrinsic = "llvm.ppc.altivec.vminsb";
154 }
155 } else if (type.width == 16) {
156 if (!type.sign) {
157 intrinsic = "llvm.ppc.altivec.vminuh";
158 } else {
159 intrinsic = "llvm.ppc.altivec.vminsh";
160 }
161 } else if (type.width == 32) {
162 if (!type.sign) {
163 intrinsic = "llvm.ppc.altivec.vminuw";
164 } else {
165 intrinsic = "llvm.ppc.altivec.vminsw";
166 }
167 }
168 }
169
170 if (intrinsic) {
171 /* We need to handle NaNs for floating point numbers. If one of the
172 * inputs is NaN the other should be returned (required by both D3D10+
173 * and OpenCL).
174 * The SSE intrinsics return the second operand when either input is NaN,
175 * so we need special code to handle those cases.
176 */
177 if (util_cpu_caps.has_sse && type.floating &&
178 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
179 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
180 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
181 LLVMValueRef isnan, min;
182 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
183 type,
184 intr_size, a, b);
185 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
186 isnan = lp_build_isnan(bld, b);
187 return lp_build_select(bld, isnan, a, min);
188 } else {
189 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
190 isnan = lp_build_isnan(bld, a);
191 return lp_build_select(bld, isnan, a, min);
192 }
193 } else {
194 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
195 type,
196 intr_size, a, b);
197 }
198 }
199
200 if (type.floating) {
201 switch (nan_behavior) {
202 case GALLIVM_NAN_RETURN_NAN: {
203 LLVMValueRef isnan = lp_build_isnan(bld, b);
204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
205 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
206 return lp_build_select(bld, cond, a, b);
207 }
208 break;
209 case GALLIVM_NAN_RETURN_OTHER: {
210 LLVMValueRef isnan = lp_build_isnan(bld, a);
211 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
212 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
213 return lp_build_select(bld, cond, a, b);
214 }
215 break;
216 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
217 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
218 return lp_build_select(bld, cond, a, b);
219 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
220 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
221 return lp_build_select(bld, cond, b, a);
222 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
223 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
224 return lp_build_select(bld, cond, a, b);
225 break;
226 default:
227 assert(0);
228 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
229 return lp_build_select(bld, cond, a, b);
230 }
231 } else {
232 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
233 return lp_build_select(bld, cond, a, b);
234 }
235 }
236
237
238 LLVMValueRef
239 lp_build_fmuladd(LLVMBuilderRef builder,
240 LLVMValueRef a,
241 LLVMValueRef b,
242 LLVMValueRef c)
243 {
244 LLVMTypeRef type = LLVMTypeOf(a);
245 assert(type == LLVMTypeOf(b));
246 assert(type == LLVMTypeOf(c));
247
248 char intrinsic[32];
249 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
250 LLVMValueRef args[] = { a, b, c };
251 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
252 }
253
254
255 /**
256 * Generate max(a, b)
257 * No checks are done for the special-case values of a or b being 0 or 1.
258 * NaNs are handled according to the behavior specified by the
259 * nan_behavior argument.
260 */
261 static LLVMValueRef
262 lp_build_max_simple(struct lp_build_context *bld,
263 LLVMValueRef a,
264 LLVMValueRef b,
265 enum gallivm_nan_behavior nan_behavior)
266 {
267 const struct lp_type type = bld->type;
268 const char *intrinsic = NULL;
269 unsigned intr_size = 0;
270 LLVMValueRef cond;
271
272 assert(lp_check_value(type, a));
273 assert(lp_check_value(type, b));
274
275 /* TODO: optimize the constant case */
276
277 if (type.floating && util_cpu_caps.has_sse) {
278 if (type.width == 32) {
279 if (type.length == 1) {
280 intrinsic = "llvm.x86.sse.max.ss";
281 intr_size = 128;
282 }
283 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
284 intrinsic = "llvm.x86.sse.max.ps";
285 intr_size = 128;
286 }
287 else {
288 intrinsic = "llvm.x86.avx.max.ps.256";
289 intr_size = 256;
290 }
291 }
292 if (type.width == 64 && util_cpu_caps.has_sse2) {
293 if (type.length == 1) {
294 intrinsic = "llvm.x86.sse2.max.sd";
295 intr_size = 128;
296 }
297 else if (type.length == 2 || !util_cpu_caps.has_avx) {
298 intrinsic = "llvm.x86.sse2.max.pd";
299 intr_size = 128;
300 }
301 else {
302 intrinsic = "llvm.x86.avx.max.pd.256";
303 intr_size = 256;
304 }
305 }
306 }
307 else if (type.floating && util_cpu_caps.has_altivec) {
308 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
309 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
310 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
311 __FUNCTION__);
312 }
313 if (type.width == 32 && type.length == 4) {
314 intrinsic = "llvm.ppc.altivec.vmaxfp";
315 intr_size = 128;
316 }
317 } else if (util_cpu_caps.has_altivec) {
318 intr_size = 128;
319 if (type.width == 8) {
320 if (!type.sign) {
321 intrinsic = "llvm.ppc.altivec.vmaxub";
322 } else {
323 intrinsic = "llvm.ppc.altivec.vmaxsb";
324 }
325 } else if (type.width == 16) {
326 if (!type.sign) {
327 intrinsic = "llvm.ppc.altivec.vmaxuh";
328 } else {
329 intrinsic = "llvm.ppc.altivec.vmaxsh";
330 }
331 } else if (type.width == 32) {
332 if (!type.sign) {
333 intrinsic = "llvm.ppc.altivec.vmaxuw";
334 } else {
335 intrinsic = "llvm.ppc.altivec.vmaxsw";
336 }
337 }
338 }
339
340 if (intrinsic) {
341 if (util_cpu_caps.has_sse && type.floating &&
342 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
343 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
344 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
345 LLVMValueRef isnan, max;
346 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
347 type,
348 intr_size, a, b);
349 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
350 isnan = lp_build_isnan(bld, b);
351 return lp_build_select(bld, isnan, a, max);
352 } else {
353 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
354 isnan = lp_build_isnan(bld, a);
355 return lp_build_select(bld, isnan, a, max);
356 }
357 } else {
358 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
359 type,
360 intr_size, a, b);
361 }
362 }
363
364 if (type.floating) {
365 switch (nan_behavior) {
366 case GALLIVM_NAN_RETURN_NAN: {
367 LLVMValueRef isnan = lp_build_isnan(bld, b);
368 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
369 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
370 return lp_build_select(bld, cond, a, b);
371 }
372 break;
373 case GALLIVM_NAN_RETURN_OTHER: {
374 LLVMValueRef isnan = lp_build_isnan(bld, a);
375 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
376 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
377 return lp_build_select(bld, cond, a, b);
378 }
379 break;
380 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
381 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
382 return lp_build_select(bld, cond, a, b);
383 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
384 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
385 return lp_build_select(bld, cond, b, a);
386 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
387 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
388 return lp_build_select(bld, cond, a, b);
389 break;
390 default:
391 assert(0);
392 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
393 return lp_build_select(bld, cond, a, b);
394 }
395 } else {
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 return lp_build_select(bld, cond, a, b);
398 }
399 }
400
401
402 /**
403 * Generate 1 - a, or ~a depending on bld->type.
404 */
405 LLVMValueRef
406 lp_build_comp(struct lp_build_context *bld,
407 LLVMValueRef a)
408 {
409 LLVMBuilderRef builder = bld->gallivm->builder;
410 const struct lp_type type = bld->type;
411
412 assert(lp_check_value(type, a));
413
414 if(a == bld->one)
415 return bld->zero;
416 if(a == bld->zero)
417 return bld->one;
418
419 if(type.norm && !type.floating && !type.fixed && !type.sign) {
420 if(LLVMIsConstant(a))
421 return LLVMConstNot(a);
422 else
423 return LLVMBuildNot(builder, a, "");
424 }
425
426 if(LLVMIsConstant(a))
427 if (type.floating)
428 return LLVMConstFSub(bld->one, a);
429 else
430 return LLVMConstSub(bld->one, a);
431 else
432 if (type.floating)
433 return LLVMBuildFSub(builder, bld->one, a, "");
434 else
435 return LLVMBuildSub(builder, bld->one, a, "");
436 }
437
438
439 /**
440 * Generate a + b
441 */
442 LLVMValueRef
443 lp_build_add(struct lp_build_context *bld,
444 LLVMValueRef a,
445 LLVMValueRef b)
446 {
447 LLVMBuilderRef builder = bld->gallivm->builder;
448 const struct lp_type type = bld->type;
449 LLVMValueRef res;
450
451 assert(lp_check_value(type, a));
452 assert(lp_check_value(type, b));
453
454 if (a == bld->zero)
455 return b;
456 if (b == bld->zero)
457 return a;
458 if (a == bld->undef || b == bld->undef)
459 return bld->undef;
460
461 if (type.norm) {
462 const char *intrinsic = NULL;
463
464 if (!type.sign && (a == bld->one || b == bld->one))
465 return bld->one;
466
467 if (!type.floating && !type.fixed) {
468 if (LLVM_VERSION_MAJOR >= 8) {
469 char intrin[32];
470 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
471 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
472 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
473 }
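/*
 * Illustrative note: lp_format_intrinsic appends the vector type suffix to
 * the generic name above, so a signed 16 x i8 normalized type would emit a
 * call to something like "llvm.sadd.sat.v16i8" (example name, shown here
 * only to illustrate the mangling).
 */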
474 if (type.width * type.length == 128) {
475 if (util_cpu_caps.has_sse2) {
476 if (type.width == 8)
477 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
478 if (type.width == 16)
479 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
480 } else if (util_cpu_caps.has_altivec) {
481 if (type.width == 8)
482 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
483 if (type.width == 16)
484 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
485 }
486 }
487 if (type.width * type.length == 256) {
488 if (util_cpu_caps.has_avx2) {
489 if (type.width == 8)
490 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
491 if (type.width == 16)
492 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
493 }
494 }
495 }
496
497 if (intrinsic)
498 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
499 }
500
501 if(type.norm && !type.floating && !type.fixed) {
502 if (type.sign) {
503 uint64_t sign = (uint64_t)1 << (type.width - 1);
504 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
505 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
506 /* a_clamp_max is the maximum a for positive b,
507 a_clamp_min is the minimum a for negative b. */
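/*
 * Illustrative example (values chosen for illustration only): for an 8-bit
 * signed type, max_val = 127 and min_val = -128, so with b = 100 the clamp
 * below restricts a to at most 127 - 100 = 27, which guarantees the plain
 * add further down cannot overflow past 127.
 */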
508 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
509 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
510 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
511 }
512 }
513
514 if(LLVMIsConstant(a) && LLVMIsConstant(b))
515 if (type.floating)
516 res = LLVMConstFAdd(a, b);
517 else
518 res = LLVMConstAdd(a, b);
519 else
520 if (type.floating)
521 res = LLVMBuildFAdd(builder, a, b, "");
522 else
523 res = LLVMBuildAdd(builder, a, b, "");
524
525 /* clamp to ceiling of 1.0 */
526 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
527 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
528
529 if (type.norm && !type.floating && !type.fixed) {
530 if (!type.sign) {
531 /*
532 * newer llvm versions no longer support the intrinsics, but recognize
533 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
534 * code, it is important we match the pattern llvm uses (and pray llvm
535 * doesn't change it - and hope they decide on the same pattern for
536 * all backends supporting it...).
537 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
538 * interfere with llvm's ability to recognize the pattern but seems
539 * a bit brittle.
540 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
541 */
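/*
 * Illustrative scalar equivalent of the pattern (sketch, not taken from the
 * original comments): given the wrapping sum res = a + b computed above,
 *    res = (res < a) ? ~0 : res;
 * i.e. clamp to all-ones on overflow, which is the idiom llvm recognizes as
 * an unsigned saturating add.
 */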
542 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
543 res = lp_build_select(bld, overflowed,
544 LLVMConstAllOnes(bld->int_vec_type), res);
545 }
546 }
547
548 /* XXX clamp to floor of -1 or 0??? */
549
550 return res;
551 }
552
553
554 /** Return the scalar sum of the elements of a.
555 * Callers should avoid this operation whenever possible.
556 */
557 LLVMValueRef
558 lp_build_horizontal_add(struct lp_build_context *bld,
559 LLVMValueRef a)
560 {
561 LLVMBuilderRef builder = bld->gallivm->builder;
562 const struct lp_type type = bld->type;
563 LLVMValueRef index, res;
564 unsigned i, length;
565 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
566 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
567 LLVMValueRef vecres, elem2;
568
569 assert(lp_check_value(type, a));
570
571 if (type.length == 1) {
572 return a;
573 }
574
575 assert(!bld->type.norm);
576
577 /*
578 * For byte vectors we could do much better with psadbw.
579 * Using repeated shuffle/adds here. Note that with multiple vectors
580 * this can be done more efficiently as outlined in the Intel
581 * Optimization Manual.
582 * Note: could cause data rearrangement if used with smaller element
583 * sizes.
584 */
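/*
 * Illustrative trace for a length-4 vector {a0, a1, a2, a3}: the loop below
 * first forms {a0, a1} + {a2, a3} = {a0+a2, a1+a3}, then the two remaining
 * elements are extracted and added to give a0+a1+a2+a3.
 */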
585
586 vecres = a;
587 length = type.length / 2;
588 while (length > 1) {
589 LLVMValueRef vec1, vec2;
590 for (i = 0; i < length; i++) {
591 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
592 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
593 }
594 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
595 LLVMConstVector(shuffles1, length), "");
596 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
597 LLVMConstVector(shuffles2, length), "");
598 if (type.floating) {
599 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
600 }
601 else {
602 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
603 }
604 length = length >> 1;
605 }
606
607 /* always have vector of size 2 here */
608 assert(length == 1);
609
610 index = lp_build_const_int32(bld->gallivm, 0);
611 res = LLVMBuildExtractElement(builder, vecres, index, "");
612 index = lp_build_const_int32(bld->gallivm, 1);
613 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
614
615 if (type.floating)
616 res = LLVMBuildFAdd(builder, res, elem2, "");
617 else
618 res = LLVMBuildAdd(builder, res, elem2, "");
619
620 return res;
621 }
622
623 /**
624 * Return the horizontal sums of 4 float vectors as a float4 vector.
625 * This uses the technique outlined in the Intel Optimization Manual.
626 */
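/*
 * Illustrative data flow for inputs x, y, z, w (element-by-element sketch):
 *   tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
 *   tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
 *   sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *   sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 * and the final shuffle/add pairs these up into {sum x, sum y, sum z, sum w}.
 */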
627 static LLVMValueRef
628 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
629 LLVMValueRef src[4])
630 {
631 struct gallivm_state *gallivm = bld->gallivm;
632 LLVMBuilderRef builder = gallivm->builder;
633 LLVMValueRef shuffles[4];
634 LLVMValueRef tmp[4];
635 LLVMValueRef sumtmp[2], shuftmp[2];
636
637 /* lower half of regs */
638 shuffles[0] = lp_build_const_int32(gallivm, 0);
639 shuffles[1] = lp_build_const_int32(gallivm, 1);
640 shuffles[2] = lp_build_const_int32(gallivm, 4);
641 shuffles[3] = lp_build_const_int32(gallivm, 5);
642 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
643 LLVMConstVector(shuffles, 4), "");
644 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
645 LLVMConstVector(shuffles, 4), "");
646
647 /* upper half of regs */
648 shuffles[0] = lp_build_const_int32(gallivm, 2);
649 shuffles[1] = lp_build_const_int32(gallivm, 3);
650 shuffles[2] = lp_build_const_int32(gallivm, 6);
651 shuffles[3] = lp_build_const_int32(gallivm, 7);
652 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
653 LLVMConstVector(shuffles, 4), "");
654 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
655 LLVMConstVector(shuffles, 4), "");
656
657 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
658 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
659
660 shuffles[0] = lp_build_const_int32(gallivm, 0);
661 shuffles[1] = lp_build_const_int32(gallivm, 2);
662 shuffles[2] = lp_build_const_int32(gallivm, 4);
663 shuffles[3] = lp_build_const_int32(gallivm, 6);
664 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
665 LLVMConstVector(shuffles, 4), "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 1);
668 shuffles[1] = lp_build_const_int32(gallivm, 3);
669 shuffles[2] = lp_build_const_int32(gallivm, 5);
670 shuffles[3] = lp_build_const_int32(gallivm, 7);
671 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
675 }
676
677
678 /*
679 * partially horizontally add 2-4 float vectors with length nx4,
680 * i.e. only four adjacent values in each vector will be added,
681 * assuming values are really grouped in 4 which also determines
682 * output order.
683 *
684 * Return a vector of the same length as the initial vectors,
685 * with the excess elements (if any) being undefined.
686 * The element order is independent of number of input vectors.
687 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
688 * the output order thus will be
689 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
690 */
691 LLVMValueRef
692 lp_build_hadd_partial4(struct lp_build_context *bld,
693 LLVMValueRef vectors[],
694 unsigned num_vecs)
695 {
696 struct gallivm_state *gallivm = bld->gallivm;
697 LLVMBuilderRef builder = gallivm->builder;
698 LLVMValueRef ret_vec;
699 LLVMValueRef tmp[4];
700 const char *intrinsic = NULL;
701
702 assert(num_vecs >= 2 && num_vecs <= 4);
703 assert(bld->type.floating);
704
705 /* only use this with at least 2 vectors, as it is sort of expensive
706 * (depending on cpu) and we always need two horizontal adds anyway,
707 * so a shuffle/add approach might be better.
708 */
709
710 tmp[0] = vectors[0];
711 tmp[1] = vectors[1];
712
713 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
714 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
715
716 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
717 bld->type.length == 4) {
718 intrinsic = "llvm.x86.sse3.hadd.ps";
719 }
720 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
721 bld->type.length == 8) {
722 intrinsic = "llvm.x86.avx.hadd.ps.256";
723 }
724 if (intrinsic) {
725 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
726 lp_build_vec_type(gallivm, bld->type),
727 tmp[0], tmp[1]);
728 if (num_vecs > 2) {
729 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
730 lp_build_vec_type(gallivm, bld->type),
731 tmp[2], tmp[3]);
732 }
733 else {
734 tmp[1] = tmp[0];
735 }
736 return lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[0], tmp[1]);
739 }
740
741 if (bld->type.length == 4) {
742 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
743 }
744 else {
745 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
746 unsigned j;
747 unsigned num_iter = bld->type.length / 4;
748 struct lp_type parttype = bld->type;
749 parttype.length = 4;
750 for (j = 0; j < num_iter; j++) {
751 LLVMValueRef partsrc[4];
752 unsigned i;
753 for (i = 0; i < 4; i++) {
754 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
755 }
756 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
757 }
758 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
759 }
760 return ret_vec;
761 }
762
763 /**
764 * Generate a - b
765 */
766 LLVMValueRef
767 lp_build_sub(struct lp_build_context *bld,
768 LLVMValueRef a,
769 LLVMValueRef b)
770 {
771 LLVMBuilderRef builder = bld->gallivm->builder;
772 const struct lp_type type = bld->type;
773 LLVMValueRef res;
774
775 assert(lp_check_value(type, a));
776 assert(lp_check_value(type, b));
777
778 if (b == bld->zero)
779 return a;
780 if (a == bld->undef || b == bld->undef)
781 return bld->undef;
782 if (a == b)
783 return bld->zero;
784
785 if (type.norm) {
786 const char *intrinsic = NULL;
787
788 if (!type.sign && b == bld->one)
789 return bld->zero;
790
791 if (!type.floating && !type.fixed) {
792 if (LLVM_VERSION_MAJOR >= 8) {
793 char intrin[32];
794 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
795 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
796 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
797 }
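/*
 * Illustrative note: as in lp_build_add, the generic intrinsic name above
 * gets the vector type suffix appended, e.g. something like
 * "llvm.usub.sat.v8i16" for an unsigned 8 x i16 type (example name only).
 */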
798 if (type.width * type.length == 128) {
799 if (util_cpu_caps.has_sse2) {
800 if (type.width == 8)
801 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
802 if (type.width == 16)
803 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
804 } else if (util_cpu_caps.has_altivec) {
805 if (type.width == 8)
806 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
807 if (type.width == 16)
808 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
809 }
810 }
811 if (type.width * type.length == 256) {
812 if (util_cpu_caps.has_avx2) {
813 if (type.width == 8)
814 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
815 if (type.width == 16)
816 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
817 }
818 }
819 }
820
821 if (intrinsic)
822 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
823 }
824
825 if(type.norm && !type.floating && !type.fixed) {
826 if (type.sign) {
827 uint64_t sign = (uint64_t)1 << (type.width - 1);
828 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
829 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
830 /* a_clamp_max is the maximum a for negative b,
831 a_clamp_min is the minimum a for positive b. */
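/*
 * Illustrative example (values chosen for illustration only): for an 8-bit
 * signed type with b = 100, the clamp below keeps a at no less than
 * -128 + 100 = -28, so a - b cannot underflow past -128.
 */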
832 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
833 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
834 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
835 } else {
836 /*
837 * This must match llvm pattern for saturated unsigned sub.
838 * (lp_build_max_simple actually does the job with its current
839 * definition but do it explicitly here.)
840 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
841 * interfere with llvm's ability to recognize the pattern but seems
842 * a bit brittle.
843 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
844 */
845 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
846 a = lp_build_select(bld, no_ov, a, b);
847 }
848 }
849
850 if(LLVMIsConstant(a) && LLVMIsConstant(b))
851 if (type.floating)
852 res = LLVMConstFSub(a, b);
853 else
854 res = LLVMConstSub(a, b);
855 else
856 if (type.floating)
857 res = LLVMBuildFSub(builder, a, b, "");
858 else
859 res = LLVMBuildSub(builder, a, b, "");
860
861 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
862 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
863
864 return res;
865 }
866
867
868
869 /**
870 * Normalized multiplication.
871 *
872 * There are several approaches for (using 8-bit normalized multiplication as
873 * an example):
874 *
875 * - alpha plus one
876 *
877 * makes the following approximation to the division (Sree)
878 *
879 * a*b/255 ~= (a*(b + 1)) >> 8
880 *
881 * which is the fastest method that satisfies the following OpenGL criteria of
882 *
883 * 0*0 = 0 and 255*255 = 255
884 *
885 * - geometric series
886 *
887 * takes the geometric series approximation to the division
888 *
889 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
890 *
891 * in this case just the first two terms to fit in 16bit arithmetic
892 *
893 * t/255 ~= (t + (t >> 8)) >> 8
894 *
895 * note that just by itself it doesn't satisfy the OpenGL criteria, as it
896 * gives 255*255 = 254, so the special case b = 255 must be accounted for
897 * or roundoff must be used.
898 *
899 * - geometric series plus rounding
900 *
901 * when using a geometric series division instead of truncating the result
902 * use roundoff in the approximation (Jim Blinn)
903 *
904 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
905 *
906 * achieving exact results.
907 *
908 *
909 *
910 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
911 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
912 * @sa Michael Herf, The "double blend trick", May 2000,
913 * http://www.stereopsis.com/doubleblend.html
914 */
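/*
 * Worked example of the rounded approximation above (illustration only):
 * for a = b = 255, t = 65025 and t >> 8 = 254, so
 * (65025 + 254 + 0x80) >> 8 = 65407 >> 8 = 255, matching the exact result.
 */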
915 LLVMValueRef
916 lp_build_mul_norm(struct gallivm_state *gallivm,
917 struct lp_type wide_type,
918 LLVMValueRef a, LLVMValueRef b)
919 {
920 LLVMBuilderRef builder = gallivm->builder;
921 struct lp_build_context bld;
922 unsigned n;
923 LLVMValueRef half;
924 LLVMValueRef ab;
925
926 assert(!wide_type.floating);
927 assert(lp_check_value(wide_type, a));
928 assert(lp_check_value(wide_type, b));
929
930 lp_build_context_init(&bld, gallivm, wide_type);
931
932 n = wide_type.width / 2;
933 if (wide_type.sign) {
934 --n;
935 }
936
937 /*
938 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
939 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
940 */
941
942 /*
943 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
944 */
945
946 ab = LLVMBuildMul(builder, a, b, "");
947 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
948
949 /*
950 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
951 */
952
953 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
954 if (wide_type.sign) {
955 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
956 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
957 half = lp_build_select(&bld, sign, minus_half, half);
958 }
959 ab = LLVMBuildAdd(builder, ab, half, "");
960
961 /* Final division */
962 ab = lp_build_shr_imm(&bld, ab, n);
963
964 return ab;
965 }
966
967 /**
968 * Generate a * b
969 */
970 LLVMValueRef
971 lp_build_mul(struct lp_build_context *bld,
972 LLVMValueRef a,
973 LLVMValueRef b)
974 {
975 LLVMBuilderRef builder = bld->gallivm->builder;
976 const struct lp_type type = bld->type;
977 LLVMValueRef shift;
978 LLVMValueRef res;
979
980 assert(lp_check_value(type, a));
981 assert(lp_check_value(type, b));
982
983 if(a == bld->zero)
984 return bld->zero;
985 if(a == bld->one)
986 return b;
987 if(b == bld->zero)
988 return bld->zero;
989 if(b == bld->one)
990 return a;
991 if(a == bld->undef || b == bld->undef)
992 return bld->undef;
993
994 if (!type.floating && !type.fixed && type.norm) {
995 struct lp_type wide_type = lp_wider_type(type);
996 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
997
998 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
999 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1000
1001 /* PMULLW, PSRLW, PADDW */
1002 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1003 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1004
1005 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1006
1007 return ab;
1008 }
1009
1010 if(type.fixed)
1011 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1012 else
1013 shift = NULL;
1014
1015 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1016 if (type.floating)
1017 res = LLVMConstFMul(a, b);
1018 else
1019 res = LLVMConstMul(a, b);
1020 if(shift) {
1021 if(type.sign)
1022 res = LLVMConstAShr(res, shift);
1023 else
1024 res = LLVMConstLShr(res, shift);
1025 }
1026 }
1027 else {
1028 if (type.floating)
1029 res = LLVMBuildFMul(builder, a, b, "");
1030 else
1031 res = LLVMBuildMul(builder, a, b, "");
1032 if(shift) {
1033 if(type.sign)
1034 res = LLVMBuildAShr(builder, res, shift, "");
1035 else
1036 res = LLVMBuildLShr(builder, res, shift, "");
1037 }
1038 }
1039
1040 return res;
1041 }
1042
1043 /*
1044 * Widening mul, valid for 32x32 bit -> 64bit only.
1045 * Result is low 32bits, high bits returned in res_hi.
1046 *
1047 * Emits code that is meant to be compiled for the host CPU.
1048 */
1049 LLVMValueRef
1050 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1051 LLVMValueRef a,
1052 LLVMValueRef b,
1053 LLVMValueRef *res_hi)
1054 {
1055 struct gallivm_state *gallivm = bld->gallivm;
1056 LLVMBuilderRef builder = gallivm->builder;
1057
1058 assert(bld->type.width == 32);
1059 assert(bld->type.floating == 0);
1060 assert(bld->type.fixed == 0);
1061 assert(bld->type.norm == 0);
1062
1063 /*
1064 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1065 * for x86 simd is atrocious (even if the high bits weren't required),
1066 * trying to handle real 64bit inputs (which of course can't happen due
1067 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1068 * apparently llvm does not recognize this widening mul). This includes 6
1069 * (instead of 2) pmuludq plus extra adds and shifts
1070 * The same story applies to signed mul, albeit fixing this requires sse41.
1071 * https://llvm.org/bugs/show_bug.cgi?id=30845
1072 * So, whip up our own code, albeit only for length 4 and 8 (which
1073 * should be good enough)...
1074 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1075 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1076 * for signed), which the fallback code does not, without this llvm
1077 * will likely still produce atrocious code.
1078 */
1079 if (LLVM_VERSION_MAJOR < 7 &&
1080 (bld->type.length == 4 || bld->type.length == 8) &&
1081 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1082 util_cpu_caps.has_sse4_1)) {
1083 const char *intrinsic = NULL;
1084 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1085 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1086 struct lp_type type_wide = lp_wider_type(bld->type);
1087 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1088 unsigned i;
1089 for (i = 0; i < bld->type.length; i += 2) {
1090 shuf[i] = lp_build_const_int32(gallivm, i+1);
1091 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1092 }
1093 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1094 aeven = a;
1095 beven = b;
1096 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1097 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1098
1099 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1100 if (bld->type.sign) {
1101 intrinsic = "llvm.x86.avx2.pmul.dq";
1102 } else {
1103 intrinsic = "llvm.x86.avx2.pmulu.dq";
1104 }
1105 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1106 wider_type, aeven, beven);
1107 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1108 wider_type, aodd, bodd);
1109 }
1110 else {
1111 /* for consistent naming look elsewhere... */
1112 if (bld->type.sign) {
1113 intrinsic = "llvm.x86.sse41.pmuldq";
1114 } else {
1115 intrinsic = "llvm.x86.sse2.pmulu.dq";
1116 }
1117 /*
1118 * XXX If we only have AVX but not AVX2 this is a pain.
1119 * lp_build_intrinsic_binary_anylength() can't handle it
1120 * (due to src and dst type not being identical).
1121 */
1122 if (bld->type.length == 8) {
1123 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1124 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1125 LLVMValueRef muleven2[2], mulodd2[2];
1126 struct lp_type type_wide_half = type_wide;
1127 LLVMTypeRef wtype_half;
1128 type_wide_half.length = 2;
1129 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1130 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1131 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1132 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1133 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1134 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1135 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1136 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1137 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1138 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1139 wtype_half, aevenlo, bevenlo);
1140 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1141 wtype_half, aoddlo, boddlo);
1142 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1143 wtype_half, aevenhi, bevenhi);
1144 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1145 wtype_half, aoddhi, boddhi);
1146 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1147 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1148
1149 }
1150 else {
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 }
1157 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1158 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1159
1160 for (i = 0; i < bld->type.length; i += 2) {
1161 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1162 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1163 }
1164 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1165 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1166
1167 for (i = 0; i < bld->type.length; i += 2) {
1168 shuf[i] = lp_build_const_int32(gallivm, i);
1169 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1170 }
1171 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1172 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1173 }
1174 else {
1175 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1176 }
1177 }
1178
1179
1180 /*
1181 * Widening mul, valid for 32x32 bit -> 64bit only.
1182 * Result is low 32bits, high bits returned in res_hi.
1183 *
1184 * Emits generic code.
1185 */
1186 LLVMValueRef
1187 lp_build_mul_32_lohi(struct lp_build_context *bld,
1188 LLVMValueRef a,
1189 LLVMValueRef b,
1190 LLVMValueRef *res_hi)
1191 {
1192 struct gallivm_state *gallivm = bld->gallivm;
1193 LLVMBuilderRef builder = gallivm->builder;
1194 LLVMValueRef tmp, shift, res_lo;
1195 struct lp_type type_tmp;
1196 LLVMTypeRef wide_type, narrow_type;
1197
1198 type_tmp = bld->type;
1199 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1200 type_tmp.width *= 2;
1201 wide_type = lp_build_vec_type(gallivm, type_tmp);
1202 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1203
1204 if (bld->type.sign) {
1205 a = LLVMBuildSExt(builder, a, wide_type, "");
1206 b = LLVMBuildSExt(builder, b, wide_type, "");
1207 } else {
1208 a = LLVMBuildZExt(builder, a, wide_type, "");
1209 b = LLVMBuildZExt(builder, b, wide_type, "");
1210 }
1211 tmp = LLVMBuildMul(builder, a, b, "");
1212
1213 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1214
1215 /* Since we truncate anyway, LShr and AShr are equivalent. */
1216 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1217 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1218
1219 return res_lo;
1220 }
1221
1222
1223 /* a * b + c */
1224 LLVMValueRef
1225 lp_build_mad(struct lp_build_context *bld,
1226 LLVMValueRef a,
1227 LLVMValueRef b,
1228 LLVMValueRef c)
1229 {
1230 const struct lp_type type = bld->type;
1231 if (type.floating) {
1232 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1233 } else {
1234 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1235 }
1236 }
1237
1238
1239 /**
1240 * Small vector x scale multiplication optimization.
1241 */
1242 LLVMValueRef
1243 lp_build_mul_imm(struct lp_build_context *bld,
1244 LLVMValueRef a,
1245 int b)
1246 {
1247 LLVMBuilderRef builder = bld->gallivm->builder;
1248 LLVMValueRef factor;
1249
1250 assert(lp_check_value(bld->type, a));
1251
1252 if(b == 0)
1253 return bld->zero;
1254
1255 if(b == 1)
1256 return a;
1257
1258 if(b == -1)
1259 return lp_build_negate(bld, a);
1260
1261 if(b == 2 && bld->type.floating)
1262 return lp_build_add(bld, a, a);
1263
1264 if(util_is_power_of_two_or_zero(b)) {
1265 unsigned shift = ffs(b) - 1;
1266
1267 if(bld->type.floating) {
1268 #if 0
1269 /*
1270 * Power of two multiplication by directly manipulating the exponent.
1271 *
1272 * XXX: This might not be always faster, it will introduce a small error
1273 * for multiplication by zero, and it will produce wrong results
1274 * for Inf and NaN.
1275 */
1276 unsigned mantissa = lp_mantissa(bld->type);
1277 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1278 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1279 a = LLVMBuildAdd(builder, a, factor, "");
1280 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1281 return a;
1282 #endif
1283 }
1284 else {
1285 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1286 return LLVMBuildShl(builder, a, factor, "");
1287 }
1288 }
1289
1290 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1291 return lp_build_mul(bld, a, factor);
1292 }
1293
1294
1295 /**
1296 * Generate a / b
1297 */
1298 LLVMValueRef
1299 lp_build_div(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b)
1302 {
1303 LLVMBuilderRef builder = bld->gallivm->builder;
1304 const struct lp_type type = bld->type;
1305
1306 assert(lp_check_value(type, a));
1307 assert(lp_check_value(type, b));
1308
1309 if(a == bld->zero)
1310 return bld->zero;
1311 if(a == bld->one && type.floating)
1312 return lp_build_rcp(bld, b);
1313 if(b == bld->zero)
1314 return bld->undef;
1315 if(b == bld->one)
1316 return a;
1317 if(a == bld->undef || b == bld->undef)
1318 return bld->undef;
1319
1320 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1321 if (type.floating)
1322 return LLVMConstFDiv(a, b);
1323 else if (type.sign)
1324 return LLVMConstSDiv(a, b);
1325 else
1326 return LLVMConstUDiv(a, b);
1327 }
1328
1329 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1330 if(FALSE &&
1331 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1332 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1333 type.floating)
1334 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1335
1336 if (type.floating)
1337 return LLVMBuildFDiv(builder, a, b, "");
1338 else if (type.sign)
1339 return LLVMBuildSDiv(builder, a, b, "");
1340 else
1341 return LLVMBuildUDiv(builder, a, b, "");
1342 }
1343
1344
1345 /**
1346 * Linear interpolation helper.
1347 *
1348 * @param flags LP_BLD_LERP_WIDE_NORMALIZED indicates we are interpolating
1349 * normalized values, encoded in integers twice as wide as the original type.
1350 *
1351 * @sa http://www.stereopsis.com/doubleblend.html
1352 */
1353 static inline LLVMValueRef
1354 lp_build_lerp_simple(struct lp_build_context *bld,
1355 LLVMValueRef x,
1356 LLVMValueRef v0,
1357 LLVMValueRef v1,
1358 unsigned flags)
1359 {
1360 unsigned half_width = bld->type.width/2;
1361 LLVMBuilderRef builder = bld->gallivm->builder;
1362 LLVMValueRef delta;
1363 LLVMValueRef res;
1364
1365 assert(lp_check_value(bld->type, x));
1366 assert(lp_check_value(bld->type, v0));
1367 assert(lp_check_value(bld->type, v1));
1368
1369 delta = lp_build_sub(bld, v1, v0);
1370
1371 if (bld->type.floating) {
1372 assert(flags == 0);
1373 return lp_build_mad(bld, x, delta, v0);
1374 }
1375
1376 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1377 if (!bld->type.sign) {
1378 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1379 /*
1380 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1381 * most significant bit to the least significant bit, so that
1382 * later we can just divide by 2**n instead of 2**n - 1.
1383 */
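/*
 * Illustrative example (numbers for illustration only): with 8-bit weights
 * unpacked to 16 bits, half_width is 8; for x = 255 the line below yields
 * 255 + (255 >> 7) = 256, so the later (x * delta) >> 8 returns delta
 * exactly at full weight.
 */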
1384
1385 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1386 }
1387
1388 /* (x * delta) >> n */
1389 res = lp_build_mul(bld, x, delta);
1390 res = lp_build_shr_imm(bld, res, half_width);
1391 } else {
1392 /*
1393 * The rescaling trick above doesn't work for signed numbers, so
1394 * use the 2**n - 1 division approximation in lp_build_mul_norm
1395 * instead.
1396 */
1397 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1398 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1399 }
1400 } else {
1401 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1402 res = lp_build_mul(bld, x, delta);
1403 }
1404
1405 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1406 /*
1407 * At this point both res and v0 only use the lower half of the bits,
1408 * the rest is zero. Instead of add / mask, do add with half wide type.
1409 */
1410 struct lp_type narrow_type;
1411 struct lp_build_context narrow_bld;
1412
1413 memset(&narrow_type, 0, sizeof narrow_type);
1414 narrow_type.sign = bld->type.sign;
1415 narrow_type.width = bld->type.width/2;
1416 narrow_type.length = bld->type.length*2;
1417
1418 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1419 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1420 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1421 res = lp_build_add(&narrow_bld, v0, res);
1422 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1423 } else {
1424 res = lp_build_add(bld, v0, res);
1425
1426 if (bld->type.fixed) {
1427 /*
1428 * We need to mask out the high order bits when lerping 8bit
1429 * normalized colors stored in 16 bits.
1430 */
1431 /* XXX: This step is necessary for lerping 8bit colors stored in
1432 * 16 bits, but it will be wrong for true fixed point use cases.
1433 * Basically we need a more powerful lp_type, capable of further
1434 * distinguishing the values interpretation from the value storage.
1435 */
1436 LLVMValueRef low_bits;
1437 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1438 res = LLVMBuildAnd(builder, res, low_bits, "");
1439 }
1440 }
1441
1442 return res;
1443 }
1444
1445
1446 /**
1447 * Linear interpolation.
1448 */
1449 LLVMValueRef
1450 lp_build_lerp(struct lp_build_context *bld,
1451 LLVMValueRef x,
1452 LLVMValueRef v0,
1453 LLVMValueRef v1,
1454 unsigned flags)
1455 {
1456 const struct lp_type type = bld->type;
1457 LLVMValueRef res;
1458
1459 assert(lp_check_value(type, x));
1460 assert(lp_check_value(type, v0));
1461 assert(lp_check_value(type, v1));
1462
1463 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1464
1465 if (type.norm) {
1466 struct lp_type wide_type;
1467 struct lp_build_context wide_bld;
1468 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1469
1470 assert(type.length >= 2);
1471
1472 /*
1473 * Create a wider integer type, enough to hold the
1474 * intermediate result of the multiplication.
1475 */
1476 memset(&wide_type, 0, sizeof wide_type);
1477 wide_type.sign = type.sign;
1478 wide_type.width = type.width*2;
1479 wide_type.length = type.length/2;
1480
1481 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1482
1483 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1484 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1485 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1486
1487 /*
1488 * Lerp both halves.
1489 */
1490
1491 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1492
1493 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1494 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1495
1496 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1497 } else {
1498 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1499 }
1500
1501 return res;
1502 }
1503
1504
1505 /**
1506 * Bilinear interpolation.
1507 *
1508 * Value indices are in v_{yx}.
1509 */
1510 LLVMValueRef
1511 lp_build_lerp_2d(struct lp_build_context *bld,
1512 LLVMValueRef x,
1513 LLVMValueRef y,
1514 LLVMValueRef v00,
1515 LLVMValueRef v01,
1516 LLVMValueRef v10,
1517 LLVMValueRef v11,
1518 unsigned flags)
1519 {
1520 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1521 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1522 return lp_build_lerp(bld, y, v0, v1, flags);
1523 }
1524
1525
1526 LLVMValueRef
1527 lp_build_lerp_3d(struct lp_build_context *bld,
1528 LLVMValueRef x,
1529 LLVMValueRef y,
1530 LLVMValueRef z,
1531 LLVMValueRef v000,
1532 LLVMValueRef v001,
1533 LLVMValueRef v010,
1534 LLVMValueRef v011,
1535 LLVMValueRef v100,
1536 LLVMValueRef v101,
1537 LLVMValueRef v110,
1538 LLVMValueRef v111,
1539 unsigned flags)
1540 {
1541 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1542 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1543 return lp_build_lerp(bld, z, v0, v1, flags);
1544 }
1545
1546
1547 /**
1548 * Generate min(a, b)
1549 * Do checks for special cases but not for NaNs.
1550 */
1551 LLVMValueRef
1552 lp_build_min(struct lp_build_context *bld,
1553 LLVMValueRef a,
1554 LLVMValueRef b)
1555 {
1556 assert(lp_check_value(bld->type, a));
1557 assert(lp_check_value(bld->type, b));
1558
1559 if(a == bld->undef || b == bld->undef)
1560 return bld->undef;
1561
1562 if(a == b)
1563 return a;
1564
1565 if (bld->type.norm) {
1566 if (!bld->type.sign) {
1567 if (a == bld->zero || b == bld->zero) {
1568 return bld->zero;
1569 }
1570 }
1571 if(a == bld->one)
1572 return b;
1573 if(b == bld->one)
1574 return a;
1575 }
1576
1577 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1578 }
1579
1580
1581 /**
1582 * Generate min(a, b)
1583 * NaNs are handled according to the behavior specified by the
1584 * nan_behavior argument.
1585 */
1586 LLVMValueRef
1587 lp_build_min_ext(struct lp_build_context *bld,
1588 LLVMValueRef a,
1589 LLVMValueRef b,
1590 enum gallivm_nan_behavior nan_behavior)
1591 {
1592 assert(lp_check_value(bld->type, a));
1593 assert(lp_check_value(bld->type, b));
1594
1595 if(a == bld->undef || b == bld->undef)
1596 return bld->undef;
1597
1598 if(a == b)
1599 return a;
1600
1601 if (bld->type.norm) {
1602 if (!bld->type.sign) {
1603 if (a == bld->zero || b == bld->zero) {
1604 return bld->zero;
1605 }
1606 }
1607 if(a == bld->one)
1608 return b;
1609 if(b == bld->one)
1610 return a;
1611 }
1612
1613 return lp_build_min_simple(bld, a, b, nan_behavior);
1614 }
1615
1616 /**
1617 * Generate max(a, b)
1618 * Do checks for special cases, but NaN behavior is undefined.
1619 */
1620 LLVMValueRef
1621 lp_build_max(struct lp_build_context *bld,
1622 LLVMValueRef a,
1623 LLVMValueRef b)
1624 {
1625 assert(lp_check_value(bld->type, a));
1626 assert(lp_check_value(bld->type, b));
1627
1628 if(a == bld->undef || b == bld->undef)
1629 return bld->undef;
1630
1631 if(a == b)
1632 return a;
1633
1634 if(bld->type.norm) {
1635 if(a == bld->one || b == bld->one)
1636 return bld->one;
1637 if (!bld->type.sign) {
1638 if (a == bld->zero) {
1639 return b;
1640 }
1641 if (b == bld->zero) {
1642 return a;
1643 }
1644 }
1645 }
1646
1647 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1648 }
1649
1650
1651 /**
1652 * Generate max(a, b)
1653 * Checks for special cases.
1654 * NaNs are handled according to the behavior specified by the
1655 * nan_behavior argument.
1656 */
1657 LLVMValueRef
1658 lp_build_max_ext(struct lp_build_context *bld,
1659 LLVMValueRef a,
1660 LLVMValueRef b,
1661 enum gallivm_nan_behavior nan_behavior)
1662 {
1663 assert(lp_check_value(bld->type, a));
1664 assert(lp_check_value(bld->type, b));
1665
1666 if(a == bld->undef || b == bld->undef)
1667 return bld->undef;
1668
1669 if(a == b)
1670 return a;
1671
1672 if(bld->type.norm) {
1673 if(a == bld->one || b == bld->one)
1674 return bld->one;
1675 if (!bld->type.sign) {
1676 if (a == bld->zero) {
1677 return b;
1678 }
1679 if (b == bld->zero) {
1680 return a;
1681 }
1682 }
1683 }
1684
1685 return lp_build_max_simple(bld, a, b, nan_behavior);
1686 }
1687
1688 /**
1689 * Generate clamp(a, min, max)
1690 * NaN behavior (for any of a, min, max) is undefined.
1691 * Do checks for special cases.
1692 */
1693 LLVMValueRef
1694 lp_build_clamp(struct lp_build_context *bld,
1695 LLVMValueRef a,
1696 LLVMValueRef min,
1697 LLVMValueRef max)
1698 {
1699 assert(lp_check_value(bld->type, a));
1700 assert(lp_check_value(bld->type, min));
1701 assert(lp_check_value(bld->type, max));
1702
1703 a = lp_build_min(bld, a, max);
1704 a = lp_build_max(bld, a, min);
1705 return a;
1706 }
1707
1708
1709 /**
1710 * Generate clamp(a, 0, 1)
1711 * A NaN will get converted to zero.
1712 */
1713 LLVMValueRef
1714 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1715 LLVMValueRef a)
1716 {
1717 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1718 a = lp_build_min(bld, a, bld->one);
1719 return a;
1720 }
1721
1722
1723 /**
1724 * Generate abs(a)
1725 */
1726 LLVMValueRef
1727 lp_build_abs(struct lp_build_context *bld,
1728 LLVMValueRef a)
1729 {
1730 LLVMBuilderRef builder = bld->gallivm->builder;
1731 const struct lp_type type = bld->type;
1732 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1733
1734 assert(lp_check_value(type, a));
1735
1736 if(!type.sign)
1737 return a;
1738
1739 if(type.floating) {
1740 char intrinsic[32];
1741 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1742 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1743 }
1744
1745 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1746 switch(type.width) {
1747 case 8:
1748 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1749 case 16:
1750 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1751 case 32:
1752 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1753 }
1754 }
1755 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
1756 switch(type.width) {
1757 case 8:
1758 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1759 case 16:
1760 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1761 case 32:
1762 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1763 }
1764 }
1765
1766 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1767 a, LLVMBuildNeg(builder, a, ""));
1768 }
1769
1770
1771 LLVMValueRef
1772 lp_build_negate(struct lp_build_context *bld,
1773 LLVMValueRef a)
1774 {
1775 LLVMBuilderRef builder = bld->gallivm->builder;
1776
1777 assert(lp_check_value(bld->type, a));
1778
1779 if (bld->type.floating)
1780 a = LLVMBuildFNeg(builder, a, "");
1781 else
1782 a = LLVMBuildNeg(builder, a, "");
1783
1784 return a;
1785 }
1786
1787
1788 /** Return -1, 0 or +1 depending on the sign of a */
1789 LLVMValueRef
1790 lp_build_sgn(struct lp_build_context *bld,
1791 LLVMValueRef a)
1792 {
1793 LLVMBuilderRef builder = bld->gallivm->builder;
1794 const struct lp_type type = bld->type;
1795 LLVMValueRef cond;
1796 LLVMValueRef res;
1797
1798 assert(lp_check_value(type, a));
1799
1800 /* Handle non-zero case */
1801 if(!type.sign) {
1802 /* if not zero then sign must be positive */
1803 res = bld->one;
1804 }
1805 else if(type.floating) {
1806 LLVMTypeRef vec_type;
1807 LLVMTypeRef int_type;
1808 LLVMValueRef mask;
1809 LLVMValueRef sign;
1810 LLVMValueRef one;
1811 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1812
1813 int_type = lp_build_int_vec_type(bld->gallivm, type);
1814 vec_type = lp_build_vec_type(bld->gallivm, type);
1815 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1816
1817 /* Take the sign bit and add it to 1 constant */
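/*
 * Illustrative example (values for illustration only): for 32-bit floats
 * bld->one has the bit pattern 0x3f800000; OR-ing in a sign bit of
 * 0x80000000 (e.g. from a = -3.5f) gives 0xbf800000, i.e. -1.0f.
 */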
1818 sign = LLVMBuildBitCast(builder, a, int_type, "");
1819 sign = LLVMBuildAnd(builder, sign, mask, "");
1820 one = LLVMConstBitCast(bld->one, int_type);
1821 res = LLVMBuildOr(builder, sign, one, "");
1822 res = LLVMBuildBitCast(builder, res, vec_type, "");
1823 }
1824 else
1825 {
1826 /* signed int/norm/fixed point */
1827 /* could use psign with ssse3 and appropriate vectors here */
1828 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1829 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1830 res = lp_build_select(bld, cond, bld->one, minus_one);
1831 }
1832
1833 /* Handle zero */
1834 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1835 res = lp_build_select(bld, cond, bld->zero, res);
1836
1837 return res;
1838 }
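
/*
 * Worked example for the floating point branch above (illustrative only):
 * for a = -3.5f the extracted sign bit, OR'ed into the bits of 1.0f, yields
 * -1.0f; the final equality test with zero then forces sgn(0.0) (and
 * sgn(-0.0)) to 0.
 */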
1839
1840
1841 /**
1842 * Set the sign of float vector 'a' according to 'sign'.
1843 * If sign==0, return abs(a).
1844 * If sign==1, return -abs(a);
1845 * Other values for sign produce undefined results.
1846 */
1847 LLVMValueRef
1848 lp_build_set_sign(struct lp_build_context *bld,
1849 LLVMValueRef a, LLVMValueRef sign)
1850 {
1851 LLVMBuilderRef builder = bld->gallivm->builder;
1852 const struct lp_type type = bld->type;
1853 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1854 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1855 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1856 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1857 ~((unsigned long long) 1 << (type.width - 1)));
1858 LLVMValueRef val, res;
1859
1860 assert(type.floating);
1861 assert(lp_check_value(type, a));
1862
1863 /* val = reinterpret_cast<int>(a) */
1864 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1865 /* val = val & mask */
1866 val = LLVMBuildAnd(builder, val, mask, "");
1867 /* sign = sign << shift */
1868 sign = LLVMBuildShl(builder, sign, shift, "");
1869 /* res = val | sign */
1870 res = LLVMBuildOr(builder, val, sign, "");
1871 /* res = reinterpret_cast<float>(res) */
1872 res = LLVMBuildBitCast(builder, res, vec_type, "");
1873
1874 return res;
1875 }
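
/*
 * Worked example (illustrative only): lp_build_set_sign(bld, a, sign) with
 * a = 2.5f and sign = 1 clears the sign bit of a, shifts the 1 up into the
 * sign bit position and ORs it back in, giving -2.5f; with sign = 0 it
 * simply returns abs(a), as described in the comment above.
 */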
1876
1877
1878 /**
1879 * Convert vector of (or scalar) int to vector of (or scalar) float.
1880 */
1881 LLVMValueRef
1882 lp_build_int_to_float(struct lp_build_context *bld,
1883 LLVMValueRef a)
1884 {
1885 LLVMBuilderRef builder = bld->gallivm->builder;
1886 const struct lp_type type = bld->type;
1887 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1888
1889 assert(type.floating);
1890
1891 return LLVMBuildSIToFP(builder, a, vec_type, "");
1892 }
1893
1894 static boolean
1895 arch_rounding_available(const struct lp_type type)
1896 {
1897 if ((util_cpu_caps.has_sse4_1 &&
1898 (type.length == 1 || type.width*type.length == 128)) ||
1899 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1900 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1901 return TRUE;
1902 else if ((util_cpu_caps.has_altivec &&
1903 (type.width == 32 && type.length == 4)))
1904 return TRUE;
1905 else if (util_cpu_caps.has_neon)
1906 return TRUE;
1907
1908 return FALSE;
1909 }
1910
1911 enum lp_build_round_mode
1912 {
1913 LP_BUILD_ROUND_NEAREST = 0,
1914 LP_BUILD_ROUND_FLOOR = 1,
1915 LP_BUILD_ROUND_CEIL = 2,
1916 LP_BUILD_ROUND_TRUNCATE = 3
1917 };
1918
1919 static inline LLVMValueRef
1920 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1921 LLVMValueRef a)
1922 {
1923 LLVMBuilderRef builder = bld->gallivm->builder;
1924 const struct lp_type type = bld->type;
1925 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1926 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1927 const char *intrinsic;
1928 LLVMValueRef res;
1929
1930 assert(type.floating);
1931 /* using the double precision conversions is a bit more complicated */
1932 assert(type.width == 32);
1933
1934 assert(lp_check_value(type, a));
1935 assert(util_cpu_caps.has_sse2);
1936
1937 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1938 if (type.length == 1) {
1939 LLVMTypeRef vec_type;
1940 LLVMValueRef undef;
1941 LLVMValueRef arg;
1942 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1943
1944 vec_type = LLVMVectorType(bld->elem_type, 4);
1945
1946 intrinsic = "llvm.x86.sse.cvtss2si";
1947
1948 undef = LLVMGetUndef(vec_type);
1949
1950 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1951
1952 res = lp_build_intrinsic_unary(builder, intrinsic,
1953 ret_type, arg);
1954 }
1955 else {
1956 if (type.width* type.length == 128) {
1957 intrinsic = "llvm.x86.sse2.cvtps2dq";
1958 }
1959 else {
1960 assert(type.width*type.length == 256);
1961 assert(util_cpu_caps.has_avx);
1962
1963 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1964 }
1965 res = lp_build_intrinsic_unary(builder, intrinsic,
1966 ret_type, a);
1967 }
1968
1969 return res;
1970 }
1971
1972
1973 /* Round the given vector of floats according to the requested mode,
1974  * using the AltiVec vrfi* round-to-integral intrinsics. */
1975 static inline LLVMValueRef
1976 lp_build_round_altivec(struct lp_build_context *bld,
1977 LLVMValueRef a,
1978 enum lp_build_round_mode mode)
1979 {
1980 LLVMBuilderRef builder = bld->gallivm->builder;
1981 const struct lp_type type = bld->type;
1982 const char *intrinsic = NULL;
1983
1984 assert(type.floating);
1985
1986 assert(lp_check_value(type, a));
1987 assert(util_cpu_caps.has_altivec);
1988
1989 (void)type;
1990
1991 switch (mode) {
1992 case LP_BUILD_ROUND_NEAREST:
1993 intrinsic = "llvm.ppc.altivec.vrfin";
1994 break;
1995 case LP_BUILD_ROUND_FLOOR:
1996 intrinsic = "llvm.ppc.altivec.vrfim";
1997 break;
1998 case LP_BUILD_ROUND_CEIL:
1999 intrinsic = "llvm.ppc.altivec.vrfip";
2000 break;
2001 case LP_BUILD_ROUND_TRUNCATE:
2002 intrinsic = "llvm.ppc.altivec.vrfiz";
2003 break;
2004 }
2005
2006 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2007 }
2008
2009 static inline LLVMValueRef
2010 lp_build_round_arch(struct lp_build_context *bld,
2011 LLVMValueRef a,
2012 enum lp_build_round_mode mode)
2013 {
2014 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2015 LLVMBuilderRef builder = bld->gallivm->builder;
2016 const struct lp_type type = bld->type;
2017 const char *intrinsic_root;
2018 char intrinsic[32];
2019
2020 assert(type.floating);
2021 assert(lp_check_value(type, a));
2022 (void)type;
2023
2024 switch (mode) {
2025 case LP_BUILD_ROUND_NEAREST:
2026 intrinsic_root = "llvm.nearbyint";
2027 break;
2028 case LP_BUILD_ROUND_FLOOR:
2029 intrinsic_root = "llvm.floor";
2030 break;
2031 case LP_BUILD_ROUND_CEIL:
2032 intrinsic_root = "llvm.ceil";
2033 break;
2034 case LP_BUILD_ROUND_TRUNCATE:
2035 intrinsic_root = "llvm.trunc";
2036 break;
2037 }
2038
2039 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2040 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2041 }
2042 else /* (util_cpu_caps.has_altivec) */
2043 return lp_build_round_altivec(bld, a, mode);
2044 }
2045
2046 /**
2047 * Return the integer part of a float (vector) value (== round toward zero).
2048 * The returned value is a float (vector).
2049 * Ex: trunc(-1.5) = -1.0
2050 */
2051 LLVMValueRef
2052 lp_build_trunc(struct lp_build_context *bld,
2053 LLVMValueRef a)
2054 {
2055 LLVMBuilderRef builder = bld->gallivm->builder;
2056 const struct lp_type type = bld->type;
2057
2058 assert(type.floating);
2059 assert(lp_check_value(type, a));
2060
2061 if (arch_rounding_available(type)) {
2062 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2063 }
2064 else {
2065 const struct lp_type type = bld->type;
2066 struct lp_type inttype;
2067 struct lp_build_context intbld;
2068 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2069 LLVMValueRef trunc, res, anosign, mask;
2070 LLVMTypeRef int_vec_type = bld->int_vec_type;
2071 LLVMTypeRef vec_type = bld->vec_type;
2072
2073 assert(type.width == 32); /* might want to handle doubles at some point */
2074
2075 inttype = type;
2076 inttype.floating = 0;
2077 lp_build_context_init(&intbld, bld->gallivm, inttype);
2078
2079 /* round by truncation */
2080 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2081 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2082
2083 /* mask out sign bit */
2084 anosign = lp_build_abs(bld, a);
2085 /*
2086 * mask out all values if anosign > 2^24
2087 * This should work both for large ints (all rounding is no-op for them
2088 * because such floats are always exact) as well as special cases like
2089 * NaNs, Infs (taking advantage of the fact they use max exponent).
2090 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2091 */
2092 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2093 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2094 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2095 return lp_build_select(bld, mask, a, res);
2096 }
2097 }
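
/*
 * For reference, a scalar sketch of the fallback path above (illustrative
 * only; "trunc_ref" is a hypothetical helper, assuming <math.h>):
 *
 *    float trunc_ref(float a)
 *    {
 *       if (isnan(a) || fabsf(a) > 16777216.0f) // 2^24: already integral,
 *          return a;                            // or NaN/Inf, pass through
 *       return (float)(int)a;                   // fptosi/sitofp truncates
 *    }
 *
 * The vector code gets the same effect branchlessly with the compare/select
 * on the sign-masked bit pattern.
 */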
2098
2099
2100 /**
2101 * Return float (vector) rounded to nearest integer (vector). The returned
2102 * value is a float (vector).
2103 * Ex: round(0.9) = 1.0
2104 * Ex: round(-1.5) = -2.0
2105 */
2106 LLVMValueRef
2107 lp_build_round(struct lp_build_context *bld,
2108 LLVMValueRef a)
2109 {
2110 LLVMBuilderRef builder = bld->gallivm->builder;
2111 const struct lp_type type = bld->type;
2112
2113 assert(type.floating);
2114 assert(lp_check_value(type, a));
2115
2116 if (arch_rounding_available(type)) {
2117 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2118 }
2119 else {
2120 const struct lp_type type = bld->type;
2121 struct lp_type inttype;
2122 struct lp_build_context intbld;
2123 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2124 LLVMValueRef res, anosign, mask;
2125 LLVMTypeRef int_vec_type = bld->int_vec_type;
2126 LLVMTypeRef vec_type = bld->vec_type;
2127
2128 assert(type.width == 32); /* might want to handle doubles at some point */
2129
2130 inttype = type;
2131 inttype.floating = 0;
2132 lp_build_context_init(&intbld, bld->gallivm, inttype);
2133
2134 res = lp_build_iround(bld, a);
2135 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2136
2137 /* mask out sign bit */
2138 anosign = lp_build_abs(bld, a);
2139 /*
2140 * mask out all values if anosign > 2^24
2141 * This should work both for large ints (all rounding is no-op for them
2142 * because such floats are always exact) as well as special cases like
2143 * NaNs, Infs (taking advantage of the fact they use max exponent).
2144 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2145 */
2146 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2147 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2148 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2149 return lp_build_select(bld, mask, a, res);
2150 }
2151 }
2152
2153
2154 /**
2155 * Return floor of float (vector), result is a float (vector)
2156 * Ex: floor(1.1) = 1.0
2157 * Ex: floor(-1.1) = -2.0
2158 */
2159 LLVMValueRef
2160 lp_build_floor(struct lp_build_context *bld,
2161 LLVMValueRef a)
2162 {
2163 LLVMBuilderRef builder = bld->gallivm->builder;
2164 const struct lp_type type = bld->type;
2165
2166 assert(type.floating);
2167 assert(lp_check_value(type, a));
2168
2169 if (arch_rounding_available(type)) {
2170 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2171 }
2172 else {
2173 const struct lp_type type = bld->type;
2174 struct lp_type inttype;
2175 struct lp_build_context intbld;
2176 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2177 LLVMValueRef trunc, res, anosign, mask;
2178 LLVMTypeRef int_vec_type = bld->int_vec_type;
2179 LLVMTypeRef vec_type = bld->vec_type;
2180
2181 if (type.width != 32) {
2182 char intrinsic[32];
2183 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2184 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2185 }
2186
2187 assert(type.width == 32); /* might want to handle doubles at some point */
2188
2189 inttype = type;
2190 inttype.floating = 0;
2191 lp_build_context_init(&intbld, bld->gallivm, inttype);
2192
2193 /* round by truncation */
2194 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2195 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2196
2197 if (type.sign) {
2198 LLVMValueRef tmp;
2199
2200 /*
2201 * fix values if rounding is wrong (for non-special cases)
2202 * - this is the case if trunc > a
2203 */
2204 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2205 /* tmp = trunc > a ? 1.0 : 0.0 */
2206 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2207 tmp = lp_build_and(&intbld, mask, tmp);
2208 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2209 res = lp_build_sub(bld, res, tmp);
2210 }
2211
2212 /* mask out sign bit */
2213 anosign = lp_build_abs(bld, a);
2214 /*
2215 * mask out all values if anosign > 2^24
2216 * This should work both for large ints (all rounding is no-op for them
2217 * because such floats are always exact) as well as special cases like
2218 * NaNs, Infs (taking advantage of the fact they use max exponent).
2219 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2220 */
2221 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2222 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2223 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2224 return lp_build_select(bld, mask, a, res);
2225 }
2226 }
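
/*
 * Worked example for the sign fixup above (illustrative only): for a = -1.2
 * the truncation gives trunc = -1.0, which is greater than a, so 1.0 is
 * subtracted (via the all-ones compare mask AND'ed with the bits of 1.0),
 * giving floor(-1.2) = -2.0.  For a = 1.2 the truncation already equals the
 * floor, the mask is zero and nothing is subtracted.
 */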
2227
2228
2229 /**
2230 * Return ceiling of float (vector), returning float (vector).
2231 * Ex: ceil( 1.1) = 2.0
2232 * Ex: ceil(-1.1) = -1.0
2233 */
2234 LLVMValueRef
2235 lp_build_ceil(struct lp_build_context *bld,
2236 LLVMValueRef a)
2237 {
2238 LLVMBuilderRef builder = bld->gallivm->builder;
2239 const struct lp_type type = bld->type;
2240
2241 assert(type.floating);
2242 assert(lp_check_value(type, a));
2243
2244 if (arch_rounding_available(type)) {
2245 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2246 }
2247 else {
2248 const struct lp_type type = bld->type;
2249 struct lp_type inttype;
2250 struct lp_build_context intbld;
2251 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2252 LLVMValueRef trunc, res, anosign, mask, tmp;
2253 LLVMTypeRef int_vec_type = bld->int_vec_type;
2254 LLVMTypeRef vec_type = bld->vec_type;
2255
2256 if (type.width != 32) {
2257 char intrinsic[32];
2258 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2259 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2260 }
2261
2262 assert(type.width == 32); /* might want to handle doubles at some point */
2263
2264 inttype = type;
2265 inttype.floating = 0;
2266 lp_build_context_init(&intbld, bld->gallivm, inttype);
2267
2268 /* round by truncation */
2269 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2270 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2271
2272 /*
2273 * fix values if rounding is wrong (for non-special cases)
2274 * - this is the case if trunc < a
2275 */
2276 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2277 /* tmp = trunc < a ? 1.0 : 0.0 */
2278 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2279 tmp = lp_build_and(&intbld, mask, tmp);
2280 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2281 res = lp_build_add(bld, trunc, tmp);
2282
2283 /* mask out sign bit */
2284 anosign = lp_build_abs(bld, a);
2285 /*
2286 * mask out all values if anosign > 2^24
2287 * This should work both for large ints (all rounding is no-op for them
2288 * because such floats are always exact) as well as special cases like
2289 * NaNs, Infs (taking advantage of the fact they use max exponent).
2290 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2291 */
2292 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2293 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2294 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2295 return lp_build_select(bld, mask, a, res);
2296 }
2297 }
2298
2299
2300 /**
2301 * Return fractional part of 'a' computed as a - floor(a)
2302 * Typically used in texture coord arithmetic.
2303 */
2304 LLVMValueRef
2305 lp_build_fract(struct lp_build_context *bld,
2306 LLVMValueRef a)
2307 {
2308 assert(bld->type.floating);
2309 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2310 }
2311
2312
2313 /**
2314 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2315 * against 0.99999(9). (Will also return that value for NaNs.)
2316 */
2317 static inline LLVMValueRef
2318 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2319 {
2320 LLVMValueRef max;
2321
2322 /* this is the largest number smaller than 1.0 representable as float */
2323 max = lp_build_const_vec(bld->gallivm, bld->type,
2324 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2325 return lp_build_min_ext(bld, fract, max,
2326 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2327 }
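
/*
 * Illustrative note: for 32-bit floats lp_mantissa() is 23, so the constant
 * above evaluates to 1.0 - 2^-24 = 0.99999994 (0x3f7fffff), i.e. exactly the
 * largest float below 1.0, and the min() can therefore never round the
 * fractional part back up to 1.0.
 */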
2328
2329
2330 /**
2331 * Same as lp_build_fract, but guarantees that the result is always smaller
2332 * than one. Will also return the smaller-than-one value for infs, NaNs.
2333 */
2334 LLVMValueRef
2335 lp_build_fract_safe(struct lp_build_context *bld,
2336 LLVMValueRef a)
2337 {
2338 return clamp_fract(bld, lp_build_fract(bld, a));
2339 }
2340
2341
2342 /**
2343 * Return the integer part of a float (vector) value (== round toward zero).
2344 * The returned value is an integer (vector).
2345 * Ex: itrunc(-1.5) = -1
2346 */
2347 LLVMValueRef
2348 lp_build_itrunc(struct lp_build_context *bld,
2349 LLVMValueRef a)
2350 {
2351 LLVMBuilderRef builder = bld->gallivm->builder;
2352 const struct lp_type type = bld->type;
2353 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2354
2355 assert(type.floating);
2356 assert(lp_check_value(type, a));
2357
2358 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2359 }
2360
2361
2362 /**
2363 * Return float (vector) rounded to nearest integer (vector). The returned
2364 * value is an integer (vector).
2365 * Ex: iround(0.9) = 1
2366 * Ex: iround(-1.5) = -2
2367 */
2368 LLVMValueRef
2369 lp_build_iround(struct lp_build_context *bld,
2370 LLVMValueRef a)
2371 {
2372 LLVMBuilderRef builder = bld->gallivm->builder;
2373 const struct lp_type type = bld->type;
2374 LLVMTypeRef int_vec_type = bld->int_vec_type;
2375 LLVMValueRef res;
2376
2377 assert(type.floating);
2378
2379 assert(lp_check_value(type, a));
2380
2381 if ((util_cpu_caps.has_sse2 &&
2382 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2383 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2384 return lp_build_iround_nearest_sse2(bld, a);
2385 }
2386 if (arch_rounding_available(type)) {
2387 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2388 }
2389 else {
2390 LLVMValueRef half;
2391
2392 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2393
2394 if (type.sign) {
2395 LLVMTypeRef vec_type = bld->vec_type;
2396 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2397 (unsigned long long)1 << (type.width - 1));
2398 LLVMValueRef sign;
2399
2400 /* get sign bit */
2401 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2402 sign = LLVMBuildAnd(builder, sign, mask, "");
2403
2404 /* sign * 0.5 */
2405 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2406 half = LLVMBuildOr(builder, sign, half, "");
2407 half = LLVMBuildBitCast(builder, half, vec_type, "");
2408 }
2409
2410 res = LLVMBuildFAdd(builder, a, half, "");
2411 }
2412
2413 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2414
2415 return res;
2416 }
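
/*
 * Scalar sketch of the add-half fallback above (illustrative only;
 * "iround_ref" is a hypothetical helper, assuming <math.h>):
 *
 *    int iround_ref(float a)
 *    {
 *       float half = nextafterf(0.5f, 0.0f);   // largest float below 0.5
 *       return (int)(a + copysignf(half, a));  // add ~+/-0.5, then truncate
 *    }
 *
 * Using the largest float below 0.5 rather than 0.5 itself avoids inputs
 * just under 0.5 being pushed over the next integer by the rounding of the
 * addition.
 */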
2417
2418
2419 /**
2420 * Return floor of float (vector), result is an int (vector)
2421 * Ex: ifloor(1.1) = 1
2422 * Ex: ifloor(-1.1) = -2
2423 */
2424 LLVMValueRef
2425 lp_build_ifloor(struct lp_build_context *bld,
2426 LLVMValueRef a)
2427 {
2428 LLVMBuilderRef builder = bld->gallivm->builder;
2429 const struct lp_type type = bld->type;
2430 LLVMTypeRef int_vec_type = bld->int_vec_type;
2431 LLVMValueRef res;
2432
2433 assert(type.floating);
2434 assert(lp_check_value(type, a));
2435
2436 res = a;
2437 if (type.sign) {
2438 if (arch_rounding_available(type)) {
2439 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2440 }
2441 else {
2442 struct lp_type inttype;
2443 struct lp_build_context intbld;
2444 LLVMValueRef trunc, itrunc, mask;
2445
2446 assert(type.floating);
2447 assert(lp_check_value(type, a));
2448
2449 inttype = type;
2450 inttype.floating = 0;
2451 lp_build_context_init(&intbld, bld->gallivm, inttype);
2452
2453 /* round by truncation */
2454 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2455 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2456
2457 /*
2458 * fix values if rounding is wrong (for non-special cases)
2459 * - this is the case if trunc > a
2460 * The results of doing this with NaNs, very large values etc.
2461 * are undefined, but that is already the case anyway.
2462 */
2463 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2464 /* cheapie minus one with mask since the mask is minus one / zero */
2465 return lp_build_add(&intbld, itrunc, mask);
2466 }
2467 }
2468
2469 /* round toward zero (truncate) */
2470 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2471
2472 return res;
2473 }
2474
2475
2476 /**
2477 * Return ceiling of float (vector), returning int (vector).
2478 * Ex: iceil( 1.1) = 2
2479 * Ex: iceil(-1.1) = -1
2480 */
2481 LLVMValueRef
2482 lp_build_iceil(struct lp_build_context *bld,
2483 LLVMValueRef a)
2484 {
2485 LLVMBuilderRef builder = bld->gallivm->builder;
2486 const struct lp_type type = bld->type;
2487 LLVMTypeRef int_vec_type = bld->int_vec_type;
2488 LLVMValueRef res;
2489
2490 assert(type.floating);
2491 assert(lp_check_value(type, a));
2492
2493 if (arch_rounding_available(type)) {
2494 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2495 }
2496 else {
2497 struct lp_type inttype;
2498 struct lp_build_context intbld;
2499 LLVMValueRef trunc, itrunc, mask;
2500
2501 assert(type.floating);
2502 assert(lp_check_value(type, a));
2503
2504 inttype = type;
2505 inttype.floating = 0;
2506 lp_build_context_init(&intbld, bld->gallivm, inttype);
2507
2508 /* round by truncation */
2509 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2510 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2511
2512 /*
2513 * fix values if rounding is wrong (for non-special cases)
2514 * - this is the case if trunc < a
2515 * The results of doing this with NaNs, very large values etc.
2516 * are undefined, but that is already the case anyway.
2517 */
2518 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2519 /* cheapie plus one with mask since the mask is minus one / zero */
2520 return lp_build_sub(&intbld, itrunc, mask);
2521 }
2522
2523 /* round toward zero (truncate) */
2524 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2525
2526 return res;
2527 }
2528
2529
2530 /**
2531 * Combined ifloor() & fract().
2532 *
2533 * Preferred to calling the functions separately, as it will ensure that the
2534 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2535 */
2536 void
2537 lp_build_ifloor_fract(struct lp_build_context *bld,
2538 LLVMValueRef a,
2539 LLVMValueRef *out_ipart,
2540 LLVMValueRef *out_fpart)
2541 {
2542 LLVMBuilderRef builder = bld->gallivm->builder;
2543 const struct lp_type type = bld->type;
2544 LLVMValueRef ipart;
2545
2546 assert(type.floating);
2547 assert(lp_check_value(type, a));
2548
2549 if (arch_rounding_available(type)) {
2550 /*
2551 * floor() is easier.
2552 */
2553
2554 ipart = lp_build_floor(bld, a);
2555 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2556 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2557 }
2558 else {
2559 /*
2560 * ifloor() is easier.
2561 */
2562
2563 *out_ipart = lp_build_ifloor(bld, a);
2564 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2565 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2566 }
2567 }
2568
2569
2570 /**
2571 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2572 * always smaller than one.
2573 */
2574 void
2575 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2576 LLVMValueRef a,
2577 LLVMValueRef *out_ipart,
2578 LLVMValueRef *out_fpart)
2579 {
2580 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2581 *out_fpart = clamp_fract(bld, *out_fpart);
2582 }
2583
2584
2585 LLVMValueRef
2586 lp_build_sqrt(struct lp_build_context *bld,
2587 LLVMValueRef a)
2588 {
2589 LLVMBuilderRef builder = bld->gallivm->builder;
2590 const struct lp_type type = bld->type;
2591 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2592 char intrinsic[32];
2593
2594 assert(lp_check_value(type, a));
2595
2596 assert(type.floating);
2597 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2598
2599 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2600 }
2601
2602
2603 /**
2604 * Do one Newton-Raphson step to improve the reciprocal's precision:
2605 *
2606 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2607 *
2608 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2609 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2610 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2611 * halo. It would be necessary to clamp the argument to prevent this.
2612 *
2613 * See also:
2614 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2615 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2616 */
2617 static inline LLVMValueRef
2618 lp_build_rcp_refine(struct lp_build_context *bld,
2619 LLVMValueRef a,
2620 LLVMValueRef rcp_a)
2621 {
2622 LLVMBuilderRef builder = bld->gallivm->builder;
2623 LLVMValueRef neg_a;
2624 LLVMValueRef res;
2625
2626 neg_a = LLVMBuildFNeg(builder, a, "");
2627 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2628 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2629
2630 return res;
2631 }
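
/*
 * For reference, an illustrative expansion of the two fused multiply-adds
 * above (the same formula as in the comment, just written out):
 *
 *    res = 1 - a * rcp_a                          // first fmuladd
 *    res = res * rcp_a + rcp_a                    // second fmuladd
 *        = rcp_a + rcp_a * (1 - a * rcp_a)
 *        = rcp_a * (2 - a * rcp_a)
 */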
2632
2633
2634 LLVMValueRef
2635 lp_build_rcp(struct lp_build_context *bld,
2636 LLVMValueRef a)
2637 {
2638 LLVMBuilderRef builder = bld->gallivm->builder;
2639 const struct lp_type type = bld->type;
2640
2641 assert(lp_check_value(type, a));
2642
2643 if(a == bld->zero)
2644 return bld->undef;
2645 if(a == bld->one)
2646 return bld->one;
2647 if(a == bld->undef)
2648 return bld->undef;
2649
2650 assert(type.floating);
2651
2652 if(LLVMIsConstant(a))
2653 return LLVMConstFDiv(bld->one, a);
2654
2655 /*
2656 * We don't use RCPPS because:
2657 * - it only has 10 bits of precision
2658 * - it doesn't even get the reciprocal of 1.0 exactly
2659 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2660 * - for recent processors the benefit over DIVPS is marginal and case
2661 * dependent
2662 *
2663 * We could still use it on certain processors if benchmarks show that the
2664 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2665 * particular uses that require fewer workarounds.
2666 */
2667
2668 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2669 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2670 const unsigned num_iterations = 0;
2671 LLVMValueRef res;
2672 unsigned i;
2673 const char *intrinsic = NULL;
2674
2675 if (type.length == 4) {
2676 intrinsic = "llvm.x86.sse.rcp.ps";
2677 }
2678 else {
2679 intrinsic = "llvm.x86.avx.rcp.ps.256";
2680 }
2681
2682 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2683
2684 for (i = 0; i < num_iterations; ++i) {
2685 res = lp_build_rcp_refine(bld, a, res);
2686 }
2687
2688 return res;
2689 }
2690
2691 return LLVMBuildFDiv(builder, bld->one, a, "");
2692 }
2693
2694
2695 /**
2696 * Do one Newton-Raphson step to improve rsqrt precision:
2697 *
2698 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2699 *
2700 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2701 */
2702 static inline LLVMValueRef
2703 lp_build_rsqrt_refine(struct lp_build_context *bld,
2704 LLVMValueRef a,
2705 LLVMValueRef rsqrt_a)
2706 {
2707 LLVMBuilderRef builder = bld->gallivm->builder;
2708 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2709 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2710 LLVMValueRef res;
2711
2712 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2713 res = LLVMBuildFMul(builder, a, res, "");
2714 res = LLVMBuildFSub(builder, three, res, "");
2715 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2716 res = LLVMBuildFMul(builder, half, res, "");
2717
2718 return res;
2719 }
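
/*
 * Illustrative derivation: applying Newton-Raphson to f(x) = 1/x^2 - a
 * (whose positive root is 1/sqrt(a)) gives
 *
 *    x_{i+1} = x_i - f(x_i)/f'(x_i)
 *            = x_i + (x_i - a * x_i^3) / 2
 *            = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * which is exactly the sequence of multiplies and the subtract built above.
 */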
2720
2721
2722 /**
2723 * Generate 1/sqrt(a).
2724 * Result is undefined for values < 0, infinity for +0.
2725 */
2726 LLVMValueRef
2727 lp_build_rsqrt(struct lp_build_context *bld,
2728 LLVMValueRef a)
2729 {
2730 const struct lp_type type = bld->type;
2731
2732 assert(lp_check_value(type, a));
2733
2734 assert(type.floating);
2735
2736 /*
2737 * This should be faster but all denormals will end up as infinity.
2738 */
2739 if (0 && lp_build_fast_rsqrt_available(type)) {
2740 const unsigned num_iterations = 1;
2741 LLVMValueRef res;
2742 unsigned i;
2743
2744 /* rsqrt(1.0) != 1.0 here */
2745 res = lp_build_fast_rsqrt(bld, a);
2746
2747 if (num_iterations) {
2748 /*
2749 * Newton-Raphson will result in NaN instead of infinity for zero,
2750 * and NaN instead of zero for infinity.
2751 * Also, need to ensure rsqrt(1.0) == 1.0.
2752 * All numbers smaller than FLT_MIN will result in +infinity
2753 * (rsqrtps treats all denormals as zero).
2754 */
2755 LLVMValueRef cmp;
2756 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2757 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2758
2759 for (i = 0; i < num_iterations; ++i) {
2760 res = lp_build_rsqrt_refine(bld, a, res);
2761 }
2762 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2763 res = lp_build_select(bld, cmp, inf, res);
2764 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2765 res = lp_build_select(bld, cmp, bld->zero, res);
2766 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2767 res = lp_build_select(bld, cmp, bld->one, res);
2768 }
2769
2770 return res;
2771 }
2772
2773 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2774 }
2775
2776 /**
2777 * Report whether a fast (inaccurate) rsqrt instruction is available.
2778 * (The caller may want to avoid calling rsqrt_fast if it's not available;
2779 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2780 * unavailable this would result in sqrt/div/mul, so it is obviously
2781 * much better to just call sqrt, skipping both div and mul).
2782 */
2783 boolean
2784 lp_build_fast_rsqrt_available(struct lp_type type)
2785 {
2786 assert(type.floating);
2787
2788 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2789 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2790 return true;
2791 }
2792 return false;
2793 }
2794
2795
2796 /**
2797 * Generate 1/sqrt(a).
2798 * Result is undefined for values < 0, infinity for +0.
2799 * Precision is limited, only ~10 bits guaranteed
2800 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2801 */
2802 LLVMValueRef
2803 lp_build_fast_rsqrt(struct lp_build_context *bld,
2804 LLVMValueRef a)
2805 {
2806 LLVMBuilderRef builder = bld->gallivm->builder;
2807 const struct lp_type type = bld->type;
2808
2809 assert(lp_check_value(type, a));
2810
2811 if (lp_build_fast_rsqrt_available(type)) {
2812 const char *intrinsic = NULL;
2813
2814 if (type.length == 4) {
2815 intrinsic = "llvm.x86.sse.rsqrt.ps";
2816 }
2817 else {
2818 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2819 }
2820 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2821 }
2822 else {
2823 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2824 }
2825 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2826 }
2827
2828
2829 /**
2830 * Generate sin(a) or cos(a) using polynomial approximation.
2831 * TODO: it might be worth recognizing sin and cos with the same source
2832 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2833 * would be way cheaper than calculating (nearly) everything twice...
2834 * Not sure it's common enough to be worth bothering with; however, the
2835 * scs opcode could also benefit from calculating both.
2836 */
2837 static LLVMValueRef
2838 lp_build_sin_or_cos(struct lp_build_context *bld,
2839 LLVMValueRef a,
2840 boolean cos)
2841 {
2842 struct gallivm_state *gallivm = bld->gallivm;
2843 LLVMBuilderRef b = gallivm->builder;
2844 struct lp_type int_type = lp_int_type(bld->type);
2845
2846 /*
2847 * take the absolute value,
2848 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2849 */
2850
2851 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2852 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2853
2854 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2855 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2856
2857 /*
2858 * scale by 4/Pi
2859 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2860 */
2861
2862 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2863 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2864
2865 /*
2866 * store the integer part of y in mm0
2867 * emm2 = _mm_cvttps_epi32(y);
2868 */
2869
2870 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2871
2872 /*
2873 * j=(j+1) & (~1) (see the cephes sources)
2874 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2875 */
2876
2877 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2878 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2879 /*
2880 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2881 */
2882 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2883 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2884
2885 /*
2886 * y = _mm_cvtepi32_ps(emm2);
2887 */
2888 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2889
2890 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2891 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2892 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2893 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2894
2895 /*
2896 * Argument used for poly selection and sign bit determination
2897 * is different for sin vs. cos.
2898 */
2899 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2900 emm2_and;
2901
2902 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2903 LLVMBuildNot(b, emm2_2, ""), ""),
2904 const_29, "sign_bit") :
2905 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2906 LLVMBuildShl(b, emm2_add,
2907 const_29, ""), ""),
2908 sign_mask, "sign_bit");
2909
2910 /*
2911 * get the polynomial selection mask
2912 * there is one polynomial for 0 <= x <= Pi/4
2913 * and another one for Pi/4 < x <= Pi/2
2914 * Both branches will be computed.
2915 *
2916 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2917 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2918 */
2919
2920 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2921 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2922 int_type, PIPE_FUNC_EQUAL,
2923 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2924
2925 /*
2926 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2927 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2928 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2929 */
2930 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2931 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2932 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2933
2934 /*
2935 * The magic pass: "Extended precision modular arithmetic"
2936 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2937 */
2938 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2939 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2940 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2941
2942 /*
2943 * Evaluate the first polynomial (0 <= x <= Pi/4)
2944 *
2945 * z = _mm_mul_ps(x,x);
2946 */
2947 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2948
2949 /*
2950 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2951 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2952 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2953 */
2954 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2955 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2956 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2957
2958 /*
2959 * y = *(v4sf*)_ps_coscof_p0;
2960 * y = _mm_mul_ps(y, z);
2961 */
2962 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2963 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2964 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2965 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2966
2967
2968 /*
2969 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2970 * y = _mm_sub_ps(y, tmp);
2971 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2972 */
2973 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2974 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2975 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2976 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2977 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2978
2979 /*
2980 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2981 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2982 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2983 */
2984 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2985 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2986 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2987
2988 /*
2989 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2990 *
2991 * y2 = *(v4sf*)_ps_sincof_p0;
2992 * y2 = _mm_mul_ps(y2, z);
2993 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2994 * y2 = _mm_mul_ps(y2, z);
2995 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2996 * y2 = _mm_mul_ps(y2, z);
2997 * y2 = _mm_mul_ps(y2, x);
2998 * y2 = _mm_add_ps(y2, x);
2999 */
3000
3001 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3002 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3003 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3004 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3005
3006 /*
3007 * select the correct result from the two polynomials
3008 * xmm3 = poly_mask;
3009 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3010 * y = _mm_andnot_ps(xmm3, y);
3011 * y = _mm_or_ps(y,y2);
3012 */
3013 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3014 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3015 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3016 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3017 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3018 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3019
3020 /*
3021 * update the sign
3022 * y = _mm_xor_ps(y, sign_bit);
3023 */
3024 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3025 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3026
3027 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3028
3029 /* clamp output to be within [-1, 1] */
3030 y_result = lp_build_clamp(bld, y_result,
3031 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3032 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3033 /* If a is -inf, inf or NaN then return NaN */
3034 y_result = lp_build_select(bld, isfinite, y_result,
3035 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3036 return y_result;
3037 }
3038
3039
3040 /**
3041 * Generate sin(a)
3042 */
3043 LLVMValueRef
3044 lp_build_sin(struct lp_build_context *bld,
3045 LLVMValueRef a)
3046 {
3047 return lp_build_sin_or_cos(bld, a, FALSE);
3048 }
3049
3050
3051 /**
3052 * Generate cos(a)
3053 */
3054 LLVMValueRef
3055 lp_build_cos(struct lp_build_context *bld,
3056 LLVMValueRef a)
3057 {
3058 return lp_build_sin_or_cos(bld, a, TRUE);
3059 }
3060
3061
3062 /**
3063 * Generate pow(x, y)
3064 */
3065 LLVMValueRef
3066 lp_build_pow(struct lp_build_context *bld,
3067 LLVMValueRef x,
3068 LLVMValueRef y)
3069 {
3070 /* TODO: optimize the constant case */
3071 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3072 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3073 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3074 __FUNCTION__);
3075 }
3076
3077 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3078 }
3079
3080
3081 /**
3082 * Generate exp(x)
3083 */
3084 LLVMValueRef
3085 lp_build_exp(struct lp_build_context *bld,
3086 LLVMValueRef x)
3087 {
3088 /* log2(e) = 1/log(2) */
3089 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3090 1.4426950408889634);
3091
3092 assert(lp_check_value(bld->type, x));
3093
3094 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3095 }
3096
3097
3098 /**
3099 * Generate log(x)
3100 * Behavior is undefined with infs, 0s and nans
3101 */
3102 LLVMValueRef
3103 lp_build_log(struct lp_build_context *bld,
3104 LLVMValueRef x)
3105 {
3106 /* log(2) */
3107 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3108 0.69314718055994529);
3109
3110 assert(lp_check_value(bld->type, x));
3111
3112 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3113 }
3114
3115 /**
3116 * Generate log(x) that handles edge cases (infs, 0s and nans)
3117 */
3118 LLVMValueRef
3119 lp_build_log_safe(struct lp_build_context *bld,
3120 LLVMValueRef x)
3121 {
3122 /* log(2) */
3123 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3124 0.69314718055994529);
3125
3126 assert(lp_check_value(bld->type, x));
3127
3128 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3129 }
3130
3131
3132 /**
3133 * Generate polynomial.
3134 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3135 */
3136 LLVMValueRef
3137 lp_build_polynomial(struct lp_build_context *bld,
3138 LLVMValueRef x,
3139 const double *coeffs,
3140 unsigned num_coeffs)
3141 {
3142 const struct lp_type type = bld->type;
3143 LLVMValueRef even = NULL, odd = NULL;
3144 LLVMValueRef x2;
3145 unsigned i;
3146
3147 assert(lp_check_value(bld->type, x));
3148
3149 /* TODO: optimize the constant case */
3150 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3151 LLVMIsConstant(x)) {
3152 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3153 __FUNCTION__);
3154 }
3155
3156 /*
3157 * Calculate odd and even terms separately to decrease data dependency
3158 * Ex:
3159 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3160 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3161 */
3162 x2 = lp_build_mul(bld, x, x);
3163
3164 for (i = num_coeffs; i--; ) {
3165 LLVMValueRef coeff;
3166
3167 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3168
3169 if (i % 2 == 0) {
3170 if (even)
3171 even = lp_build_mad(bld, x2, even, coeff);
3172 else
3173 even = coeff;
3174 } else {
3175 if (odd)
3176 odd = lp_build_mad(bld, x2, odd, coeff);
3177 else
3178 odd = coeff;
3179 }
3180 }
3181
3182 if (odd)
3183 return lp_build_mad(bld, odd, x, even);
3184 else if (even)
3185 return even;
3186 else
3187 return bld->undef;
3188 }
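
/*
 * Worked example of the even/odd split above (illustrative only), for
 * num_coeffs == 4, i.e. coeffs = {c0, c1, c2, c3}:
 *
 *    even = c0 + x^2 * c2
 *    odd  = c1 + x^2 * c3
 *    res  = odd * x + even
 *         = c0 + c1*x + c2*x^2 + c3*x^3
 *
 * The even and odd chains only depend on x^2 and can be evaluated in
 * parallel; they are joined by the single final multiply-add.
 */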
3189
3190
3191 /**
3192 * Minimax polynomial fit of 2**x, in range [0, 1[
3193 */
3194 const double lp_build_exp2_polynomial[] = {
3195 #if EXP_POLY_DEGREE == 5
3196 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3197 0.693153073200168932794,
3198 0.240153617044375388211,
3199 0.0558263180532956664775,
3200 0.00898934009049466391101,
3201 0.00187757667519147912699
3202 #elif EXP_POLY_DEGREE == 4
3203 1.00000259337069434683,
3204 0.693003834469974940458,
3205 0.24144275689150793076,
3206 0.0520114606103070150235,
3207 0.0135341679161270268764
3208 #elif EXP_POLY_DEGREE == 3
3209 0.999925218562710312959,
3210 0.695833540494823811697,
3211 0.226067155427249155588,
3212 0.0780245226406372992967
3213 #elif EXP_POLY_DEGREE == 2
3214 1.00172476321474503578,
3215 0.657636275736077639316,
3216 0.33718943461968720704
3217 #else
3218 #error
3219 #endif
3220 };
3221
3222
3223 LLVMValueRef
3224 lp_build_exp2(struct lp_build_context *bld,
3225 LLVMValueRef x)
3226 {
3227 LLVMBuilderRef builder = bld->gallivm->builder;
3228 const struct lp_type type = bld->type;
3229 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3230 LLVMValueRef ipart = NULL;
3231 LLVMValueRef fpart = NULL;
3232 LLVMValueRef expipart = NULL;
3233 LLVMValueRef expfpart = NULL;
3234 LLVMValueRef res = NULL;
3235
3236 assert(lp_check_value(bld->type, x));
3237
3238 /* TODO: optimize the constant case */
3239 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3240 LLVMIsConstant(x)) {
3241 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3242 __FUNCTION__);
3243 }
3244
3245 assert(type.floating && type.width == 32);
3246
3247 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3248 * the result is INF and if it's smaller than -126.9 the result is 0 */
3249 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3250 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3251 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3252 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3253
3254 /* ipart = floor(x) */
3255 /* fpart = x - ipart */
3256 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3257
3258 /* expipart = (float) (1 << ipart) */
3259 expipart = LLVMBuildAdd(builder, ipart,
3260 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3261 expipart = LLVMBuildShl(builder, expipart,
3262 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3263 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3264
3265 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3266 ARRAY_SIZE(lp_build_exp2_polynomial));
3267
3268 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3269
3270 return res;
3271 }
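
/*
 * Scalar sketch of the decomposition used above (illustrative only; "poly"
 * stands for lp_build_polynomial evaluated on lp_build_exp2_polynomial):
 *
 *    i = floor(x);  f = x - i;                 // 2^x = 2^i * 2^f, f in [0, 1)
 *    two_i = bitcast_float((i + 127) << 23);   // exponent bits give 2^i directly
 *    2^x  ~= two_i * poly(f);                  // poly(f) ~= 2^f on [0, 1)
 */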
3272
3273
3274
3275 /**
3276 * Extract the exponent of an IEEE-754 floating point value.
3277 *
3278 * Optionally apply an integer bias.
3279 *
3280 * Result is an integer value with
3281 *
3282 * ifloor(log2(x)) + bias
3283 */
3284 LLVMValueRef
3285 lp_build_extract_exponent(struct lp_build_context *bld,
3286 LLVMValueRef x,
3287 int bias)
3288 {
3289 LLVMBuilderRef builder = bld->gallivm->builder;
3290 const struct lp_type type = bld->type;
3291 unsigned mantissa = lp_mantissa(type);
3292 LLVMValueRef res;
3293
3294 assert(type.floating);
3295
3296 assert(lp_check_value(bld->type, x));
3297
3298 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3299
3300 res = LLVMBuildLShr(builder, x,
3301 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3302 res = LLVMBuildAnd(builder, res,
3303 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3304 res = LLVMBuildSub(builder, res,
3305 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3306
3307 return res;
3308 }
3309
3310
3311 /**
3312 * Extract the mantissa of an IEEE-754 floating point value.
3313 *
3314 * Result is a floating point value with
3315 *
3316 * x / 2**floor(log2(x))
3317 */
3318 LLVMValueRef
3319 lp_build_extract_mantissa(struct lp_build_context *bld,
3320 LLVMValueRef x)
3321 {
3322 LLVMBuilderRef builder = bld->gallivm->builder;
3323 const struct lp_type type = bld->type;
3324 unsigned mantissa = lp_mantissa(type);
3325 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3326 (1ULL << mantissa) - 1);
3327 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3328 LLVMValueRef res;
3329
3330 assert(lp_check_value(bld->type, x));
3331
3332 assert(type.floating);
3333
3334 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3335
3336 /* res = x / 2**ipart */
3337 res = LLVMBuildAnd(builder, x, mantmask, "");
3338 res = LLVMBuildOr(builder, res, one, "");
3339 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3340
3341 return res;
3342 }
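
/*
 * Worked example for the two helpers above (illustrative only): for
 * x = 6.0 = 1.5 * 2^2, lp_build_extract_exponent(bld, x, 0) yields 2
 * (biased exponent 129 minus 127) and lp_build_extract_mantissa(bld, x)
 * yields 1.5 (the mantissa bits of x combined with the exponent field of
 * 1.0), so that 6.0 == 1.5 * 2^2 is recovered.
 */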
3343
3344
3345
3346 /**
3347 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3348 * These coefficients can be generated with
3349 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3350 */
3351 const double lp_build_log2_polynomial[] = {
3352 #if LOG_POLY_DEGREE == 5
3353 2.88539008148777786488L,
3354 0.961796878841293367824L,
3355 0.577058946784739859012L,
3356 0.412914355135828735411L,
3357 0.308591899232910175289L,
3358 0.352376952300281371868L,
3359 #elif LOG_POLY_DEGREE == 4
3360 2.88539009343309178325L,
3361 0.961791550404184197881L,
3362 0.577440339438736392009L,
3363 0.403343858251329912514L,
3364 0.406718052498846252698L,
3365 #elif LOG_POLY_DEGREE == 3
3366 2.88538959748872753838L,
3367 0.961932915889597772928L,
3368 0.571118517972136195241L,
3369 0.493997535084709500285L,
3370 #else
3371 #error
3372 #endif
3373 };
3374
3375 /**
3376 * See http://www.devmaster.net/forums/showthread.php?p=43580
3377 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3378 * http://www.nezumi.demon.co.uk/consult/logx.htm
3379 *
3380 * If handle_edge_cases is true the function will perform computations
3381 * to match the required D3D10+ behavior for each of the edge cases.
3382 * That means that if input is:
3383 * - less than zero (to and including -inf) then NaN will be returned
3384 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3385 * - +infinity, then +infinity will be returned
3386 * - NaN, then NaN will be returned
3387 *
3388 * Those checks are fairly expensive so if you don't need them make sure
3389 * handle_edge_cases is false.
3390 */
3391 void
3392 lp_build_log2_approx(struct lp_build_context *bld,
3393 LLVMValueRef x,
3394 LLVMValueRef *p_exp,
3395 LLVMValueRef *p_floor_log2,
3396 LLVMValueRef *p_log2,
3397 boolean handle_edge_cases)
3398 {
3399 LLVMBuilderRef builder = bld->gallivm->builder;
3400 const struct lp_type type = bld->type;
3401 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3402 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3403
3404 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3405 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3406 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3407
3408 LLVMValueRef i = NULL;
3409 LLVMValueRef y = NULL;
3410 LLVMValueRef z = NULL;
3411 LLVMValueRef exp = NULL;
3412 LLVMValueRef mant = NULL;
3413 LLVMValueRef logexp = NULL;
3414 LLVMValueRef p_z = NULL;
3415 LLVMValueRef res = NULL;
3416
3417 assert(lp_check_value(bld->type, x));
3418
3419 if(p_exp || p_floor_log2 || p_log2) {
3420 /* TODO: optimize the constant case */
3421 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3422 LLVMIsConstant(x)) {
3423 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3424 __FUNCTION__);
3425 }
3426
3427 assert(type.floating && type.width == 32);
3428
3429 /*
3430 * We don't explicitly handle denormalized numbers. They will yield a
3431 * result in the neighbourhood of -127, which appears to be
3432 * adequate.
3433 */
3434
3435 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3436
3437 /* exp = (float) exponent(x) */
3438 exp = LLVMBuildAnd(builder, i, expmask, "");
3439 }
3440
3441 if(p_floor_log2 || p_log2) {
3442 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3443 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3444 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3445 }
3446
3447 if (p_log2) {
3448 /* mant = 1 + (float) mantissa(x) */
3449 mant = LLVMBuildAnd(builder, i, mantmask, "");
3450 mant = LLVMBuildOr(builder, mant, one, "");
3451 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3452
3453 /* y = (mant - 1) / (mant + 1) */
3454 y = lp_build_div(bld,
3455 lp_build_sub(bld, mant, bld->one),
3456 lp_build_add(bld, mant, bld->one)
3457 );
3458
3459 /* z = y^2 */
3460 z = lp_build_mul(bld, y, y);
3461
3462 /* compute P(z) */
3463 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3464 ARRAY_SIZE(lp_build_log2_polynomial));
3465
3466 /* y * P(z) + logexp */
3467 res = lp_build_mad(bld, y, p_z, logexp);
3468
3469 if (type.floating && handle_edge_cases) {
3470 LLVMValueRef negmask, infmask, zmask;
3471 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3472 lp_build_const_vec(bld->gallivm, type, 0.0f));
3473 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3474 lp_build_const_vec(bld->gallivm, type, 0.0f));
3475 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3476 lp_build_const_vec(bld->gallivm, type, INFINITY));
3477
3478 /* If x is equal to inf, make sure we return inf */
3479 res = lp_build_select(bld, infmask,
3480 lp_build_const_vec(bld->gallivm, type, INFINITY),
3481 res);
3482 /* If x is equal to 0, return -inf */
3483 res = lp_build_select(bld, zmask,
3484 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3485 res);
3486 /* If x is nan or less than 0, return nan */
3487 res = lp_build_select(bld, negmask,
3488 lp_build_const_vec(bld->gallivm, type, NAN),
3489 res);
3490 }
3491 }
3492
3493 if (p_exp) {
3494 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3495 *p_exp = exp;
3496 }
3497
3498 if (p_floor_log2)
3499 *p_floor_log2 = logexp;
3500
3501 if (p_log2)
3502 *p_log2 = res;
3503 }
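
/*
 * Scalar sketch of the approximation above (illustrative only; P() is the
 * minimax polynomial lp_build_log2_polynomial):
 *
 *    x = 2^e * m,   m in [1, 2)        // e, m taken from the float bit fields
 *    y = (m - 1) / (m + 1)             // so y^2 < 1/9, the fit's input range
 *    log2(m) ~= y * P(y^2)
 *    log2(x) ~= y * P(y^2) + e
 */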
3504
3505
3506 /*
3507 * log2 implementation which doesn't have special code to
3508 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3509 * the results for those cases are undefined.
3510 */
3511 LLVMValueRef
3512 lp_build_log2(struct lp_build_context *bld,
3513 LLVMValueRef x)
3514 {
3515 LLVMValueRef res;
3516 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3517 return res;
3518 }
3519
3520 /*
3521 * Version of log2 which handles all edge cases.
3522 * Look at documentation of lp_build_log2_approx for
3523 * description of the behavior for each of the edge cases.
3524 */
3525 LLVMValueRef
3526 lp_build_log2_safe(struct lp_build_context *bld,
3527 LLVMValueRef x)
3528 {
3529 LLVMValueRef res;
3530 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3531 return res;
3532 }
3533
3534
3535 /**
3536 * Faster (and less accurate) log2.
3537 *
3538 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3539 *
3540 * Piece-wise linear approximation, with exact results when x is a
3541 * power of two.
3542 *
3543 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3544 */
3545 LLVMValueRef
3546 lp_build_fast_log2(struct lp_build_context *bld,
3547 LLVMValueRef x)
3548 {
3549 LLVMBuilderRef builder = bld->gallivm->builder;
3550 LLVMValueRef ipart;
3551 LLVMValueRef fpart;
3552
3553 assert(lp_check_value(bld->type, x));
3554
3555 assert(bld->type.floating);
3556
3557 /* ipart = floor(log2(x)) - 1 */
3558 ipart = lp_build_extract_exponent(bld, x, -1);
3559 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3560
3561 /* fpart = x / 2**ipart */
3562 fpart = lp_build_extract_mantissa(bld, x);
3563
3564 /* ipart + fpart */
3565 return LLVMBuildFAdd(builder, ipart, fpart, "");
3566 }
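
/*
 * Worked example (illustrative only): for x = 8.0 this yields
 * (3 - 1) + 8/2^3 = 3.0 exactly, while for x = 6.0 it yields
 * (2 - 1) + 6/2^2 = 2.5 against the true log2(6) ~= 2.585; the
 * approximation is exact at powers of two and within roughly 0.09
 * everywhere else.
 */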
3567
3568
3569 /**
3570 * Fast implementation of iround(log2(x)).
3571 *
3572 * Not an approximation -- it should give accurate results all the time.
3573 */
3574 LLVMValueRef
3575 lp_build_ilog2(struct lp_build_context *bld,
3576 LLVMValueRef x)
3577 {
3578 LLVMBuilderRef builder = bld->gallivm->builder;
3579 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3580 LLVMValueRef ipart;
3581
3582 assert(bld->type.floating);
3583
3584 assert(lp_check_value(bld->type, x));
3585
3586 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3587 x = LLVMBuildFMul(builder, x, sqrt2, "");
3588
3589 /* ipart = floor(log2(x) + 0.5) */
3590 ipart = lp_build_extract_exponent(bld, x, 0);
3591
3592 return ipart;
3593 }
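
/*
 * Worked example (illustrative only): for x = 5.9, log2(x) ~= 2.56 should
 * round to 3; after multiplying by sqrt(2) the value is ~8.34, whose
 * exponent is 3.  For x = 5.0 (log2 ~= 2.32) the product is ~7.07, whose
 * exponent is 2.  The sqrt(2) factor thus turns the floor done by
 * lp_build_extract_exponent into a round-to-nearest of log2(x).
 */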
3594
3595 LLVMValueRef
3596 lp_build_mod(struct lp_build_context *bld,
3597 LLVMValueRef x,
3598 LLVMValueRef y)
3599 {
3600 LLVMBuilderRef builder = bld->gallivm->builder;
3601 LLVMValueRef res;
3602 const struct lp_type type = bld->type;
3603
3604 assert(lp_check_value(type, x));
3605 assert(lp_check_value(type, y));
3606
3607 if (type.floating)
3608 res = LLVMBuildFRem(builder, x, y, "");
3609 else if (type.sign)
3610 res = LLVMBuildSRem(builder, x, y, "");
3611 else
3612 res = LLVMBuildURem(builder, x, y, "");
3613 return res;
3614 }
3615
3616
3617 /*
3618 * For floating inputs it creates and returns a mask
3619 * which is all 1's for channels which are NaN.
3620 * Channels inside x which are not NaN will be 0.
3621 */
3622 LLVMValueRef
3623 lp_build_isnan(struct lp_build_context *bld,
3624 LLVMValueRef x)
3625 {
3626 LLVMValueRef mask;
3627 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3628
3629 assert(bld->type.floating);
3630 assert(lp_check_value(bld->type, x));
3631
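   /* IEEE-754 guarantees NaN != NaN, so an ordered equality compare of x
    * with itself is false only in NaN channels; negating and sign-extending
    * the i1 results yields a full-width all-1's/all-0's mask.
    */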
3632 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3633 "isnotnan");
3634 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3635 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3636 return mask;
3637 }
3638
3639 /* Returns all 1's for channels holding finite floating point
3640  * numbers and all 0's for channels holding -inf, +inf or NaN.
3641  */
3642 LLVMValueRef
3643 lp_build_isfinite(struct lp_build_context *bld,
3644 LLVMValueRef x)
3645 {
3646 LLVMBuilderRef builder = bld->gallivm->builder;
3647 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3648 struct lp_type int_type = lp_int_type(bld->type);
3649 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3650 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3651 0x7f800000);
3652
3653 if (!bld->type.floating) {
3654 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3655 }
3656 assert(bld->type.floating);
3657 assert(lp_check_value(bld->type, x));
3658 assert(bld->type.width == 32);
3659
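   /* In IEEE-754 single precision, +/-inf and NaN are exactly the values
    * whose exponent field is all ones, so a value is finite iff its bits
    * ANDed with 0x7f800000 differ from 0x7f800000.
    */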
3660 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3661 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3662 intx, infornan32);
3663 }
3664
3665 /*
3666  * Returns an all-1's mask for channels that are NaN or +/-inf and
3667  * an all-0's mask otherwise.  The input has to be a floating point vector.
3668 */
3669 LLVMValueRef
3670 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3671 const struct lp_type type,
3672 LLVMValueRef x)
3673 {
3674 LLVMBuilderRef builder = gallivm->builder;
3675 struct lp_type int_type = lp_int_type(type);
3676 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3677 0x7f800000);
3678 LLVMValueRef ret;
3679
3680 assert(type.floating);
3681
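   /* Complementary test to lp_build_isfinite(): the exponent field is all
    * ones exactly for +/-inf and NaN, so compare the masked exponent bits
    * for equality with 0x7f800000.
    */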
3682 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3683 ret = LLVMBuildAnd(builder, ret, const0, "");
3684 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3685 ret, const0);
3686
3687 return ret;
3688 }
3689
3690
3691 LLVMValueRef
3692 lp_build_fpstate_get(struct gallivm_state *gallivm)
3693 {
3694 if (util_cpu_caps.has_sse) {
3695 LLVMBuilderRef builder = gallivm->builder;
3696 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3697 gallivm,
3698 LLVMInt32TypeInContext(gallivm->context),
3699 "mxcsr_ptr");
3700 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3701 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3702 lp_build_intrinsic(builder,
3703 "llvm.x86.sse.stmxcsr",
3704 LLVMVoidTypeInContext(gallivm->context),
3705 &mxcsr_ptr8, 1, 0);
3706 return mxcsr_ptr;
3707 }
3708 return 0;
3709 }
3710
3711 void
3712 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3713 boolean zero)
3714 {
3715 if (util_cpu_caps.has_sse) {
3716 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3717 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3718
3719 LLVMBuilderRef builder = gallivm->builder;
3720 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3721 LLVMValueRef mxcsr =
3722 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3723
3724 if (util_cpu_caps.has_daz) {
3725          /* Enable denormals-are-zero (DAZ) mode */
3726 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3727 }
3728 if (zero) {
3729 mxcsr = LLVMBuildOr(builder, mxcsr,
3730 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3731 } else {
3732 mxcsr = LLVMBuildAnd(builder, mxcsr,
3733 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3734 }
3735
3736 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3737 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3738 }
3739 }
3740
3741 void
3742 lp_build_fpstate_set(struct gallivm_state *gallivm,
3743 LLVMValueRef mxcsr_ptr)
3744 {
3745 if (util_cpu_caps.has_sse) {
3746 LLVMBuilderRef builder = gallivm->builder;
3747 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3748 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3749 lp_build_intrinsic(builder,
3750 "llvm.x86.sse.ldmxcsr",
3751 LLVMVoidTypeInContext(gallivm->context),
3752 &mxcsr_ptr, 1, 0);
3753 }
3754 }
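/*
 * Illustrative usage of the fpstate helpers above (a sketch, not taken from
 * a specific caller): save the current MXCSR, force flush-to-zero /
 * denormals-are-zero around generated code that is sensitive to denormal
 * handling, then restore the original state:
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit the denormal-sensitive code ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 */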