[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
   31  * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
   35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
   38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
   87  * No checks for the special-case values of a or b (0 or 1) are done.
   88  * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_cpu_caps.has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_cpu_caps.has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_cpu_caps.has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_cpu_caps.has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
139 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 __FUNCTION__);
142 }
143 if (type.width == 32 && type.length == 4) {
144 intrinsic = "llvm.ppc.altivec.vminfp";
145 intr_size = 128;
146 }
147 } else if (util_cpu_caps.has_altivec) {
148 intr_size = 128;
149 if (type.width == 8) {
150 if (!type.sign) {
151 intrinsic = "llvm.ppc.altivec.vminub";
152 } else {
153 intrinsic = "llvm.ppc.altivec.vminsb";
154 }
155 } else if (type.width == 16) {
156 if (!type.sign) {
157 intrinsic = "llvm.ppc.altivec.vminuh";
158 } else {
159 intrinsic = "llvm.ppc.altivec.vminsh";
160 }
161 } else if (type.width == 32) {
162 if (!type.sign) {
163 intrinsic = "llvm.ppc.altivec.vminuw";
164 } else {
165 intrinsic = "llvm.ppc.altivec.vminsw";
166 }
167 }
168 }
169
170 if (intrinsic) {
  171       /* We need to handle NaNs for floating point numbers. If one of the
  172        * inputs is NaN the other should be returned (required by both D3D10+
  173        * and OpenCL).
  174        * The SSE intrinsics return the second operand if either input is NaN by
  175        * default, so we need special code to handle those cases.
176 */
177 if (util_cpu_caps.has_sse && type.floating &&
178 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
179 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
180 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
181 LLVMValueRef isnan, min;
182 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
183 type,
184 intr_size, a, b);
185 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
186 isnan = lp_build_isnan(bld, b);
187 return lp_build_select(bld, isnan, a, min);
188 } else {
189 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
190 isnan = lp_build_isnan(bld, a);
191 return lp_build_select(bld, isnan, a, min);
192 }
193 } else {
194 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
195 type,
196 intr_size, a, b);
197 }
198 }
199
200 if (type.floating) {
201 switch (nan_behavior) {
202 case GALLIVM_NAN_RETURN_NAN: {
203 LLVMValueRef isnan = lp_build_isnan(bld, b);
204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
205 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
206 return lp_build_select(bld, cond, a, b);
207 }
208 break;
209 case GALLIVM_NAN_RETURN_OTHER: {
210 LLVMValueRef isnan = lp_build_isnan(bld, a);
211 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
212 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
213 return lp_build_select(bld, cond, a, b);
214 }
215 break;
216 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
217 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
218 return lp_build_select(bld, cond, a, b);
219 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
220 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
221 return lp_build_select(bld, cond, b, a);
222 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
223 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
224 return lp_build_select(bld, cond, a, b);
225 break;
226 default:
227 assert(0);
228 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
229 return lp_build_select(bld, cond, a, b);
230 }
231 } else {
232 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
233 return lp_build_select(bld, cond, a, b);
234 }
235 }
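/*
 * Illustrative scalar sketch (not built; assumes <math.h> isnan and the
 * hypothetical ref_* names) of the NaN handling above: SSE min returns the
 * second operand whenever either input is NaN, so a select on isnan() is
 * layered on top to implement GALLIVM_NAN_RETURN_OTHER respectively
 * GALLIVM_NAN_RETURN_NAN.
 */
#if 0
static float
ref_min_return_other(float a, float b)
{
   float m = (a < b) ? a : b;   /* like minps: yields b if a or b is NaN */
   return isnan(b) ? a : m;     /* if b is NaN, return the other input */
}

static float
ref_min_return_nan(float a, float b)
{
   float m = (a < b) ? a : b;
   return isnan(a) ? a : m;     /* if a is NaN, propagate the NaN */
}
#endif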
236
237
238 LLVMValueRef
239 lp_build_fmuladd(LLVMBuilderRef builder,
240 LLVMValueRef a,
241 LLVMValueRef b,
242 LLVMValueRef c)
243 {
244 LLVMTypeRef type = LLVMTypeOf(a);
245 assert(type == LLVMTypeOf(b));
246 assert(type == LLVMTypeOf(c));
247
248 char intrinsic[32];
249 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
250 LLVMValueRef args[] = { a, b, c };
251 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
252 }
253
254
255 /**
256 * Generate max(a, b)
  257  * No checks for the special-case values of a or b (0 or 1) are done.
  258  * NaNs are handled according to the behavior specified by the
259 * nan_behavior argument.
260 */
261 static LLVMValueRef
262 lp_build_max_simple(struct lp_build_context *bld,
263 LLVMValueRef a,
264 LLVMValueRef b,
265 enum gallivm_nan_behavior nan_behavior)
266 {
267 const struct lp_type type = bld->type;
268 const char *intrinsic = NULL;
269 unsigned intr_size = 0;
270 LLVMValueRef cond;
271
272 assert(lp_check_value(type, a));
273 assert(lp_check_value(type, b));
274
275 /* TODO: optimize the constant case */
276
277 if (type.floating && util_cpu_caps.has_sse) {
278 if (type.width == 32) {
279 if (type.length == 1) {
280 intrinsic = "llvm.x86.sse.max.ss";
281 intr_size = 128;
282 }
283 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
284 intrinsic = "llvm.x86.sse.max.ps";
285 intr_size = 128;
286 }
287 else {
288 intrinsic = "llvm.x86.avx.max.ps.256";
289 intr_size = 256;
290 }
291 }
292 if (type.width == 64 && util_cpu_caps.has_sse2) {
293 if (type.length == 1) {
294 intrinsic = "llvm.x86.sse2.max.sd";
295 intr_size = 128;
296 }
297 else if (type.length == 2 || !util_cpu_caps.has_avx) {
298 intrinsic = "llvm.x86.sse2.max.pd";
299 intr_size = 128;
300 }
301 else {
302 intrinsic = "llvm.x86.avx.max.pd.256";
303 intr_size = 256;
304 }
305 }
306 }
307 else if (type.floating && util_cpu_caps.has_altivec) {
308 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
309 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
310 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
311 __FUNCTION__);
312 }
  313       if (type.width == 32 && type.length == 4) {
314 intrinsic = "llvm.ppc.altivec.vmaxfp";
315 intr_size = 128;
316 }
317 } else if (util_cpu_caps.has_altivec) {
318 intr_size = 128;
319 if (type.width == 8) {
320 if (!type.sign) {
321 intrinsic = "llvm.ppc.altivec.vmaxub";
322 } else {
323 intrinsic = "llvm.ppc.altivec.vmaxsb";
324 }
325 } else if (type.width == 16) {
326 if (!type.sign) {
327 intrinsic = "llvm.ppc.altivec.vmaxuh";
328 } else {
329 intrinsic = "llvm.ppc.altivec.vmaxsh";
330 }
331 } else if (type.width == 32) {
332 if (!type.sign) {
333 intrinsic = "llvm.ppc.altivec.vmaxuw";
334 } else {
335 intrinsic = "llvm.ppc.altivec.vmaxsw";
336 }
337 }
338 }
339
340 if (intrinsic) {
341 if (util_cpu_caps.has_sse && type.floating &&
342 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
343 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
344 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
345 LLVMValueRef isnan, max;
346 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
347 type,
348 intr_size, a, b);
349 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
350 isnan = lp_build_isnan(bld, b);
351 return lp_build_select(bld, isnan, a, max);
352 } else {
353 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
354 isnan = lp_build_isnan(bld, a);
355 return lp_build_select(bld, isnan, a, max);
356 }
357 } else {
358 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
359 type,
360 intr_size, a, b);
361 }
362 }
363
364 if (type.floating) {
365 switch (nan_behavior) {
366 case GALLIVM_NAN_RETURN_NAN: {
367 LLVMValueRef isnan = lp_build_isnan(bld, b);
368 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
369 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
370 return lp_build_select(bld, cond, a, b);
371 }
372 break;
373 case GALLIVM_NAN_RETURN_OTHER: {
374 LLVMValueRef isnan = lp_build_isnan(bld, a);
375 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
376 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
377 return lp_build_select(bld, cond, a, b);
378 }
379 break;
380 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
381 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
382 return lp_build_select(bld, cond, a, b);
383 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
384 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
385 return lp_build_select(bld, cond, b, a);
386 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
387 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
388 return lp_build_select(bld, cond, a, b);
389 break;
390 default:
391 assert(0);
392 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
393 return lp_build_select(bld, cond, a, b);
394 }
395 } else {
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 return lp_build_select(bld, cond, a, b);
398 }
399 }
400
401
402 /**
403 * Generate 1 - a, or ~a depending on bld->type.
404 */
405 LLVMValueRef
406 lp_build_comp(struct lp_build_context *bld,
407 LLVMValueRef a)
408 {
409 LLVMBuilderRef builder = bld->gallivm->builder;
410 const struct lp_type type = bld->type;
411
412 assert(lp_check_value(type, a));
413
414 if(a == bld->one)
415 return bld->zero;
416 if(a == bld->zero)
417 return bld->one;
418
419 if(type.norm && !type.floating && !type.fixed && !type.sign) {
420 if(LLVMIsConstant(a))
421 return LLVMConstNot(a);
422 else
423 return LLVMBuildNot(builder, a, "");
424 }
425
426 if(LLVMIsConstant(a))
427 if (type.floating)
428 return LLVMConstFSub(bld->one, a);
429 else
430 return LLVMConstSub(bld->one, a);
431 else
432 if (type.floating)
433 return LLVMBuildFSub(builder, bld->one, a, "");
434 else
435 return LLVMBuildSub(builder, bld->one, a, "");
436 }
437
438
439 /**
440 * Generate a + b
441 */
442 LLVMValueRef
443 lp_build_add(struct lp_build_context *bld,
444 LLVMValueRef a,
445 LLVMValueRef b)
446 {
447 LLVMBuilderRef builder = bld->gallivm->builder;
448 const struct lp_type type = bld->type;
449 LLVMValueRef res;
450
451 assert(lp_check_value(type, a));
452 assert(lp_check_value(type, b));
453
454 if (a == bld->zero)
455 return b;
456 if (b == bld->zero)
457 return a;
458 if (a == bld->undef || b == bld->undef)
459 return bld->undef;
460
461 if (type.norm) {
462 const char *intrinsic = NULL;
463
464 if (!type.sign && (a == bld->one || b == bld->one))
465 return bld->one;
466
467 if (!type.floating && !type.fixed) {
468 if (LLVM_VERSION_MAJOR >= 8) {
469 char intrin[32];
470 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
471 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
472 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
473 }
474 if (type.width * type.length == 128) {
475 if (util_cpu_caps.has_sse2) {
476 if (type.width == 8)
477 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
478 if (type.width == 16)
479 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
480 } else if (util_cpu_caps.has_altivec) {
481 if (type.width == 8)
482 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
483 if (type.width == 16)
484 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
485 }
486 }
487 if (type.width * type.length == 256) {
488 if (util_cpu_caps.has_avx2) {
489 if (type.width == 8)
490 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
491 if (type.width == 16)
492 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
493 }
494 }
495 }
496
497 if (intrinsic)
498 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
499 }
500
501 if(type.norm && !type.floating && !type.fixed) {
502 if (type.sign) {
503 uint64_t sign = (uint64_t)1 << (type.width - 1);
504 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
505 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
506 /* a_clamp_max is the maximum a for positive b,
507 a_clamp_min is the minimum a for negative b. */
508 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
509 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
510 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
511 }
512 }
513
514 if(LLVMIsConstant(a) && LLVMIsConstant(b))
515 if (type.floating)
516 res = LLVMConstFAdd(a, b);
517 else
518 res = LLVMConstAdd(a, b);
519 else
520 if (type.floating)
521 res = LLVMBuildFAdd(builder, a, b, "");
522 else
523 res = LLVMBuildAdd(builder, a, b, "");
524
525 /* clamp to ceiling of 1.0 */
526 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
527 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
528
529 if (type.norm && !type.floating && !type.fixed) {
530 if (!type.sign) {
531 /*
532 * newer llvm versions no longer support the intrinsics, but recognize
533 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
534 * code, it is important we match the pattern llvm uses (and pray llvm
535 * doesn't change it - and hope they decide on the same pattern for
536 * all backends supporting it...).
537 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
538 * interfere with llvm's ability to recognize the pattern but seems
539 * a bit brittle.
540 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
541 */
542 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
543 res = lp_build_select(bld, overflowed,
544 LLVMConstAllOnes(bld->int_vec_type), res);
545 }
546 }
547
548 /* XXX clamp to floor of -1 or 0??? */
549
550 return res;
551 }
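/*
 * Scalar sketch (illustration only, not built; assumes <stdint.h> and the
 * MIN2/MAX2 helpers from u_math.h) of the per-lane saturated add emitted
 * above for 8-bit normalized values.
 */
#if 0
static uint8_t
ref_uadd_sat(uint8_t a, uint8_t b)
{
   uint8_t res = a + b;             /* wrapping add */
   return (a > res) ? 0xff : res;   /* wrapped => clamp to max (the pattern llvm recognizes) */
}

static int8_t
ref_sadd_sat(int8_t a, int8_t b)
{
   /* clamp a so that a + b cannot leave [-128, 127], then add */
   int a_clamped = (b > 0) ? MIN2(a, 127 - b) : MAX2(a, -128 - b);
   return (int8_t)(a_clamped + b);
}
#endif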
552
553
554 /** Return the scalar sum of the elements of a.
  555  * This operation should be avoided whenever possible.
556 */
557 LLVMValueRef
558 lp_build_horizontal_add(struct lp_build_context *bld,
559 LLVMValueRef a)
560 {
561 LLVMBuilderRef builder = bld->gallivm->builder;
562 const struct lp_type type = bld->type;
563 LLVMValueRef index, res;
564 unsigned i, length;
565 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
566 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
567 LLVMValueRef vecres, elem2;
568
569 assert(lp_check_value(type, a));
570
571 if (type.length == 1) {
572 return a;
573 }
574
575 assert(!bld->type.norm);
576
577 /*
  578     * For byte vectors we could do much better with psadbw.
  579     * Using repeated shuffle/adds here instead. Note with multiple vectors
580 * this can be done more efficiently as outlined in the intel
581 * optimization manual.
582 * Note: could cause data rearrangement if used with smaller element
583 * sizes.
584 */
585
586 vecres = a;
587 length = type.length / 2;
588 while (length > 1) {
589 LLVMValueRef vec1, vec2;
590 for (i = 0; i < length; i++) {
591 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
592 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
593 }
594 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
595 LLVMConstVector(shuffles1, length), "");
596 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
597 LLVMConstVector(shuffles2, length), "");
598 if (type.floating) {
599 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
600 }
601 else {
602 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
603 }
604 length = length >> 1;
605 }
606
607 /* always have vector of size 2 here */
608 assert(length == 1);
609
610 index = lp_build_const_int32(bld->gallivm, 0);
611 res = LLVMBuildExtractElement(builder, vecres, index, "");
612 index = lp_build_const_int32(bld->gallivm, 1);
613 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
614
615 if (type.floating)
616 res = LLVMBuildFAdd(builder, res, elem2, "");
617 else
618 res = LLVMBuildAdd(builder, res, elem2, "");
619
620 return res;
621 }
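/*
 * Illustration (not built): the reduction loop above repeatedly adds the
 * upper half of the vector onto the lower half, e.g. for a float4
 * {x0,x1,x2,x3} -> {x0+x2, x1+x3} -> final scalar add of the remaining pair.
 */
#if 0
static float
ref_horizontal_add4(const float v[4])
{
   float s0 = v[0] + v[2];   /* first shuffle/add step */
   float s1 = v[1] + v[3];
   return s0 + s1;           /* extract both elements and add */
}
#endif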
622
623 /**
624 * Return the horizontal sums of 4 float vectors as a float4 vector.
625 * This uses the technique as outlined in Intel Optimization Manual.
626 */
627 static LLVMValueRef
628 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
629 LLVMValueRef src[4])
630 {
631 struct gallivm_state *gallivm = bld->gallivm;
632 LLVMBuilderRef builder = gallivm->builder;
633 LLVMValueRef shuffles[4];
634 LLVMValueRef tmp[4];
635 LLVMValueRef sumtmp[2], shuftmp[2];
636
637 /* lower half of regs */
638 shuffles[0] = lp_build_const_int32(gallivm, 0);
639 shuffles[1] = lp_build_const_int32(gallivm, 1);
640 shuffles[2] = lp_build_const_int32(gallivm, 4);
641 shuffles[3] = lp_build_const_int32(gallivm, 5);
642 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
643 LLVMConstVector(shuffles, 4), "");
644 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
645 LLVMConstVector(shuffles, 4), "");
646
647 /* upper half of regs */
648 shuffles[0] = lp_build_const_int32(gallivm, 2);
649 shuffles[1] = lp_build_const_int32(gallivm, 3);
650 shuffles[2] = lp_build_const_int32(gallivm, 6);
651 shuffles[3] = lp_build_const_int32(gallivm, 7);
652 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
653 LLVMConstVector(shuffles, 4), "");
654 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
655 LLVMConstVector(shuffles, 4), "");
656
657 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
658 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
659
660 shuffles[0] = lp_build_const_int32(gallivm, 0);
661 shuffles[1] = lp_build_const_int32(gallivm, 2);
662 shuffles[2] = lp_build_const_int32(gallivm, 4);
663 shuffles[3] = lp_build_const_int32(gallivm, 6);
664 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
665 LLVMConstVector(shuffles, 4), "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 1);
668 shuffles[1] = lp_build_const_int32(gallivm, 3);
669 shuffles[2] = lp_build_const_int32(gallivm, 5);
670 shuffles[3] = lp_build_const_int32(gallivm, 7);
671 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
675 }
676
677
678 /*
  679  * Partially horizontally add 2-4 float vectors with length nx4,
  680  * i.e. only four adjacent values in each vector will be added,
  681  * assuming the values are really grouped in fours, which also determines
  682  * the output order.
683 *
684 * Return a vector of the same length as the initial vectors,
685 * with the excess elements (if any) being undefined.
686 * The element order is independent of number of input vectors.
687 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
688 * the output order thus will be
  689  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
690 */
691 LLVMValueRef
692 lp_build_hadd_partial4(struct lp_build_context *bld,
693 LLVMValueRef vectors[],
694 unsigned num_vecs)
695 {
696 struct gallivm_state *gallivm = bld->gallivm;
697 LLVMBuilderRef builder = gallivm->builder;
698 LLVMValueRef ret_vec;
699 LLVMValueRef tmp[4];
700 const char *intrinsic = NULL;
701
702 assert(num_vecs >= 2 && num_vecs <= 4);
703 assert(bld->type.floating);
704
705 /* only use this with at least 2 vectors, as it is sort of expensive
706 * (depending on cpu) and we always need two horizontal adds anyway,
707 * so a shuffle/add approach might be better.
708 */
709
710 tmp[0] = vectors[0];
711 tmp[1] = vectors[1];
712
713 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
714 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
715
716 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
717 bld->type.length == 4) {
718 intrinsic = "llvm.x86.sse3.hadd.ps";
719 }
720 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
721 bld->type.length == 8) {
722 intrinsic = "llvm.x86.avx.hadd.ps.256";
723 }
724 if (intrinsic) {
725 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
726 lp_build_vec_type(gallivm, bld->type),
727 tmp[0], tmp[1]);
728 if (num_vecs > 2) {
729 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
730 lp_build_vec_type(gallivm, bld->type),
731 tmp[2], tmp[3]);
732 }
733 else {
734 tmp[1] = tmp[0];
735 }
736 return lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[0], tmp[1]);
739 }
740
741 if (bld->type.length == 4) {
742 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
743 }
744 else {
745 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
746 unsigned j;
747 unsigned num_iter = bld->type.length / 4;
748 struct lp_type parttype = bld->type;
749 parttype.length = 4;
750 for (j = 0; j < num_iter; j++) {
751 LLVMValueRef partsrc[4];
752 unsigned i;
753 for (i = 0; i < 4; i++) {
754 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
755 }
756 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
757 }
758 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
759 }
760 return ret_vec;
761 }
762
763 /**
764 * Generate a - b
765 */
766 LLVMValueRef
767 lp_build_sub(struct lp_build_context *bld,
768 LLVMValueRef a,
769 LLVMValueRef b)
770 {
771 LLVMBuilderRef builder = bld->gallivm->builder;
772 const struct lp_type type = bld->type;
773 LLVMValueRef res;
774
775 assert(lp_check_value(type, a));
776 assert(lp_check_value(type, b));
777
778 if (b == bld->zero)
779 return a;
780 if (a == bld->undef || b == bld->undef)
781 return bld->undef;
782 if (a == b)
783 return bld->zero;
784
785 if (type.norm) {
786 const char *intrinsic = NULL;
787
788 if (!type.sign && b == bld->one)
789 return bld->zero;
790
791 if (!type.floating && !type.fixed) {
792 if (LLVM_VERSION_MAJOR >= 8) {
793 char intrin[32];
794 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
795 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
796 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
797 }
798 if (type.width * type.length == 128) {
799 if (util_cpu_caps.has_sse2) {
800 if (type.width == 8)
801 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
802 if (type.width == 16)
803 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
804 } else if (util_cpu_caps.has_altivec) {
805 if (type.width == 8)
806 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
807 if (type.width == 16)
808 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
809 }
810 }
811 if (type.width * type.length == 256) {
812 if (util_cpu_caps.has_avx2) {
813 if (type.width == 8)
814 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
815 if (type.width == 16)
816 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
817 }
818 }
819 }
820
821 if (intrinsic)
822 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
823 }
824
825 if(type.norm && !type.floating && !type.fixed) {
826 if (type.sign) {
827 uint64_t sign = (uint64_t)1 << (type.width - 1);
828 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
829 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
830 /* a_clamp_max is the maximum a for negative b,
831 a_clamp_min is the minimum a for positive b. */
832 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
833 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
834 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
835 } else {
836 /*
837 * This must match llvm pattern for saturated unsigned sub.
838 * (lp_build_max_simple actually does the job with its current
839 * definition but do it explicitly here.)
840 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
841 * interfere with llvm's ability to recognize the pattern but seems
842 * a bit brittle.
843 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
844 */
845 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
846 a = lp_build_select(bld, no_ov, a, b);
847 }
848 }
849
850 if(LLVMIsConstant(a) && LLVMIsConstant(b))
851 if (type.floating)
852 res = LLVMConstFSub(a, b);
853 else
854 res = LLVMConstSub(a, b);
855 else
856 if (type.floating)
857 res = LLVMBuildFSub(builder, a, b, "");
858 else
859 res = LLVMBuildSub(builder, a, b, "");
860
861 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
862 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
863
864 return res;
865 }
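/*
 * Scalar sketch (illustration only, not built) of the unsigned saturated
 * subtract pattern above: selecting max(a, b) before the subtraction is the
 * same as clamping the result at zero.
 */
#if 0
static unsigned char
ref_usub_sat(unsigned char a, unsigned char b)
{
   return (a > b) ? (unsigned char)(a - b) : 0;   /* max(a, b) - b */
}
#endif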
866
867
868
869 /**
870 * Normalized multiplication.
871 *
872 * There are several approaches for (using 8-bit normalized multiplication as
873 * an example):
874 *
875 * - alpha plus one
876 *
877 * makes the following approximation to the division (Sree)
878 *
  879  *     a*b/255 ~= (a*(b + 1)) >> 8
880 *
  881  * which is the fastest method that satisfies the following OpenGL criteria:
882 *
883 * 0*0 = 0 and 255*255 = 255
884 *
885 * - geometric series
886 *
887 * takes the geometric series approximation to the division
888 *
889 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
890 *
  891  * in this case using just the first two terms so it fits in 16-bit arithmetic
892 *
893 * t/255 ~= (t + (t >> 8)) >> 8
894 *
  895  * note that just by itself it doesn't satisfy the OpenGL criteria, as
  896  * 255*255 = 254, so the special case b = 255 must be accounted for, or rounding
897 * must be used.
898 *
899 * - geometric series plus rounding
900 *
  901  * when using the geometric series division, instead of truncating the result,
  902  * use rounding in the approximation (Jim Blinn)
903 *
904 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
905 *
  906  * achieving exact results.
907 *
908 *
909 *
910 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
911 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
912 * @sa Michael Herf, The "double blend trick", May 2000,
913 * http://www.stereopsis.com/doubleblend.html
914 */
915 LLVMValueRef
916 lp_build_mul_norm(struct gallivm_state *gallivm,
917 struct lp_type wide_type,
918 LLVMValueRef a, LLVMValueRef b)
919 {
920 LLVMBuilderRef builder = gallivm->builder;
921 struct lp_build_context bld;
922 unsigned n;
923 LLVMValueRef half;
924 LLVMValueRef ab;
925
926 assert(!wide_type.floating);
927 assert(lp_check_value(wide_type, a));
928 assert(lp_check_value(wide_type, b));
929
930 lp_build_context_init(&bld, gallivm, wide_type);
931
932 n = wide_type.width / 2;
933 if (wide_type.sign) {
934 --n;
935 }
936
937 /*
938 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
939 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
940 */
941
942 /*
943 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
944 */
945
946 ab = LLVMBuildMul(builder, a, b, "");
947 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
948
949 /*
950 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
951 */
952
953 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
954 if (wide_type.sign) {
955 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
956 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
957 half = lp_build_select(&bld, sign, minus_half, half);
958 }
959 ab = LLVMBuildAdd(builder, ab, half, "");
960
961 /* Final division */
962 ab = lp_build_shr_imm(&bld, ab, n);
963
964 return ab;
965 }
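/*
 * Scalar sketch (illustration only, not built; assumes <stdint.h>) of the
 * rounded geometric-series approximation above, instantiated for unsigned
 * 8-bit values (n = 8), i.e. a*b/255 ~= (a*b + (a*b >> 8) + 0x80) >> 8.
 * It satisfies 0*0 = 0 and 255*255 = 255.
 */
#if 0
static uint8_t
ref_mul_norm_u8(uint8_t a, uint8_t b)
{
   uint32_t ab = (uint32_t)a * b;   /* widened product, fits in 16 bits */
   ab += ab >> 8;                   /* first two terms of the t/255 series */
   ab += 0x80;                      /* half = 1 << (n - 1), for rounding */
   return (uint8_t)(ab >> 8);       /* final division by 2**n */
}
#endif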
966
967 /**
968 * Generate a * b
969 */
970 LLVMValueRef
971 lp_build_mul(struct lp_build_context *bld,
972 LLVMValueRef a,
973 LLVMValueRef b)
974 {
975 LLVMBuilderRef builder = bld->gallivm->builder;
976 const struct lp_type type = bld->type;
977 LLVMValueRef shift;
978 LLVMValueRef res;
979
980 assert(lp_check_value(type, a));
981 assert(lp_check_value(type, b));
982
983 if(a == bld->zero)
984 return bld->zero;
985 if(a == bld->one)
986 return b;
987 if(b == bld->zero)
988 return bld->zero;
989 if(b == bld->one)
990 return a;
991 if(a == bld->undef || b == bld->undef)
992 return bld->undef;
993
994 if (!type.floating && !type.fixed && type.norm) {
995 struct lp_type wide_type = lp_wider_type(type);
996 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
997
998 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
999 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1000
1001 /* PMULLW, PSRLW, PADDW */
1002 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1003 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1004
1005 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1006
1007 return ab;
1008 }
1009
1010 if(type.fixed)
1011 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1012 else
1013 shift = NULL;
1014
1015 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1016 if (type.floating)
1017 res = LLVMConstFMul(a, b);
1018 else
1019 res = LLVMConstMul(a, b);
1020 if(shift) {
1021 if(type.sign)
1022 res = LLVMConstAShr(res, shift);
1023 else
1024 res = LLVMConstLShr(res, shift);
1025 }
1026 }
1027 else {
1028 if (type.floating)
1029 res = LLVMBuildFMul(builder, a, b, "");
1030 else
1031 res = LLVMBuildMul(builder, a, b, "");
1032 if(shift) {
1033 if(type.sign)
1034 res = LLVMBuildAShr(builder, res, shift, "");
1035 else
1036 res = LLVMBuildLShr(builder, res, shift, "");
1037 }
1038 }
1039
1040 return res;
1041 }
1042
1043 /*
1044 * Widening mul, valid for 32x32 bit -> 64bit only.
1045 * Result is low 32bits, high bits returned in res_hi.
1046 *
1047 * Emits code that is meant to be compiled for the host CPU.
1048 */
1049 LLVMValueRef
1050 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1051 LLVMValueRef a,
1052 LLVMValueRef b,
1053 LLVMValueRef *res_hi)
1054 {
1055 struct gallivm_state *gallivm = bld->gallivm;
1056 LLVMBuilderRef builder = gallivm->builder;
1057
1058 assert(bld->type.width == 32);
1059 assert(bld->type.floating == 0);
1060 assert(bld->type.fixed == 0);
1061 assert(bld->type.norm == 0);
1062
1063 /*
1064 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1065 * for x86 simd is atrocious (even if the high bits weren't required),
1066 * trying to handle real 64bit inputs (which of course can't happen due
1067 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1068 * apparently llvm does not recognize this widening mul). This includes 6
1069 * (instead of 2) pmuludq plus extra adds and shifts
1070 * The same story applies to signed mul, albeit fixing this requires sse41.
1071 * https://llvm.org/bugs/show_bug.cgi?id=30845
1072 * So, whip up our own code, albeit only for length 4 and 8 (which
1073 * should be good enough)...
1074 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1075 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1076 * for signed), which the fallback code does not, without this llvm
1077 * will likely still produce atrocious code.
1078 */
1079 if (LLVM_VERSION_MAJOR < 7 &&
1080 (bld->type.length == 4 || bld->type.length == 8) &&
1081 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1082 util_cpu_caps.has_sse4_1)) {
1083 const char *intrinsic = NULL;
1084 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1085 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1086 struct lp_type type_wide = lp_wider_type(bld->type);
1087 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1088 unsigned i;
1089 for (i = 0; i < bld->type.length; i += 2) {
1090 shuf[i] = lp_build_const_int32(gallivm, i+1);
1091 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1092 }
1093 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1094 aeven = a;
1095 beven = b;
1096 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1097 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1098
1099 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1100 if (bld->type.sign) {
1101 intrinsic = "llvm.x86.avx2.pmul.dq";
1102 } else {
1103 intrinsic = "llvm.x86.avx2.pmulu.dq";
1104 }
1105 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1106 wider_type, aeven, beven);
1107 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1108 wider_type, aodd, bodd);
1109 }
1110 else {
1111 /* for consistent naming look elsewhere... */
1112 if (bld->type.sign) {
1113 intrinsic = "llvm.x86.sse41.pmuldq";
1114 } else {
1115 intrinsic = "llvm.x86.sse2.pmulu.dq";
1116 }
1117 /*
1118 * XXX If we only have AVX but not AVX2 this is a pain.
1119 * lp_build_intrinsic_binary_anylength() can't handle it
1120 * (due to src and dst type not being identical).
1121 */
1122 if (bld->type.length == 8) {
1123 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1124 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1125 LLVMValueRef muleven2[2], mulodd2[2];
1126 struct lp_type type_wide_half = type_wide;
1127 LLVMTypeRef wtype_half;
1128 type_wide_half.length = 2;
1129 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1130 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1131 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1132 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1133 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1134 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1135 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1136 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1137 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1138 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1139 wtype_half, aevenlo, bevenlo);
1140 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1141 wtype_half, aoddlo, boddlo);
1142 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1143 wtype_half, aevenhi, bevenhi);
1144 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1145 wtype_half, aoddhi, boddhi);
1146 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1147 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1148
1149 }
1150 else {
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 }
1157 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1158 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1159
1160 for (i = 0; i < bld->type.length; i += 2) {
1161 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1162 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1163 }
1164 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1165 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1166
1167 for (i = 0; i < bld->type.length; i += 2) {
1168 shuf[i] = lp_build_const_int32(gallivm, i);
1169 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1170 }
1171 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1172 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1173 }
1174 else {
1175 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1176 }
1177 }
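/*
 * Lane-arrangement illustration (not built) for the pmuludq/pmuldq path
 * above, with 4-wide inputs a = {a0,a1,a2,a3}, b = {b0,b1,b2,b3}:
 *    muleven = {a0*b0, a2*b2}   as 2 x 64 bit
 *    mulodd  = {a1*b1, a3*b3}   as 2 x 64 bit (odd lanes shuffled down first)
 * After the bitcast back to 4 x 32 bit, each 64-bit product appears as a
 * {lo, hi} pair, and the final shuffles re-interleave them in lane order:
 *    res_lo = {lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3)}
 *    res_hi = {hi(a0*b0), hi(a1*b1), hi(a2*b2), hi(a3*b3)}
 */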
1178
1179
1180 /*
1181 * Widening mul, valid for 32x32 bit -> 64bit only.
1182 * Result is low 32bits, high bits returned in res_hi.
1183 *
1184 * Emits generic code.
1185 */
1186 LLVMValueRef
1187 lp_build_mul_32_lohi(struct lp_build_context *bld,
1188 LLVMValueRef a,
1189 LLVMValueRef b,
1190 LLVMValueRef *res_hi)
1191 {
1192 struct gallivm_state *gallivm = bld->gallivm;
1193 LLVMBuilderRef builder = gallivm->builder;
1194 LLVMValueRef tmp, shift, res_lo;
1195 struct lp_type type_tmp;
1196 LLVMTypeRef wide_type, narrow_type;
1197
1198 type_tmp = bld->type;
1199 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1200 type_tmp.width *= 2;
1201 wide_type = lp_build_vec_type(gallivm, type_tmp);
1202 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1203
1204 if (bld->type.sign) {
1205 a = LLVMBuildSExt(builder, a, wide_type, "");
1206 b = LLVMBuildSExt(builder, b, wide_type, "");
1207 } else {
1208 a = LLVMBuildZExt(builder, a, wide_type, "");
1209 b = LLVMBuildZExt(builder, b, wide_type, "");
1210 }
1211 tmp = LLVMBuildMul(builder, a, b, "");
1212
1213 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1214
1215 /* Since we truncate anyway, LShr and AShr are equivalent. */
1216 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1217 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1218
1219 return res_lo;
1220 }
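/*
 * Scalar sketch (illustration only, not built; assumes <stdint.h>) of what
 * the generic path above computes per lane for the unsigned case; the signed
 * case is the same with sign extension instead of zero extension.
 */
#if 0
static uint32_t
ref_mul_32_lohi(uint32_t a, uint32_t b, uint32_t *res_hi)
{
   uint64_t prod = (uint64_t)a * b;    /* zext, zext, mul */
   *res_hi = (uint32_t)(prod >> 32);   /* high 32 bits */
   return (uint32_t)prod;              /* low 32 bits (trunc) */
}
#endif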
1221
1222
1223 /* a * b + c */
1224 LLVMValueRef
1225 lp_build_mad(struct lp_build_context *bld,
1226 LLVMValueRef a,
1227 LLVMValueRef b,
1228 LLVMValueRef c)
1229 {
1230 const struct lp_type type = bld->type;
1231 if (type.floating) {
1232 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1233 } else {
1234 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1235 }
1236 }
1237
1238
1239 /**
1240 * Small vector x scale multiplication optimization.
1241 */
1242 LLVMValueRef
1243 lp_build_mul_imm(struct lp_build_context *bld,
1244 LLVMValueRef a,
1245 int b)
1246 {
1247 LLVMBuilderRef builder = bld->gallivm->builder;
1248 LLVMValueRef factor;
1249
1250 assert(lp_check_value(bld->type, a));
1251
1252 if(b == 0)
1253 return bld->zero;
1254
1255 if(b == 1)
1256 return a;
1257
1258 if(b == -1)
1259 return lp_build_negate(bld, a);
1260
1261 if(b == 2 && bld->type.floating)
1262 return lp_build_add(bld, a, a);
1263
1264 if(util_is_power_of_two_or_zero(b)) {
1265 unsigned shift = ffs(b) - 1;
1266
1267 if(bld->type.floating) {
1268 #if 0
1269 /*
1270 * Power of two multiplication by directly manipulating the exponent.
1271 *
1272 * XXX: This might not be always faster, it will introduce a small error
1273 * for multiplication by zero, and it will produce wrong results
1274 * for Inf and NaN.
1275 */
1276 unsigned mantissa = lp_mantissa(bld->type);
1277 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1278 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1279 a = LLVMBuildAdd(builder, a, factor, "");
1280 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1281 return a;
1282 #endif
1283 }
1284 else {
1285 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1286 return LLVMBuildShl(builder, a, factor, "");
1287 }
1288 }
1289
1290 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1291 return lp_build_mul(bld, a, factor);
1292 }
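/*
 * Example (illustration only, not built): for an integer type and b = 8,
 * util_is_power_of_two_or_zero(8) holds and shift = ffs(8) - 1 = 3, so the
 * multiply is emitted as a shift.
 */
#if 0
static int
ref_mul_by_8(int a)
{
   return a << 3;   /* a * 8 */
}
#endif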
1293
1294
1295 /**
1296 * Generate a / b
1297 */
1298 LLVMValueRef
1299 lp_build_div(struct lp_build_context *bld,
1300 LLVMValueRef a,
1301 LLVMValueRef b)
1302 {
1303 LLVMBuilderRef builder = bld->gallivm->builder;
1304 const struct lp_type type = bld->type;
1305
1306 assert(lp_check_value(type, a));
1307 assert(lp_check_value(type, b));
1308
1309 if(a == bld->zero)
1310 return bld->zero;
1311 if(a == bld->one && type.floating)
1312 return lp_build_rcp(bld, b);
1313 if(b == bld->zero)
1314 return bld->undef;
1315 if(b == bld->one)
1316 return a;
1317 if(a == bld->undef || b == bld->undef)
1318 return bld->undef;
1319
1320 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1321 if (type.floating)
1322 return LLVMConstFDiv(a, b);
1323 else if (type.sign)
1324 return LLVMConstSDiv(a, b);
1325 else
1326 return LLVMConstUDiv(a, b);
1327 }
1328
1329 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1330 if(FALSE &&
1331 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1332 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1333 type.floating)
1334 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1335
1336 if (type.floating)
1337 return LLVMBuildFDiv(builder, a, b, "");
1338 else if (type.sign)
1339 return LLVMBuildSDiv(builder, a, b, "");
1340 else
1341 return LLVMBuildUDiv(builder, a, b, "");
1342 }
1343
1344
1345 /**
1346 * Linear interpolation helper.
1347 *
1348 * @param normalized whether we are interpolating normalized values,
1349 * encoded in normalized integers, twice as wide.
1350 *
1351 * @sa http://www.stereopsis.com/doubleblend.html
1352 */
1353 static inline LLVMValueRef
1354 lp_build_lerp_simple(struct lp_build_context *bld,
1355 LLVMValueRef x,
1356 LLVMValueRef v0,
1357 LLVMValueRef v1,
1358 unsigned flags)
1359 {
1360 unsigned half_width = bld->type.width/2;
1361 LLVMBuilderRef builder = bld->gallivm->builder;
1362 LLVMValueRef delta;
1363 LLVMValueRef res;
1364
1365 assert(lp_check_value(bld->type, x));
1366 assert(lp_check_value(bld->type, v0));
1367 assert(lp_check_value(bld->type, v1));
1368
1369 delta = lp_build_sub(bld, v1, v0);
1370
1371 if (bld->type.floating) {
1372 assert(flags == 0);
1373 return lp_build_mad(bld, x, delta, v0);
1374 }
1375
1376 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1377 if (!bld->type.sign) {
1378 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1379 /*
1380 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
 1381                  * most-significant bit to the least-significant bit, so that
1382 * later we can just divide by 2**n instead of 2**n - 1.
1383 */
1384
1385 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1386 }
1387
1388 /* (x * delta) >> n */
1389 res = lp_build_mul(bld, x, delta);
1390 res = lp_build_shr_imm(bld, res, half_width);
1391 } else {
1392 /*
1393 * The rescaling trick above doesn't work for signed numbers, so
 1394                 * use the 2**n - 1 division approximation in lp_build_mul_norm
1395 * instead.
1396 */
1397 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1398 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1399 }
1400 } else {
1401 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1402 res = lp_build_mul(bld, x, delta);
1403 }
1404
1405 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1406 /*
1407 * At this point both res and v0 only use the lower half of the bits,
1408 * the rest is zero. Instead of add / mask, do add with half wide type.
1409 */
1410 struct lp_type narrow_type;
1411 struct lp_build_context narrow_bld;
1412
1413 memset(&narrow_type, 0, sizeof narrow_type);
1414 narrow_type.sign = bld->type.sign;
1415 narrow_type.width = bld->type.width/2;
1416 narrow_type.length = bld->type.length*2;
1417
1418 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1419 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1420 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1421 res = lp_build_add(&narrow_bld, v0, res);
1422 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1423 } else {
1424 res = lp_build_add(bld, v0, res);
1425
1426 if (bld->type.fixed) {
1427 /*
1428 * We need to mask out the high order bits when lerping 8bit
 1429           * normalized colors stored in 16 bits
1430 */
1431 /* XXX: This step is necessary for lerping 8bit colors stored on
 1432           * 16 bits, but it will be wrong for true fixed-point use cases.
1433 * Basically we need a more powerful lp_type, capable of further
1434 * distinguishing the values interpretation from the value storage.
1435 */
1436 LLVMValueRef low_bits;
1437 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1438 res = LLVMBuildAnd(builder, res, low_bits, "");
1439 }
1440 }
1441
1442 return res;
1443 }
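/*
 * Scalar sketch (illustration only, not built) of the unsigned
 * LP_BLD_LERP_WIDE_NORMALIZED path above, for 8-bit values held in 16-bit
 * lanes (half_width = 8). The >> of a possibly negative product assumes an
 * arithmetic right shift; modulo 256 this matches what the 16-bit vector
 * arithmetic above produces.
 */
#if 0
static unsigned char
ref_lerp_u8(unsigned char x, unsigned char v0, unsigned char v1)
{
   int delta = (int)v1 - (int)v0;     /* v1 - v0, in [-255, 255] */
   unsigned xs = x + (x >> 7);        /* rescale [0, 255] -> [0, 256] */
   return (unsigned char)(v0 + (((int)xs * delta) >> 8));   /* v0 + (x*delta) >> n */
}
#endif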
1444
1445
1446 /**
1447 * Linear interpolation.
1448 */
1449 LLVMValueRef
1450 lp_build_lerp(struct lp_build_context *bld,
1451 LLVMValueRef x,
1452 LLVMValueRef v0,
1453 LLVMValueRef v1,
1454 unsigned flags)
1455 {
1456 const struct lp_type type = bld->type;
1457 LLVMValueRef res;
1458
1459 assert(lp_check_value(type, x));
1460 assert(lp_check_value(type, v0));
1461 assert(lp_check_value(type, v1));
1462
1463 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1464
1465 if (type.norm) {
1466 struct lp_type wide_type;
1467 struct lp_build_context wide_bld;
1468 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1469
1470 assert(type.length >= 2);
1471
1472 /*
1473 * Create a wider integer type, enough to hold the
1474 * intermediate result of the multiplication.
1475 */
1476 memset(&wide_type, 0, sizeof wide_type);
1477 wide_type.sign = type.sign;
1478 wide_type.width = type.width*2;
1479 wide_type.length = type.length/2;
1480
1481 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1482
1483 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1484 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1485 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1486
1487 /*
1488 * Lerp both halves.
1489 */
1490
1491 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1492
1493 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1494 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1495
1496 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1497 } else {
1498 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1499 }
1500
1501 return res;
1502 }
1503
1504
1505 /**
1506 * Bilinear interpolation.
1507 *
 1508  * Value indices are in v_{yx}.
1509 */
1510 LLVMValueRef
1511 lp_build_lerp_2d(struct lp_build_context *bld,
1512 LLVMValueRef x,
1513 LLVMValueRef y,
1514 LLVMValueRef v00,
1515 LLVMValueRef v01,
1516 LLVMValueRef v10,
1517 LLVMValueRef v11,
1518 unsigned flags)
1519 {
1520 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1521 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1522 return lp_build_lerp(bld, y, v0, v1, flags);
1523 }
1524
1525
1526 LLVMValueRef
1527 lp_build_lerp_3d(struct lp_build_context *bld,
1528 LLVMValueRef x,
1529 LLVMValueRef y,
1530 LLVMValueRef z,
1531 LLVMValueRef v000,
1532 LLVMValueRef v001,
1533 LLVMValueRef v010,
1534 LLVMValueRef v011,
1535 LLVMValueRef v100,
1536 LLVMValueRef v101,
1537 LLVMValueRef v110,
1538 LLVMValueRef v111,
1539 unsigned flags)
1540 {
1541 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1542 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1543 return lp_build_lerp(bld, z, v0, v1, flags);
1544 }
1545
1546
1547 /**
1548 * Generate min(a, b)
 1549  * Do checks for special cases but not for NaNs.
1550 */
1551 LLVMValueRef
1552 lp_build_min(struct lp_build_context *bld,
1553 LLVMValueRef a,
1554 LLVMValueRef b)
1555 {
1556 assert(lp_check_value(bld->type, a));
1557 assert(lp_check_value(bld->type, b));
1558
1559 if(a == bld->undef || b == bld->undef)
1560 return bld->undef;
1561
1562 if(a == b)
1563 return a;
1564
1565 if (bld->type.norm) {
1566 if (!bld->type.sign) {
1567 if (a == bld->zero || b == bld->zero) {
1568 return bld->zero;
1569 }
1570 }
1571 if(a == bld->one)
1572 return b;
1573 if(b == bld->one)
1574 return a;
1575 }
1576
1577 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1578 }
1579
1580
1581 /**
1582 * Generate min(a, b)
 1583  * NaNs are handled according to the behavior specified by the
1584 * nan_behavior argument.
1585 */
1586 LLVMValueRef
1587 lp_build_min_ext(struct lp_build_context *bld,
1588 LLVMValueRef a,
1589 LLVMValueRef b,
1590 enum gallivm_nan_behavior nan_behavior)
1591 {
1592 assert(lp_check_value(bld->type, a));
1593 assert(lp_check_value(bld->type, b));
1594
1595 if(a == bld->undef || b == bld->undef)
1596 return bld->undef;
1597
1598 if(a == b)
1599 return a;
1600
1601 if (bld->type.norm) {
1602 if (!bld->type.sign) {
1603 if (a == bld->zero || b == bld->zero) {
1604 return bld->zero;
1605 }
1606 }
1607 if(a == bld->one)
1608 return b;
1609 if(b == bld->one)
1610 return a;
1611 }
1612
1613 return lp_build_min_simple(bld, a, b, nan_behavior);
1614 }
1615
1616 /**
1617 * Generate max(a, b)
1618 * Do checks for special cases, but NaN behavior is undefined.
1619 */
1620 LLVMValueRef
1621 lp_build_max(struct lp_build_context *bld,
1622 LLVMValueRef a,
1623 LLVMValueRef b)
1624 {
1625 assert(lp_check_value(bld->type, a));
1626 assert(lp_check_value(bld->type, b));
1627
1628 if(a == bld->undef || b == bld->undef)
1629 return bld->undef;
1630
1631 if(a == b)
1632 return a;
1633
1634 if(bld->type.norm) {
1635 if(a == bld->one || b == bld->one)
1636 return bld->one;
1637 if (!bld->type.sign) {
1638 if (a == bld->zero) {
1639 return b;
1640 }
1641 if (b == bld->zero) {
1642 return a;
1643 }
1644 }
1645 }
1646
1647 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1648 }
1649
1650
1651 /**
1652 * Generate max(a, b)
1653 * Checks for special cases.
 1654  * NaNs are handled according to the behavior specified by the
1655 * nan_behavior argument.
1656 */
1657 LLVMValueRef
1658 lp_build_max_ext(struct lp_build_context *bld,
1659 LLVMValueRef a,
1660 LLVMValueRef b,
1661 enum gallivm_nan_behavior nan_behavior)
1662 {
1663 assert(lp_check_value(bld->type, a));
1664 assert(lp_check_value(bld->type, b));
1665
1666 if(a == bld->undef || b == bld->undef)
1667 return bld->undef;
1668
1669 if(a == b)
1670 return a;
1671
1672 if(bld->type.norm) {
1673 if(a == bld->one || b == bld->one)
1674 return bld->one;
1675 if (!bld->type.sign) {
1676 if (a == bld->zero) {
1677 return b;
1678 }
1679 if (b == bld->zero) {
1680 return a;
1681 }
1682 }
1683 }
1684
1685 return lp_build_max_simple(bld, a, b, nan_behavior);
1686 }
1687
1688 /**
1689 * Generate clamp(a, min, max)
1690 * NaN behavior (for any of a, min, max) is undefined.
1691 * Do checks for special cases.
1692 */
1693 LLVMValueRef
1694 lp_build_clamp(struct lp_build_context *bld,
1695 LLVMValueRef a,
1696 LLVMValueRef min,
1697 LLVMValueRef max)
1698 {
1699 assert(lp_check_value(bld->type, a));
1700 assert(lp_check_value(bld->type, min));
1701 assert(lp_check_value(bld->type, max));
1702
1703 a = lp_build_min(bld, a, max);
1704 a = lp_build_max(bld, a, min);
1705 return a;
1706 }
1707
1708
1709 /**
1710 * Generate clamp(a, 0, 1)
1711 * A NaN will get converted to zero.
1712 */
1713 LLVMValueRef
1714 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1715 LLVMValueRef a)
1716 {
1717 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1718 a = lp_build_min(bld, a, bld->one);
1719 return a;
1720 }
1721
1722
1723 /**
1724 * Generate abs(a)
1725 */
1726 LLVMValueRef
1727 lp_build_abs(struct lp_build_context *bld,
1728 LLVMValueRef a)
1729 {
1730 LLVMBuilderRef builder = bld->gallivm->builder;
1731 const struct lp_type type = bld->type;
1732 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1733
1734 assert(lp_check_value(type, a));
1735
1736 if(!type.sign)
1737 return a;
1738
1739 if(type.floating) {
1740 char intrinsic[32];
1741 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1742 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1743 }
1744
1745 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1746 switch(type.width) {
1747 case 8:
1748 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1749 case 16:
1750 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1751 case 32:
1752 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1753 }
1754 }
1755 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
1756 switch(type.width) {
1757 case 8:
1758 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1759 case 16:
1760 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1761 case 32:
1762 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1763 }
1764 }
1765
1766 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1767 a, LLVMBuildNeg(builder, a, ""));
1768 }
1769
1770
1771 LLVMValueRef
1772 lp_build_negate(struct lp_build_context *bld,
1773 LLVMValueRef a)
1774 {
1775 LLVMBuilderRef builder = bld->gallivm->builder;
1776
1777 assert(lp_check_value(bld->type, a));
1778
1779 if (bld->type.floating)
1780 a = LLVMBuildFNeg(builder, a, "");
1781 else
1782 a = LLVMBuildNeg(builder, a, "");
1783
1784 return a;
1785 }
1786
1787
1788 /** Return -1, 0 or +1 depending on the sign of a */
1789 LLVMValueRef
1790 lp_build_sgn(struct lp_build_context *bld,
1791 LLVMValueRef a)
1792 {
1793 LLVMBuilderRef builder = bld->gallivm->builder;
1794 const struct lp_type type = bld->type;
1795 LLVMValueRef cond;
1796 LLVMValueRef res;
1797
1798 assert(lp_check_value(type, a));
1799
1800 /* Handle non-zero case */
1801 if(!type.sign) {
1802 /* if not zero then sign must be positive */
1803 res = bld->one;
1804 }
1805 else if(type.floating) {
1806 LLVMTypeRef vec_type;
1807 LLVMTypeRef int_type;
1808 LLVMValueRef mask;
1809 LLVMValueRef sign;
1810 LLVMValueRef one;
1811 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1812
1813 int_type = lp_build_int_vec_type(bld->gallivm, type);
1814 vec_type = lp_build_vec_type(bld->gallivm, type);
1815 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1816
 1817       /* Take the sign bit and OR it into the constant 1.0 */
1818 sign = LLVMBuildBitCast(builder, a, int_type, "");
1819 sign = LLVMBuildAnd(builder, sign, mask, "");
1820 one = LLVMConstBitCast(bld->one, int_type);
1821 res = LLVMBuildOr(builder, sign, one, "");
1822 res = LLVMBuildBitCast(builder, res, vec_type, "");
1823 }
1824 else
1825 {
1826 /* signed int/norm/fixed point */
1827 /* could use psign with sse3 and appropriate vectors here */
1828 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1829 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1830 res = lp_build_select(bld, cond, bld->one, minus_one);
1831 }
1832
1833 /* Handle zero */
1834 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1835 res = lp_build_select(bld, cond, bld->zero, res);
1836
1837 return res;
1838 }
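
/*
 * Illustrative scalar sketch of the floating point path above (not part of
 * gallivm; sgn_ref is a hypothetical helper, shown only to make the bit
 * trick explicit):
 *
 *   static float sgn_ref(float a)
 *   {
 *      uint32_t bits, res;
 *      float r;
 *      memcpy(&bits, &a, sizeof bits);       // reinterpret float as int
 *      res = (bits & 0x80000000u)            // isolate the sign bit
 *          | 0x3f800000u;                    // or it into the 1.0f pattern
 *      memcpy(&r, &res, sizeof r);           // r is now +1.0f or -1.0f
 *      return a == 0.0f ? 0.0f : r;          // zero (and -0.0) map to 0
 *   }
 */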
1839
1840
1841 /**
1842 * Set the sign of float vector 'a' according to 'sign'.
1843 * If sign==0, return abs(a).
1844 * If sign==1, return -abs(a).
1845 * Other values for sign produce undefined results.
1846 */
1847 LLVMValueRef
1848 lp_build_set_sign(struct lp_build_context *bld,
1849 LLVMValueRef a, LLVMValueRef sign)
1850 {
1851 LLVMBuilderRef builder = bld->gallivm->builder;
1852 const struct lp_type type = bld->type;
1853 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1854 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1855 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1856 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1857 ~((unsigned long long) 1 << (type.width - 1)));
1858 LLVMValueRef val, res;
1859
1860 assert(type.floating);
1861 assert(lp_check_value(type, a));
1862
1863 /* val = reinterpret_cast<int>(a) */
1864 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1865 /* val = val & mask */
1866 val = LLVMBuildAnd(builder, val, mask, "");
1867 /* sign = sign << shift */
1868 sign = LLVMBuildShl(builder, sign, shift, "");
1869 /* res = val | sign */
1870 res = LLVMBuildOr(builder, val, sign, "");
1871 /* res = reinterpret_cast<float>(res) */
1872 res = LLVMBuildBitCast(builder, res, vec_type, "");
1873
1874 return res;
1875 }
1876
1877
1878 /**
1879 * Convert vector of (or scalar) int to vector of (or scalar) float.
1880 */
1881 LLVMValueRef
1882 lp_build_int_to_float(struct lp_build_context *bld,
1883 LLVMValueRef a)
1884 {
1885 LLVMBuilderRef builder = bld->gallivm->builder;
1886 const struct lp_type type = bld->type;
1887 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1888
1889 assert(type.floating);
1890
1891 return LLVMBuildSIToFP(builder, a, vec_type, "");
1892 }
1893
1894 static boolean
1895 arch_rounding_available(const struct lp_type type)
1896 {
1897 if ((util_cpu_caps.has_sse4_1 &&
1898 (type.length == 1 || type.width*type.length == 128)) ||
1899 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1900 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1901 return TRUE;
1902 else if ((util_cpu_caps.has_altivec &&
1903 (type.width == 32 && type.length == 4)))
1904 return TRUE;
1905 else if (util_cpu_caps.has_neon)
1906 return TRUE;
1907
1908 return FALSE;
1909 }
1910
1911 enum lp_build_round_mode
1912 {
1913 LP_BUILD_ROUND_NEAREST = 0,
1914 LP_BUILD_ROUND_FLOOR = 1,
1915 LP_BUILD_ROUND_CEIL = 2,
1916 LP_BUILD_ROUND_TRUNCATE = 3
1917 };
1918
1919 static inline LLVMValueRef
1920 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1921 LLVMValueRef a)
1922 {
1923 LLVMBuilderRef builder = bld->gallivm->builder;
1924 const struct lp_type type = bld->type;
1925 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1926 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1927 const char *intrinsic;
1928 LLVMValueRef res;
1929
1930 assert(type.floating);
1931 /* using the double precision conversions is a bit more complicated */
1932 assert(type.width == 32);
1933
1934 assert(lp_check_value(type, a));
1935 assert(util_cpu_caps.has_sse2);
1936
1937 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1938 if (type.length == 1) {
1939 LLVMTypeRef vec_type;
1940 LLVMValueRef undef;
1941 LLVMValueRef arg;
1942 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1943
1944 vec_type = LLVMVectorType(bld->elem_type, 4);
1945
1946 intrinsic = "llvm.x86.sse.cvtss2si";
1947
1948 undef = LLVMGetUndef(vec_type);
1949
1950 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1951
1952 res = lp_build_intrinsic_unary(builder, intrinsic,
1953 ret_type, arg);
1954 }
1955 else {
1956 if (type.width* type.length == 128) {
1957 intrinsic = "llvm.x86.sse2.cvtps2dq";
1958 }
1959 else {
1960 assert(type.width*type.length == 256);
1961 assert(util_cpu_caps.has_avx);
1962
1963 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1964 }
1965 res = lp_build_intrinsic_unary(builder, intrinsic,
1966 ret_type, a);
1967 }
1968
1969 return res;
1970 }
1971
1972
1973 /* Round a float (vector) using the AltiVec vrfin/vrfim/vrfip/vrfiz
1974 * intrinsics, selected by 'mode'. */
1975 static inline LLVMValueRef
1976 lp_build_round_altivec(struct lp_build_context *bld,
1977 LLVMValueRef a,
1978 enum lp_build_round_mode mode)
1979 {
1980 LLVMBuilderRef builder = bld->gallivm->builder;
1981 const struct lp_type type = bld->type;
1982 const char *intrinsic = NULL;
1983
1984 assert(type.floating);
1985
1986 assert(lp_check_value(type, a));
1987 assert(util_cpu_caps.has_altivec);
1988
1989 (void)type;
1990
1991 switch (mode) {
1992 case LP_BUILD_ROUND_NEAREST:
1993 intrinsic = "llvm.ppc.altivec.vrfin";
1994 break;
1995 case LP_BUILD_ROUND_FLOOR:
1996 intrinsic = "llvm.ppc.altivec.vrfim";
1997 break;
1998 case LP_BUILD_ROUND_CEIL:
1999 intrinsic = "llvm.ppc.altivec.vrfip";
2000 break;
2001 case LP_BUILD_ROUND_TRUNCATE:
2002 intrinsic = "llvm.ppc.altivec.vrfiz";
2003 break;
2004 }
2005
2006 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2007 }
2008
2009 static inline LLVMValueRef
2010 lp_build_round_arch(struct lp_build_context *bld,
2011 LLVMValueRef a,
2012 enum lp_build_round_mode mode)
2013 {
2014 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2015 LLVMBuilderRef builder = bld->gallivm->builder;
2016 const struct lp_type type = bld->type;
2017 const char *intrinsic_root;
2018 char intrinsic[32];
2019
2020 assert(type.floating);
2021 assert(lp_check_value(type, a));
2022 (void)type;
2023
2024 switch (mode) {
2025 case LP_BUILD_ROUND_NEAREST:
2026 intrinsic_root = "llvm.nearbyint";
2027 break;
2028 case LP_BUILD_ROUND_FLOOR:
2029 intrinsic_root = "llvm.floor";
2030 break;
2031 case LP_BUILD_ROUND_CEIL:
2032 intrinsic_root = "llvm.ceil";
2033 break;
2034 case LP_BUILD_ROUND_TRUNCATE:
2035 intrinsic_root = "llvm.trunc";
2036 break;
2037 }
2038
2039 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2040 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2041 }
2042 else /* (util_cpu_caps.has_altivec) */
2043 return lp_build_round_altivec(bld, a, mode);
2044 }
2045
2046 /**
2047 * Return the integer part of a float (vector) value (== round toward zero).
2048 * The returned value is a float (vector).
2049 * Ex: trunc(-1.5) = -1.0
2050 */
2051 LLVMValueRef
2052 lp_build_trunc(struct lp_build_context *bld,
2053 LLVMValueRef a)
2054 {
2055 LLVMBuilderRef builder = bld->gallivm->builder;
2056 const struct lp_type type = bld->type;
2057
2058 assert(type.floating);
2059 assert(lp_check_value(type, a));
2060
2061 if (arch_rounding_available(type)) {
2062 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2063 }
2064 else {
2065 const struct lp_type type = bld->type;
2066 struct lp_type inttype;
2067 struct lp_build_context intbld;
2068 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2069 LLVMValueRef trunc, res, anosign, mask;
2070 LLVMTypeRef int_vec_type = bld->int_vec_type;
2071 LLVMTypeRef vec_type = bld->vec_type;
2072
2073 inttype = type;
2074 inttype.floating = 0;
2075 lp_build_context_init(&intbld, bld->gallivm, inttype);
2076
2077 /* round by truncation */
2078 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2079 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2080
2081 /* mask out sign bit */
2082 anosign = lp_build_abs(bld, a);
2083 /*
2084 * mask out all values if anosign > 2^24
2085 * This should work both for large ints (all rounding is no-op for them
2086 * because such floats are always exact) as well as special cases like
2087 * NaNs, Infs (taking advantage of the fact they use max exponent).
2088 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2089 */
2090 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2091 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2092 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2093 return lp_build_select(bld, mask, a, res);
2094 }
2095 }
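
/*
 * Illustrative scalar sketch of the emulated path above (hypothetical
 * trunc_ref, 32-bit float only; not part of gallivm):
 *
 *   static float trunc_ref(float a)
 *   {
 *      float absa = fabsf(a);
 *      uint32_t abs_bits;
 *      memcpy(&abs_bits, &absa, sizeof abs_bits);
 *      // values with |a| > 2^24 are already integral, so keep the input;
 *      // comparing the bit pattern (0x4b800000 == 2^24) also catches
 *      // NaN/Inf, whose maximal exponent makes them compare larger.
 *      if (abs_bits > 0x4b800000u)
 *         return a;
 *      return (float)(int)a;                 // round toward zero
 *   }
 */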
2096
2097
2098 /**
2099 * Return float (vector) rounded to nearest integer (vector). The returned
2100 * value is a float (vector).
2101 * Ex: round(0.9) = 1.0
2102 * Ex: round(-1.5) = -2.0
2103 */
2104 LLVMValueRef
2105 lp_build_round(struct lp_build_context *bld,
2106 LLVMValueRef a)
2107 {
2108 LLVMBuilderRef builder = bld->gallivm->builder;
2109 const struct lp_type type = bld->type;
2110
2111 assert(type.floating);
2112 assert(lp_check_value(type, a));
2113
2114 if (arch_rounding_available(type)) {
2115 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2116 }
2117 else {
2118 const struct lp_type type = bld->type;
2119 struct lp_type inttype;
2120 struct lp_build_context intbld;
2121 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2122 LLVMValueRef res, anosign, mask;
2123 LLVMTypeRef int_vec_type = bld->int_vec_type;
2124 LLVMTypeRef vec_type = bld->vec_type;
2125
2126 inttype = type;
2127 inttype.floating = 0;
2128 lp_build_context_init(&intbld, bld->gallivm, inttype);
2129
2130 res = lp_build_iround(bld, a);
2131 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2132
2133 /* mask out sign bit */
2134 anosign = lp_build_abs(bld, a);
2135 /*
2136 * mask out all values if anosign > 2^24
2137 * This should work both for large ints (all rounding is no-op for them
2138 * because such floats are always exact) as well as special cases like
2139 * NaNs, Infs (taking advantage of the fact they use max exponent).
2140 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2141 */
2142 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2143 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2144 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2145 return lp_build_select(bld, mask, a, res);
2146 }
2147 }
2148
2149
2150 /**
2151 * Return floor of float (vector), result is a float (vector)
2152 * Ex: floor(1.1) = 1.0
2153 * Ex: floor(-1.1) = -2.0
2154 */
2155 LLVMValueRef
2156 lp_build_floor(struct lp_build_context *bld,
2157 LLVMValueRef a)
2158 {
2159 LLVMBuilderRef builder = bld->gallivm->builder;
2160 const struct lp_type type = bld->type;
2161
2162 assert(type.floating);
2163 assert(lp_check_value(type, a));
2164
2165 if (arch_rounding_available(type)) {
2166 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2167 }
2168 else {
2169 const struct lp_type type = bld->type;
2170 struct lp_type inttype;
2171 struct lp_build_context intbld;
2172 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2173 LLVMValueRef trunc, res, anosign, mask;
2174 LLVMTypeRef int_vec_type = bld->int_vec_type;
2175 LLVMTypeRef vec_type = bld->vec_type;
2176
2177 if (type.width != 32) {
2178 char intrinsic[32];
2179 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2180 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2181 }
2182
2183 assert(type.width == 32); /* might want to handle doubles at some point */
2184
2185 inttype = type;
2186 inttype.floating = 0;
2187 lp_build_context_init(&intbld, bld->gallivm, inttype);
2188
2189 /* round by truncation */
2190 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2191 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2192
2193 if (type.sign) {
2194 LLVMValueRef tmp;
2195
2196 /*
2197 * fix values if rounding is wrong (for non-special cases)
2198 * - this is the case if trunc > a
2199 */
2200 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2201 /* tmp = trunc > a ? 1.0 : 0.0 */
2202 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2203 tmp = lp_build_and(&intbld, mask, tmp);
2204 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2205 res = lp_build_sub(bld, res, tmp);
2206 }
2207
2208 /* mask out sign bit */
2209 anosign = lp_build_abs(bld, a);
2210 /*
2211 * mask out all values if anosign > 2^24
2212 * This should work both for large ints (all rounding is no-op for them
2213 * because such floats are always exact) as well as special cases like
2214 * NaNs, Infs (taking advantage of the fact they use max exponent).
2215 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2216 */
2217 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2218 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2219 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2220 return lp_build_select(bld, mask, a, res);
2221 }
2222 }
2223
2224
2225 /**
2226 * Return ceiling of float (vector), returning float (vector).
2227 * Ex: ceil( 1.1) = 2.0
2228 * Ex: ceil(-1.1) = -1.0
2229 */
2230 LLVMValueRef
2231 lp_build_ceil(struct lp_build_context *bld,
2232 LLVMValueRef a)
2233 {
2234 LLVMBuilderRef builder = bld->gallivm->builder;
2235 const struct lp_type type = bld->type;
2236
2237 assert(type.floating);
2238 assert(lp_check_value(type, a));
2239
2240 if (arch_rounding_available(type)) {
2241 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2242 }
2243 else {
2244 const struct lp_type type = bld->type;
2245 struct lp_type inttype;
2246 struct lp_build_context intbld;
2247 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2248 LLVMValueRef trunc, res, anosign, mask, tmp;
2249 LLVMTypeRef int_vec_type = bld->int_vec_type;
2250 LLVMTypeRef vec_type = bld->vec_type;
2251
2252 if (type.width != 32) {
2253 char intrinsic[32];
2254 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2255 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2256 }
2257
2258 assert(type.width == 32); /* might want to handle doubles at some point */
2259
2260 inttype = type;
2261 inttype.floating = 0;
2262 lp_build_context_init(&intbld, bld->gallivm, inttype);
2263
2264 /* round by truncation */
2265 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2266 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2267
2268 /*
2269 * fix values if rounding is wrong (for non-special cases)
2270 * - this is the case if trunc < a
2271 */
2272 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2273 /* tmp = trunc < a ? 1.0 : 0.0 */
2274 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2275 tmp = lp_build_and(&intbld, mask, tmp);
2276 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2277 res = lp_build_add(bld, trunc, tmp);
2278
2279 /* mask out sign bit */
2280 anosign = lp_build_abs(bld, a);
2281 /*
2282 * mask out all values if anosign > 2^24
2283 * This should work both for large ints (all rounding is no-op for them
2284 * because such floats are always exact) as well as special cases like
2285 * NaNs, Infs (taking advantage of the fact they use max exponent).
2286 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2287 */
2288 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2289 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2290 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2291 return lp_build_select(bld, mask, a, res);
2292 }
2293 }
2294
2295
2296 /**
2297 * Return fractional part of 'a' computed as a - floor(a)
2298 * Typically used in texture coord arithmetic.
2299 */
2300 LLVMValueRef
2301 lp_build_fract(struct lp_build_context *bld,
2302 LLVMValueRef a)
2303 {
2304 assert(bld->type.floating);
2305 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2306 }
2307
2308
2309 /**
2310 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2311 * against 0.99999(9). (Will also return that value for NaNs.)
2312 */
2313 static inline LLVMValueRef
2314 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2315 {
2316 LLVMValueRef max;
2317
2318 /* this is the largest number smaller than 1.0 representable as float */
2319 max = lp_build_const_vec(bld->gallivm, bld->type,
2320 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2321 return lp_build_min_ext(bld, fract, max,
2322 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2323 }
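
/*
 * Worked value for the 32-bit float case above (illustration only):
 * lp_mantissa() is 23, so max = 1.0 - 2^-24 = 0.99999994 (bit pattern
 * 0x3f7fffff), the largest float strictly below 1.0.
 */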
2324
2325
2326 /**
2327 * Same as lp_build_fract, but guarantees that the result is always smaller
2328 * than one. Will also return the smaller-than-one value for infs, NaNs.
2329 */
2330 LLVMValueRef
2331 lp_build_fract_safe(struct lp_build_context *bld,
2332 LLVMValueRef a)
2333 {
2334 return clamp_fract(bld, lp_build_fract(bld, a));
2335 }
2336
2337
2338 /**
2339 * Return the integer part of a float (vector) value (== round toward zero).
2340 * The returned value is an integer (vector).
2341 * Ex: itrunc(-1.5) = -1
2342 */
2343 LLVMValueRef
2344 lp_build_itrunc(struct lp_build_context *bld,
2345 LLVMValueRef a)
2346 {
2347 LLVMBuilderRef builder = bld->gallivm->builder;
2348 const struct lp_type type = bld->type;
2349 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2350
2351 assert(type.floating);
2352 assert(lp_check_value(type, a));
2353
2354 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2355 }
2356
2357
2358 /**
2359 * Return float (vector) rounded to nearest integer (vector). The returned
2360 * value is an integer (vector).
2361 * Ex: iround(0.9) = 1
2362 * Ex: iround(-1.5) = -2
2363 */
2364 LLVMValueRef
2365 lp_build_iround(struct lp_build_context *bld,
2366 LLVMValueRef a)
2367 {
2368 LLVMBuilderRef builder = bld->gallivm->builder;
2369 const struct lp_type type = bld->type;
2370 LLVMTypeRef int_vec_type = bld->int_vec_type;
2371 LLVMValueRef res;
2372
2373 assert(type.floating);
2374
2375 assert(lp_check_value(type, a));
2376
2377 if ((util_cpu_caps.has_sse2 &&
2378 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2379 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2380 return lp_build_iround_nearest_sse2(bld, a);
2381 }
2382 if (arch_rounding_available(type)) {
2383 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2384 }
2385 else {
2386 LLVMValueRef half;
2387
2388 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2389
2390 if (type.sign) {
2391 LLVMTypeRef vec_type = bld->vec_type;
2392 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2393 (unsigned long long)1 << (type.width - 1));
2394 LLVMValueRef sign;
2395
2396 /* get sign bit */
2397 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2398 sign = LLVMBuildAnd(builder, sign, mask, "");
2399
2400 /* half = copysign(half, a) */
2401 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2402 half = LLVMBuildOr(builder, sign, half, "");
2403 half = LLVMBuildBitCast(builder, half, vec_type, "");
2404 }
2405
2406 res = LLVMBuildFAdd(builder, a, half, "");
2407 }
2408
2409 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2410
2411 return res;
2412 }
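
/*
 * Illustrative scalar sketch of the fallback path above (hypothetical
 * iround_ref, signed 32-bit float case with |a| assumed within int range;
 * not part of gallivm):
 *
 *   static int iround_ref(float a)
 *   {
 *      float half = nextafterf(0.5f, 0.0f);  // largest float below 0.5
 *      half = copysignf(half, a);            // match the sign of a
 *      return (int)(a + half);               // truncate toward zero
 *   }
 *
 * The vector code builds the copysign() by OR-ing the sign bit of 'a' into
 * the bit pattern of the half constant.
 */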
2413
2414
2415 /**
2416 * Return floor of float (vector), result is an int (vector)
2417 * Ex: ifloor(1.1) = 1
2418 * Ex: ifloor(-1.1) = -2
2419 */
2420 LLVMValueRef
2421 lp_build_ifloor(struct lp_build_context *bld,
2422 LLVMValueRef a)
2423 {
2424 LLVMBuilderRef builder = bld->gallivm->builder;
2425 const struct lp_type type = bld->type;
2426 LLVMTypeRef int_vec_type = bld->int_vec_type;
2427 LLVMValueRef res;
2428
2429 assert(type.floating);
2430 assert(lp_check_value(type, a));
2431
2432 res = a;
2433 if (type.sign) {
2434 if (arch_rounding_available(type)) {
2435 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2436 }
2437 else {
2438 struct lp_type inttype;
2439 struct lp_build_context intbld;
2440 LLVMValueRef trunc, itrunc, mask;
2441
2442 assert(type.floating);
2443 assert(lp_check_value(type, a));
2444
2445 inttype = type;
2446 inttype.floating = 0;
2447 lp_build_context_init(&intbld, bld->gallivm, inttype);
2448
2449 /* round by truncation */
2450 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2451 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2452
2453 /*
2454 * fix values if rounding is wrong (for non-special cases)
2455 * - this is the case if trunc > a
2456 * The results of doing this with NaNs, very large values etc.
2457 * are undefined but this seems to be the case anyway.
2458 */
2459 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2460 /* cheapie minus one with mask since the mask is minus one / zero */
2461 return lp_build_add(&intbld, itrunc, mask);
2462 }
2463 }
2464
2465 /* round to nearest (toward zero) */
2466 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2467
2468 return res;
2469 }
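
/*
 * Illustrative scalar sketch of the signed fallback above (hypothetical
 * ifloor_ref, input assumed within int range; not part of gallivm):
 * truncate, then subtract one whenever truncation rounded up. The vector
 * code gets the "minus one" for free because the comparison mask is already
 * all-ones (== -1) or zero.
 *
 *   static int ifloor_ref(float a)
 *   {
 *      int itrunc = (int)a;                  // round toward zero
 *      float trunc = (float)itrunc;
 *      return trunc > a ? itrunc - 1 : itrunc;   // e.g. -1.1 -> -2
 *   }
 */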
2470
2471
2472 /**
2473 * Return ceiling of float (vector), returning int (vector).
2474 * Ex: iceil( 1.1) = 2
2475 * Ex: iceil(-1.1) = -1
2476 */
2477 LLVMValueRef
2478 lp_build_iceil(struct lp_build_context *bld,
2479 LLVMValueRef a)
2480 {
2481 LLVMBuilderRef builder = bld->gallivm->builder;
2482 const struct lp_type type = bld->type;
2483 LLVMTypeRef int_vec_type = bld->int_vec_type;
2484 LLVMValueRef res;
2485
2486 assert(type.floating);
2487 assert(lp_check_value(type, a));
2488
2489 if (arch_rounding_available(type)) {
2490 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2491 }
2492 else {
2493 struct lp_type inttype;
2494 struct lp_build_context intbld;
2495 LLVMValueRef trunc, itrunc, mask;
2496
2497 assert(type.floating);
2498 assert(lp_check_value(type, a));
2499
2500 inttype = type;
2501 inttype.floating = 0;
2502 lp_build_context_init(&intbld, bld->gallivm, inttype);
2503
2504 /* round by truncation */
2505 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2506 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2507
2508 /*
2509 * fix values if rounding is wrong (for non-special cases)
2510 * - this is the case if trunc < a
2511 * The results of doing this with NaNs, very large values etc.
2512 * are undefined but this seems to be the case anyway.
2513 */
2514 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2515 /* cheapie plus one with mask since the mask is minus one / zero */
2516 return lp_build_sub(&intbld, itrunc, mask);
2517 }
2518
2519 /* round to nearest (toward zero) */
2520 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2521
2522 return res;
2523 }
2524
2525
2526 /**
2527 * Combined ifloor() & fract().
2528 *
2529 * Preferred to calling the functions separately, as it will ensure that the
2530 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2531 */
2532 void
2533 lp_build_ifloor_fract(struct lp_build_context *bld,
2534 LLVMValueRef a,
2535 LLVMValueRef *out_ipart,
2536 LLVMValueRef *out_fpart)
2537 {
2538 LLVMBuilderRef builder = bld->gallivm->builder;
2539 const struct lp_type type = bld->type;
2540 LLVMValueRef ipart;
2541
2542 assert(type.floating);
2543 assert(lp_check_value(type, a));
2544
2545 if (arch_rounding_available(type)) {
2546 /*
2547 * floor() is easier.
2548 */
2549
2550 ipart = lp_build_floor(bld, a);
2551 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2552 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2553 }
2554 else {
2555 /*
2556 * ifloor() is easier.
2557 */
2558
2559 *out_ipart = lp_build_ifloor(bld, a);
2560 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2561 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2562 }
2563 }
2564
2565
2566 /**
2567 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2568 * always smaller than one.
2569 */
2570 void
2571 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2572 LLVMValueRef a,
2573 LLVMValueRef *out_ipart,
2574 LLVMValueRef *out_fpart)
2575 {
2576 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2577 *out_fpart = clamp_fract(bld, *out_fpart);
2578 }
2579
2580
2581 LLVMValueRef
2582 lp_build_sqrt(struct lp_build_context *bld,
2583 LLVMValueRef a)
2584 {
2585 LLVMBuilderRef builder = bld->gallivm->builder;
2586 const struct lp_type type = bld->type;
2587 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2588 char intrinsic[32];
2589
2590 assert(lp_check_value(type, a));
2591
2592 assert(type.floating);
2593 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2594
2595 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2596 }
2597
2598
2599 /**
2600 * Do one Newton-Raphson step to improve reciprocal precision:
2601 *
2602 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2603 *
2604 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2605 * +/-Inf, giving NaN instead. Certain applications rely on the conformant
2606 * behavior; Google Earth, for example, does RCP(RSQRT(0.0)) when drawing
2607 * the Earth's halo. It would be necessary to clamp the argument to prevent this.
2608 *
2609 * See also:
2610 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2611 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2612 */
2613 static inline LLVMValueRef
2614 lp_build_rcp_refine(struct lp_build_context *bld,
2615 LLVMValueRef a,
2616 LLVMValueRef rcp_a)
2617 {
2618 LLVMBuilderRef builder = bld->gallivm->builder;
2619 LLVMValueRef neg_a;
2620 LLVMValueRef res;
2621
2622 neg_a = LLVMBuildFNeg(builder, a, "");
2623 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2624 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2625
2626 return res;
2627 }
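
/*
 * Worked step of the refinement above (illustration only): for a = 3 with a
 * rough estimate x0 = 0.33,
 *
 *   e  = 1 - a*x0   = 1 - 0.99        = 0.01
 *   x1 = x0 + x0*e  = 0.33 + 0.0033   = 0.3333
 *
 * Each step roughly doubles the number of correct digits (quadratic
 * convergence), which is why one step after a low-precision RCPPS estimate
 * would already get close to full single precision.
 */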
2628
2629
2630 LLVMValueRef
2631 lp_build_rcp(struct lp_build_context *bld,
2632 LLVMValueRef a)
2633 {
2634 LLVMBuilderRef builder = bld->gallivm->builder;
2635 const struct lp_type type = bld->type;
2636
2637 assert(lp_check_value(type, a));
2638
2639 if(a == bld->zero)
2640 return bld->undef;
2641 if(a == bld->one)
2642 return bld->one;
2643 if(a == bld->undef)
2644 return bld->undef;
2645
2646 assert(type.floating);
2647
2648 if(LLVMIsConstant(a))
2649 return LLVMConstFDiv(bld->one, a);
2650
2651 /*
2652 * We don't use RCPPS because:
2653 * - it only has 10 bits of precision
2654 * - it doesn't even get the reciprocal of 1.0 exactly
2655 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2656 * - for recent processors the benefit over DIVPS is marginal and case
2657 * dependent
2658 *
2659 * We could still use it on certain processors if benchmarks show that the
2660 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2661 * particular uses that require fewer workarounds.
2662 */
2663
2664 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2665 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2666 const unsigned num_iterations = 0;
2667 LLVMValueRef res;
2668 unsigned i;
2669 const char *intrinsic = NULL;
2670
2671 if (type.length == 4) {
2672 intrinsic = "llvm.x86.sse.rcp.ps";
2673 }
2674 else {
2675 intrinsic = "llvm.x86.avx.rcp.ps.256";
2676 }
2677
2678 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2679
2680 for (i = 0; i < num_iterations; ++i) {
2681 res = lp_build_rcp_refine(bld, a, res);
2682 }
2683
2684 return res;
2685 }
2686
2687 return LLVMBuildFDiv(builder, bld->one, a, "");
2688 }
2689
2690
2691 /**
2692 * Do one Newton-Raphson step to improve rsqrt precision:
2693 *
2694 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2695 *
2696 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2697 */
2698 static inline LLVMValueRef
2699 lp_build_rsqrt_refine(struct lp_build_context *bld,
2700 LLVMValueRef a,
2701 LLVMValueRef rsqrt_a)
2702 {
2703 LLVMBuilderRef builder = bld->gallivm->builder;
2704 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2705 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2706 LLVMValueRef res;
2707
2708 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2709 res = LLVMBuildFMul(builder, a, res, "");
2710 res = LLVMBuildFSub(builder, three, res, "");
2711 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2712 res = LLVMBuildFMul(builder, half, res, "");
2713
2714 return res;
2715 }
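
/*
 * Worked step of the refinement above (illustration only): for a = 4 with an
 * estimate x0 = 0.51,
 *
 *   x1 = 0.5 * 0.51 * (3 - 4 * 0.51 * 0.51)
 *      = 0.255 * 1.9596
 *      = 0.49970
 *
 * converging quadratically toward 1/sqrt(4) = 0.5.
 */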
2716
2717
2718 /**
2719 * Generate 1/sqrt(a).
2720 * Result is undefined for values < 0, infinity for +0.
2721 */
2722 LLVMValueRef
2723 lp_build_rsqrt(struct lp_build_context *bld,
2724 LLVMValueRef a)
2725 {
2726 const struct lp_type type = bld->type;
2727
2728 assert(lp_check_value(type, a));
2729
2730 assert(type.floating);
2731
2732 /*
2733 * This should be faster but all denormals will end up as infinity.
2734 */
2735 if (0 && lp_build_fast_rsqrt_available(type)) {
2736 const unsigned num_iterations = 1;
2737 LLVMValueRef res;
2738 unsigned i;
2739
2740 /* rsqrt(1.0) != 1.0 here */
2741 res = lp_build_fast_rsqrt(bld, a);
2742
2743 if (num_iterations) {
2744 /*
2745 * Newton-Raphson will result in NaN instead of infinity for zero,
2746 * and NaN instead of zero for infinity.
2747 * Also, need to ensure rsqrt(1.0) == 1.0.
2748 * All numbers smaller than FLT_MIN will result in +infinity
2749 * (rsqrtps treats all denormals as zero).
2750 */
2751 LLVMValueRef cmp;
2752 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2753 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2754
2755 for (i = 0; i < num_iterations; ++i) {
2756 res = lp_build_rsqrt_refine(bld, a, res);
2757 }
2758 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2759 res = lp_build_select(bld, cmp, inf, res);
2760 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2761 res = lp_build_select(bld, cmp, bld->zero, res);
2762 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2763 res = lp_build_select(bld, cmp, bld->one, res);
2764 }
2765
2766 return res;
2767 }
2768
2769 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2770 }
2771
2772 /**
2773 * Report whether a fast (inaccurate) rsqrt instruction is available.
2774 * Callers may want to avoid calling rsqrt_fast() when it is not: e.g. for
2775 * x^0.5 they may compute rsqrt_fast(x) * x, but when the instruction is
2776 * unavailable that would expand to sqrt/div/mul, so it is clearly better
2777 * to just call sqrt, skipping both the div and the mul.
2778 */
2779 boolean
2780 lp_build_fast_rsqrt_available(struct lp_type type)
2781 {
2782 assert(type.floating);
2783
2784 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2785 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2786 return true;
2787 }
2788 return false;
2789 }
2790
2791
2792 /**
2793 * Generate 1/sqrt(a).
2794 * Result is undefined for values < 0, infinity for +0.
2795 * Precision is limited, only ~10 bits guaranteed
2796 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2797 */
2798 LLVMValueRef
2799 lp_build_fast_rsqrt(struct lp_build_context *bld,
2800 LLVMValueRef a)
2801 {
2802 LLVMBuilderRef builder = bld->gallivm->builder;
2803 const struct lp_type type = bld->type;
2804
2805 assert(lp_check_value(type, a));
2806
2807 if (lp_build_fast_rsqrt_available(type)) {
2808 const char *intrinsic = NULL;
2809
2810 if (type.length == 4) {
2811 intrinsic = "llvm.x86.sse.rsqrt.ps";
2812 }
2813 else {
2814 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2815 }
2816 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2817 }
2818 else {
2819 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2820 }
2821 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2822 }
2823
2824
2825 /**
2826 * Generate sin(a) or cos(a) using polynomial approximation.
2827 * TODO: it might be worth recognizing when sin and cos use the same source
2828 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2829 * would be way cheaper than calculating (nearly) everything twice, but it
2830 * is not clear the case is common enough to be worth bothering; the scs
2831 * opcode could also benefit from calculating both, though.
2832 */
2833 static LLVMValueRef
2834 lp_build_sin_or_cos(struct lp_build_context *bld,
2835 LLVMValueRef a,
2836 boolean cos)
2837 {
2838 struct gallivm_state *gallivm = bld->gallivm;
2839 LLVMBuilderRef b = gallivm->builder;
2840 struct lp_type int_type = lp_int_type(bld->type);
2841
2842 /*
2843 * take the absolute value,
2844 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2845 */
2846
2847 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2848 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2849
2850 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2851 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2852
2853 /*
2854 * scale by 4/Pi
2855 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2856 */
2857
2858 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2859 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2860
2861 /*
2862 * store the integer part of y in mm0
2863 * emm2 = _mm_cvttps_epi32(y);
2864 */
2865
2866 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2867
2868 /*
2869 * j=(j+1) & (~1) (see the cephes sources)
2870 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2871 */
2872
2873 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2874 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2875 /*
2876 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2877 */
2878 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2879 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2880
2881 /*
2882 * y = _mm_cvtepi32_ps(emm2);
2883 */
2884 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2885
2886 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2887 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2888 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2889 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2890
2891 /*
2892 * Argument used for poly selection and sign bit determination
2893 * is different for sin vs. cos.
2894 */
2895 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2896 emm2_and;
2897
2898 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2899 LLVMBuildNot(b, emm2_2, ""), ""),
2900 const_29, "sign_bit") :
2901 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2902 LLVMBuildShl(b, emm2_add,
2903 const_29, ""), ""),
2904 sign_mask, "sign_bit");
2905
2906 /*
2907 * get the polynomial selection mask:
2908 * there is one polynomial for 0 <= x <= Pi/4
2909 * and another one for Pi/4 < x <= Pi/2.
2910 * Both branches will be computed.
2911 *
2912 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2913 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2914 */
2915
2916 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2917 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2918 int_type, PIPE_FUNC_EQUAL,
2919 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2920
2921 /*
2922 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2923 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2924 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2925 */
2926 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2927 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2928 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2929
2930 /*
2931 * The magic pass: "Extended precision modular arithmetic"
2932 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2933 */
2934 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2935 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2936 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2937
2938 /*
2939 * Evaluate the first polynomial (0 <= x <= Pi/4)
2940 *
2941 * z = _mm_mul_ps(x,x);
2942 */
2943 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2944
2945 /*
2946 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2947 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2948 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2949 */
2950 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2951 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2952 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2953
2954 /*
2955 * y = *(v4sf*)_ps_coscof_p0;
2956 * y = _mm_mul_ps(y, z);
2957 */
2958 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2959 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2960 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2961 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2962
2963
2964 /*
2965 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2966 * y = _mm_sub_ps(y, tmp);
2967 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2968 */
2969 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2970 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2971 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2972 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2973 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2974
2975 /*
2976 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2977 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2978 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2979 */
2980 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2981 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2982 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2983
2984 /*
2985 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2986 *
2987 * y2 = *(v4sf*)_ps_sincof_p0;
2988 * y2 = _mm_mul_ps(y2, z);
2989 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2990 * y2 = _mm_mul_ps(y2, z);
2991 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2992 * y2 = _mm_mul_ps(y2, z);
2993 * y2 = _mm_mul_ps(y2, x);
2994 * y2 = _mm_add_ps(y2, x);
2995 */
2996
2997 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2998 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2999 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3000 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3001
3002 /*
3003 * select the correct result from the two polynomials
3004 * xmm3 = poly_mask;
3005 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3006 * y = _mm_andnot_ps(xmm3, y);
3007 * y = _mm_or_ps(y,y2);
3008 */
3009 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3010 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3011 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3012 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3013 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3014 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3015
3016 /*
3017 * update the sign
3018 * y = _mm_xor_ps(y, sign_bit);
3019 */
3020 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3021 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3022
3023 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3024
3025 /* clamp output to be within [-1, 1] */
3026 y_result = lp_build_clamp(bld, y_result,
3027 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3028 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3029 /* If a is -inf, inf or NaN then return NaN */
3030 y_result = lp_build_select(bld, isfinite, y_result,
3031 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3032 return y_result;
3033 }
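
/*
 * Summary of the scheme above (a restatement of the cephes/sse_mathfun style
 * algorithm the inline comments refer to, for illustration only):
 *
 *   1. x = |a|; j = ((int)(x * 4/Pi) + 1) & ~1 (i.e. rounded up to an even
 *      value); y = (float)j
 *   2. range-reduce in extended precision: x = ((x - y*DP1) - y*DP2) - y*DP3,
 *      leaving x roughly within [-Pi/4, Pi/4]
 *   3. with z = x^2, evaluate either the cosine polynomial
 *         1 - z/2 + z^2*(c2 + z*(c1 + z*c0))
 *      or the sine polynomial
 *         x + x*z*(s2 + z*(s1 + z*s0))
 *      per channel, selected by poly_mask according to the octant
 *   4. flip the sign according to the quadrant (and, for sin, the input sign),
 *      clamp to [-1, 1] and return NaN for non-finite inputs
 */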
3034
3035
3036 /**
3037 * Generate sin(a)
3038 */
3039 LLVMValueRef
3040 lp_build_sin(struct lp_build_context *bld,
3041 LLVMValueRef a)
3042 {
3043 return lp_build_sin_or_cos(bld, a, FALSE);
3044 }
3045
3046
3047 /**
3048 * Generate cos(a)
3049 */
3050 LLVMValueRef
3051 lp_build_cos(struct lp_build_context *bld,
3052 LLVMValueRef a)
3053 {
3054 return lp_build_sin_or_cos(bld, a, TRUE);
3055 }
3056
3057
3058 /**
3059 * Generate pow(x, y)
3060 */
3061 LLVMValueRef
3062 lp_build_pow(struct lp_build_context *bld,
3063 LLVMValueRef x,
3064 LLVMValueRef y)
3065 {
3066 /* TODO: optimize the constant case */
3067 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3068 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3069 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3070 __FUNCTION__);
3071 }
3072
3073 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3074 }
3075
3076
3077 /**
3078 * Generate exp(x)
3079 */
3080 LLVMValueRef
3081 lp_build_exp(struct lp_build_context *bld,
3082 LLVMValueRef x)
3083 {
3084 /* log2(e) = 1/log(2) */
3085 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3086 1.4426950408889634);
3087
3088 assert(lp_check_value(bld->type, x));
3089
3090 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3091 }
3092
3093
3094 /**
3095 * Generate log(x)
3096 * Behavior is undefined with infs, 0s and nans
3097 */
3098 LLVMValueRef
3099 lp_build_log(struct lp_build_context *bld,
3100 LLVMValueRef x)
3101 {
3102 /* log(2) */
3103 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3104 0.69314718055994529);
3105
3106 assert(lp_check_value(bld->type, x));
3107
3108 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3109 }
3110
3111 /**
3112 * Generate log(x) that handles edge cases (infs, 0s and nans)
3113 */
3114 LLVMValueRef
3115 lp_build_log_safe(struct lp_build_context *bld,
3116 LLVMValueRef x)
3117 {
3118 /* log(2) */
3119 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3120 0.69314718055994529);
3121
3122 assert(lp_check_value(bld->type, x));
3123
3124 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3125 }
3126
3127
3128 /**
3129 * Generate polynomial.
3130 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3131 */
3132 LLVMValueRef
3133 lp_build_polynomial(struct lp_build_context *bld,
3134 LLVMValueRef x,
3135 const double *coeffs,
3136 unsigned num_coeffs)
3137 {
3138 const struct lp_type type = bld->type;
3139 LLVMValueRef even = NULL, odd = NULL;
3140 LLVMValueRef x2;
3141 unsigned i;
3142
3143 assert(lp_check_value(bld->type, x));
3144
3145 /* TODO: optimize the constant case */
3146 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3147 LLVMIsConstant(x)) {
3148 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3149 __FUNCTION__);
3150 }
3151
3152 /*
3153 * Calculate odd and even terms separately to decrease data dependency
3154 * Ex:
3155 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3156 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3157 */
3158 x2 = lp_build_mul(bld, x, x);
3159
3160 for (i = num_coeffs; i--; ) {
3161 LLVMValueRef coeff;
3162
3163 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3164
3165 if (i % 2 == 0) {
3166 if (even)
3167 even = lp_build_mad(bld, x2, even, coeff);
3168 else
3169 even = coeff;
3170 } else {
3171 if (odd)
3172 odd = lp_build_mad(bld, x2, odd, coeff);
3173 else
3174 odd = coeff;
3175 }
3176 }
3177
3178 if (odd)
3179 return lp_build_mad(bld, odd, x, even);
3180 else if (even)
3181 return even;
3182 else
3183 return bld->undef;
3184 }
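
/*
 * Illustrative expansion of the even/odd split above for four coefficients
 * c0..c3, with x2 = x*x (illustration only):
 *
 *   even = c0 + x2*c2
 *   odd  = c1 + x2*c3
 *   p(x) = even + x*odd = c0 + c1*x + c2*x^2 + c3*x^3
 *
 * The two mad chains are independent, so they can execute in parallel,
 * roughly halving the dependency depth compared with a plain Horner scheme.
 */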
3185
3186
3187 /**
3188 * Minimax polynomial fit of 2**x, in range [0, 1[
3189 */
3190 const double lp_build_exp2_polynomial[] = {
3191 #if EXP_POLY_DEGREE == 5
3192 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3193 0.693153073200168932794,
3194 0.240153617044375388211,
3195 0.0558263180532956664775,
3196 0.00898934009049466391101,
3197 0.00187757667519147912699
3198 #elif EXP_POLY_DEGREE == 4
3199 1.00000259337069434683,
3200 0.693003834469974940458,
3201 0.24144275689150793076,
3202 0.0520114606103070150235,
3203 0.0135341679161270268764
3204 #elif EXP_POLY_DEGREE == 3
3205 0.999925218562710312959,
3206 0.695833540494823811697,
3207 0.226067155427249155588,
3208 0.0780245226406372992967
3209 #elif EXP_POLY_DEGREE == 2
3210 1.00172476321474503578,
3211 0.657636275736077639316,
3212 0.33718943461968720704
3213 #else
3214 #error
3215 #endif
3216 };
3217
3218
3219 LLVMValueRef
3220 lp_build_exp2(struct lp_build_context *bld,
3221 LLVMValueRef x)
3222 {
3223 LLVMBuilderRef builder = bld->gallivm->builder;
3224 const struct lp_type type = bld->type;
3225 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3226 LLVMValueRef ipart = NULL;
3227 LLVMValueRef fpart = NULL;
3228 LLVMValueRef expipart = NULL;
3229 LLVMValueRef expfpart = NULL;
3230 LLVMValueRef res = NULL;
3231
3232 assert(lp_check_value(bld->type, x));
3233
3234 /* TODO: optimize the constant case */
3235 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3236 LLVMIsConstant(x)) {
3237 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3238 __FUNCTION__);
3239 }
3240
3241 assert(type.floating && type.width == 32);
3242
3243 /* We want to preserve NaN and make sure that for exp2, if x > 128,
3244 * the result is INF and if it's smaller than -126.9 the result is 0 */
3245 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3246 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3247 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3248 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3249
3250 /* ipart = floor(x) */
3251 /* fpart = x - ipart */
3252 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3253
3254 /* expipart = (float) (1 << ipart) */
3255 expipart = LLVMBuildAdd(builder, ipart,
3256 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3257 expipart = LLVMBuildShl(builder, expipart,
3258 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3259 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3260
3261 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3262 ARRAY_SIZE(lp_build_exp2_polynomial));
3263
3264 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3265
3266 return res;
3267 }
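
/*
 * Illustrative scalar sketch of the decomposition above (hypothetical
 * exp2_ref; poly() stands for the minimax polynomial over fpart, and x is
 * assumed already clamped as above; not part of gallivm):
 *
 *   static float exp2_ref(float x)
 *   {
 *      int   ipart = (int)floorf(x);
 *      float fpart = x - (float)ipart;        // in [0, 1)
 *      uint32_t bits = (uint32_t)(ipart + 127) << 23;   // 2^ipart built
 *      float scale;                                     // directly in the
 *      memcpy(&scale, &bits, sizeof scale);             // exponent field
 *      return scale * poly(fpart);            // 2^x = 2^ipart * 2^fpart
 *   }
 */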
3268
3269
3270
3271 /**
3272 * Extract the exponent of an IEEE-754 floating point value.
3273 *
3274 * Optionally apply an integer bias.
3275 *
3276 * Result is an integer value with
3277 *
3278 * ifloor(log2(x)) + bias
3279 */
3280 LLVMValueRef
3281 lp_build_extract_exponent(struct lp_build_context *bld,
3282 LLVMValueRef x,
3283 int bias)
3284 {
3285 LLVMBuilderRef builder = bld->gallivm->builder;
3286 const struct lp_type type = bld->type;
3287 unsigned mantissa = lp_mantissa(type);
3288 LLVMValueRef res;
3289
3290 assert(type.floating);
3291
3292 assert(lp_check_value(bld->type, x));
3293
3294 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3295
3296 res = LLVMBuildLShr(builder, x,
3297 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3298 res = LLVMBuildAnd(builder, res,
3299 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3300 res = LLVMBuildSub(builder, res,
3301 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3302
3303 return res;
3304 }
3305
3306
3307 /**
3308 * Extract the mantissa of a floating point value.
3309 *
3310 * Result is a floating point value with
3311 *
3312 * x / 2**floor(log2(x))
3313 */
3314 LLVMValueRef
3315 lp_build_extract_mantissa(struct lp_build_context *bld,
3316 LLVMValueRef x)
3317 {
3318 LLVMBuilderRef builder = bld->gallivm->builder;
3319 const struct lp_type type = bld->type;
3320 unsigned mantissa = lp_mantissa(type);
3321 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3322 (1ULL << mantissa) - 1);
3323 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3324 LLVMValueRef res;
3325
3326 assert(lp_check_value(bld->type, x));
3327
3328 assert(type.floating);
3329
3330 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3331
3332 /* res = x / 2**ipart */
3333 res = LLVMBuildAnd(builder, x, mantmask, "");
3334 res = LLVMBuildOr(builder, res, one, "");
3335 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3336
3337 return res;
3338 }
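
/*
 * Worked example for the two extraction helpers above (illustration only):
 * x = 6.0f has bit pattern 0x40c00000, i.e. biased exponent 129 and mantissa
 * bits 0x400000, so
 *
 *   lp_build_extract_exponent(x, 0) -> 129 - 127 = 2
 *   lp_build_extract_mantissa(x)    -> 0x400000 | bits(1.0f) = 0x3fc00000 = 1.5
 *
 * and indeed 6.0 = 1.5 * 2^2.
 */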
3339
3340
3341
3342 /**
3343 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[.
3344 * These coefficients can be generated with
3345 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3346 */
3347 const double lp_build_log2_polynomial[] = {
3348 #if LOG_POLY_DEGREE == 5
3349 2.88539008148777786488L,
3350 0.961796878841293367824L,
3351 0.577058946784739859012L,
3352 0.412914355135828735411L,
3353 0.308591899232910175289L,
3354 0.352376952300281371868L,
3355 #elif LOG_POLY_DEGREE == 4
3356 2.88539009343309178325L,
3357 0.961791550404184197881L,
3358 0.577440339438736392009L,
3359 0.403343858251329912514L,
3360 0.406718052498846252698L,
3361 #elif LOG_POLY_DEGREE == 3
3362 2.88538959748872753838L,
3363 0.961932915889597772928L,
3364 0.571118517972136195241L,
3365 0.493997535084709500285L,
3366 #else
3367 #error
3368 #endif
3369 };
3370
3371 /**
3372 * See http://www.devmaster.net/forums/showthread.php?p=43580
3373 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3374 * http://www.nezumi.demon.co.uk/consult/logx.htm
3375 *
3376 * If handle_edge_cases is true the function will perform computations
3377 * to match the required D3D10+ behavior for each of the edge cases.
3378 * That means that if input is:
3379 * - less than zero (to and including -inf) then NaN will be returned
3380 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3381 * - +infinity, then +infinity will be returned
3382 * - NaN, then NaN will be returned
3383 *
3384 * Those checks are fairly expensive so if you don't need them make sure
3385 * handle_edge_cases is false.
3386 */
3387 void
3388 lp_build_log2_approx(struct lp_build_context *bld,
3389 LLVMValueRef x,
3390 LLVMValueRef *p_exp,
3391 LLVMValueRef *p_floor_log2,
3392 LLVMValueRef *p_log2,
3393 boolean handle_edge_cases)
3394 {
3395 LLVMBuilderRef builder = bld->gallivm->builder;
3396 const struct lp_type type = bld->type;
3397 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3398 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3399
3400 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3401 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3402 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3403
3404 LLVMValueRef i = NULL;
3405 LLVMValueRef y = NULL;
3406 LLVMValueRef z = NULL;
3407 LLVMValueRef exp = NULL;
3408 LLVMValueRef mant = NULL;
3409 LLVMValueRef logexp = NULL;
3410 LLVMValueRef p_z = NULL;
3411 LLVMValueRef res = NULL;
3412
3413 assert(lp_check_value(bld->type, x));
3414
3415 if(p_exp || p_floor_log2 || p_log2) {
3416 /* TODO: optimize the constant case */
3417 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3418 LLVMIsConstant(x)) {
3419 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3420 __FUNCTION__);
3421 }
3422
3423 assert(type.floating && type.width == 32);
3424
3425 /*
3426 * We don't explicitly handle denormalized numbers. They will yield a
3427 * result in the neighbourhood of -127, which appears to be adequate
3428 * in practice.
3429 */
3430
3431 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3432
3433 /* exp = (float) exponent(x) */
3434 exp = LLVMBuildAnd(builder, i, expmask, "");
3435 }
3436
3437 if(p_floor_log2 || p_log2) {
3438 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3439 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3440 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3441 }
3442
3443 if (p_log2) {
3444 /* mant = 1 + (float) mantissa(x) */
3445 mant = LLVMBuildAnd(builder, i, mantmask, "");
3446 mant = LLVMBuildOr(builder, mant, one, "");
3447 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3448
3449 /* y = (mant - 1) / (mant + 1) */
3450 y = lp_build_div(bld,
3451 lp_build_sub(bld, mant, bld->one),
3452 lp_build_add(bld, mant, bld->one)
3453 );
3454
3455 /* z = y^2 */
3456 z = lp_build_mul(bld, y, y);
3457
3458 /* compute P(z) */
3459 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3460 ARRAY_SIZE(lp_build_log2_polynomial));
3461
3462 /* y * P(z) + logexp */
3463 res = lp_build_mad(bld, y, p_z, logexp);
3464
3465 if (type.floating && handle_edge_cases) {
3466 LLVMValueRef negmask, infmask, zmask;
3467 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3468 lp_build_const_vec(bld->gallivm, type, 0.0f));
3469 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3470 lp_build_const_vec(bld->gallivm, type, 0.0f));
3471 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3472 lp_build_const_vec(bld->gallivm, type, INFINITY));
3473
3474 /* If x is equal to inf make sure we return inf */
3475 res = lp_build_select(bld, infmask,
3476 lp_build_const_vec(bld->gallivm, type, INFINITY),
3477 res);
3478 /* If x is equal to 0, return -inf */
3479 res = lp_build_select(bld, zmask,
3480 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3481 res);
3482 /* If x is nan or less than 0, return nan */
3483 res = lp_build_select(bld, negmask,
3484 lp_build_const_vec(bld->gallivm, type, NAN),
3485 res);
3486 }
3487 }
3488
3489 if (p_exp) {
3490 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3491 *p_exp = exp;
3492 }
3493
3494 if (p_floor_log2)
3495 *p_floor_log2 = logexp;
3496
3497 if (p_log2)
3498 *p_log2 = res;
3499 }
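
/*
 * The identity behind the p_log2 computation above (illustration only):
 * write x = m * 2^e with m in [1, 2). Then
 *
 *   log2(x) = e + log2(m) = e + (2/ln 2) * atanh(y),   y = (m - 1)/(m + 1)
 *
 * and (2/ln 2) * atanh(y)/y = (2/ln 2) * (1 + y^2/3 + y^4/5 + ...) is a smooth
 * function of z = y^2, which is what the minimax polynomial P(z) approximates
 * (note its leading coefficient is 2/ln 2 = 2.88539...). Hence
 * res = y*P(z) + logexp. Since m < 2 gives y < 1/3, z stays in [0, 1/9),
 * matching the fit range quoted with the coefficient table.
 */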
3500
3501
3502 /*
3503 * log2 implementation which doesn't have special code to
3504 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3505 * the results for those cases are undefined.
3506 */
3507 LLVMValueRef
3508 lp_build_log2(struct lp_build_context *bld,
3509 LLVMValueRef x)
3510 {
3511 LLVMValueRef res;
3512 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3513 return res;
3514 }
3515
3516 /*
3517 * Version of log2 which handles all edge cases.
3518 * Look at documentation of lp_build_log2_approx for
3519 * description of the behavior for each of the edge cases.
3520 */
3521 LLVMValueRef
3522 lp_build_log2_safe(struct lp_build_context *bld,
3523 LLVMValueRef x)
3524 {
3525 LLVMValueRef res;
3526 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3527 return res;
3528 }
3529
3530
3531 /**
3532 * Faster (and less accurate) log2.
3533 *
3534 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3535 *
3536 * Piece-wise linear approximation, with exact results when x is a
3537 * power of two.
3538 *
3539 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3540 */
3541 LLVMValueRef
3542 lp_build_fast_log2(struct lp_build_context *bld,
3543 LLVMValueRef x)
3544 {
3545 LLVMBuilderRef builder = bld->gallivm->builder;
3546 LLVMValueRef ipart;
3547 LLVMValueRef fpart;
3548
3549 assert(lp_check_value(bld->type, x));
3550
3551 assert(bld->type.floating);
3552
3553 /* ipart = floor(log2(x)) - 1 */
3554 ipart = lp_build_extract_exponent(bld, x, -1);
3555 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3556
3557 /* fpart = x / 2**ipart */
3558 fpart = lp_build_extract_mantissa(bld, x);
3559
3560 /* ipart + fpart */
3561 return LLVMBuildFAdd(builder, ipart, fpart, "");
3562 }
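
/*
 * Worked examples for the fast path above (illustration only): with
 * x = m * 2^e, m in [1, 2), the code computes (e - 1) + m, so
 *
 *   fast_log2(8.0) = 2 + 1.0 = 3.0    (exact: 8.0 is a power of two)
 *   fast_log2(6.0) = 1 + 1.5 = 2.5    (true log2(6.0) = 2.585)
 */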
3563
3564
3565 /**
3566 * Fast implementation of iround(log2(x)).
3567 *
3568 * Not an approximation -- it should give accurate results all the time.
3569 */
3570 LLVMValueRef
3571 lp_build_ilog2(struct lp_build_context *bld,
3572 LLVMValueRef x)
3573 {
3574 LLVMBuilderRef builder = bld->gallivm->builder;
3575 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3576 LLVMValueRef ipart;
3577
3578 assert(bld->type.floating);
3579
3580 assert(lp_check_value(bld->type, x));
3581
3582 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3583 x = LLVMBuildFMul(builder, x, sqrt2, "");
3584
3585 /* ipart = floor(log2(x) + 0.5) */
3586 ipart = lp_build_extract_exponent(bld, x, 0);
3587
3588 return ipart;
3589 }
3590
3591 LLVMValueRef
3592 lp_build_mod(struct lp_build_context *bld,
3593 LLVMValueRef x,
3594 LLVMValueRef y)
3595 {
3596 LLVMBuilderRef builder = bld->gallivm->builder;
3597 LLVMValueRef res;
3598 const struct lp_type type = bld->type;
3599
3600 assert(lp_check_value(type, x));
3601 assert(lp_check_value(type, y));
3602
3603 if (type.floating)
3604 res = LLVMBuildFRem(builder, x, y, "");
3605 else if (type.sign)
3606 res = LLVMBuildSRem(builder, x, y, "");
3607 else
3608 res = LLVMBuildURem(builder, x, y, "");
3609 return res;
3610 }
3611
3612
3613 /*
3614 * For floating inputs it creates and returns a mask
3615 * which is all 1's for channels which are NaN.
3616 * Channels inside x which are not NaN will be 0.
3617 */
3618 LLVMValueRef
3619 lp_build_isnan(struct lp_build_context *bld,
3620 LLVMValueRef x)
3621 {
3622 LLVMValueRef mask;
3623 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3624
3625 assert(bld->type.floating);
3626 assert(lp_check_value(bld->type, x));
3627
3628 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3629 "isnotnan");
3630 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3631 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3632 return mask;
3633 }
3634
3635 /* Returns all 1's for floating-point numbers that are
3636  * finite and all 0's for -inf, +inf and NaN.
3637  */
3638 LLVMValueRef
3639 lp_build_isfinite(struct lp_build_context *bld,
3640 LLVMValueRef x)
3641 {
3642 LLVMBuilderRef builder = bld->gallivm->builder;
3643 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3644 struct lp_type int_type = lp_int_type(bld->type);
3645 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3646 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3647 0x7f800000);
3648
3649 if (!bld->type.floating) {
3650 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3651 }
3652 assert(bld->type.floating);
3653 assert(lp_check_value(bld->type, x));
3654 assert(bld->type.width == 32);
3655
3656 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3657 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3658 intx, infornan32);
3659 }
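/*
 * The bit trick above: in IEEE-754 single precision the exponent field is
 * bits 30..23 (mask 0x7f800000), and it is all 1's exactly for +/-inf and
 * NaN.  So (x & 0x7f800000) != 0x7f800000 selects the finite values, e.g.
 * 1.0f = 0x3f800000 passes, while +inf = 0x7f800000 and a NaN such as
 * 0x7fc00000 do not.  lp_build_is_inf_or_nan() below uses the same mask
 * with PIPE_FUNC_EQUAL.
 */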
3660
3661 /*
3662  * Returns all 1's for channels that are NaN or +/-inf and all 0's otherwise.
3663  * The input must be a floating-point vector.
3664 */
3665 LLVMValueRef
3666 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3667 const struct lp_type type,
3668 LLVMValueRef x)
3669 {
3670 LLVMBuilderRef builder = gallivm->builder;
3671 struct lp_type int_type = lp_int_type(type);
3672 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3673 0x7f800000);
3674 LLVMValueRef ret;
3675
3676 assert(type.floating);
3677
3678 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3679 ret = LLVMBuildAnd(builder, ret, const0, "");
3680 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3681 ret, const0);
3682
3683 return ret;
3684 }
3685
3686
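/**
 * Emit code that saves the current SSE MXCSR register: allocates a 32-bit
 * stack slot, fills it with stmxcsr and returns a pointer to it.
 * Returns NULL when SSE is not available.
 */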
3687 LLVMValueRef
3688 lp_build_fpstate_get(struct gallivm_state *gallivm)
3689 {
3690 if (util_cpu_caps.has_sse) {
3691 LLVMBuilderRef builder = gallivm->builder;
3692 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3693 gallivm,
3694 LLVMInt32TypeInContext(gallivm->context),
3695 "mxcsr_ptr");
3696 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3697 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3698 lp_build_intrinsic(builder,
3699 "llvm.x86.sse.stmxcsr",
3700 LLVMVoidTypeInContext(gallivm->context),
3701 &mxcsr_ptr8, 1, 0);
3702 return mxcsr_ptr;
3703 }
3704 return 0;
3705 }
3706
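/**
 * Emit code that sets (zero == TRUE) or clears (zero == FALSE) the FTZ
 * flush-to-zero bit in MXCSR, plus the DAZ denormals-are-zero bit when the
 * CPU supports it, leaving the rest of the register untouched.
 */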
3707 void
3708 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3709 boolean zero)
3710 {
3711 if (util_cpu_caps.has_sse) {
3712       /* turn on FTZ (32768) and, if the CPU supports it, DAZ (64) */
3713 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3714
3715 LLVMBuilderRef builder = gallivm->builder;
3716 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3717 LLVMValueRef mxcsr =
3718 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3719
3720 if (util_cpu_caps.has_daz) {
3721          /* also enable denormals-are-zero (DAZ) mode */
3722 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3723 }
3724 if (zero) {
3725 mxcsr = LLVMBuildOr(builder, mxcsr,
3726 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3727 } else {
3728 mxcsr = LLVMBuildAnd(builder, mxcsr,
3729 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3730 }
3731
3732 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3733 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3734 }
3735 }
3736
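/**
 * Emit code that restores MXCSR from the 32-bit value pointed to by
 * mxcsr_ptr (as returned by lp_build_fpstate_get) using ldmxcsr.
 */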
3737 void
3738 lp_build_fpstate_set(struct gallivm_state *gallivm,
3739 LLVMValueRef mxcsr_ptr)
3740 {
3741 if (util_cpu_caps.has_sse) {
3742 LLVMBuilderRef builder = gallivm->builder;
3743 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3744 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3745 lp_build_intrinsic(builder,
3746 "llvm.x86.sse.ldmxcsr",
3747 LLVMVoidTypeInContext(gallivm->context),
3748 &mxcsr_ptr, 1, 0);
3749 }
3750 }
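/*
 * Typical usage of the three MXCSR helpers above (an illustrative sketch;
 * the real call sites live elsewhere in the tree):
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that must not see denormal inputs or results ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 */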