gallium: Require LLVM >= 3.9
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy, since we have all the necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - we often know value constraints which the optimization passes have no way
42 *   of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87 * No checks for the special-case values a or b = 1 or 0 are done.
88 * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_cpu_caps.has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_cpu_caps.has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_cpu_caps.has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_cpu_caps.has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
139 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 __FUNCTION__);
142 }
143 if (type.width == 32 && type.length == 4) {
144 intrinsic = "llvm.ppc.altivec.vminfp";
145 intr_size = 128;
146 }
147 } else if (util_cpu_caps.has_altivec) {
148 intr_size = 128;
149 if (type.width == 8) {
150 if (!type.sign) {
151 intrinsic = "llvm.ppc.altivec.vminub";
152 } else {
153 intrinsic = "llvm.ppc.altivec.vminsb";
154 }
155 } else if (type.width == 16) {
156 if (!type.sign) {
157 intrinsic = "llvm.ppc.altivec.vminuh";
158 } else {
159 intrinsic = "llvm.ppc.altivec.vminsh";
160 }
161 } else if (type.width == 32) {
162 if (!type.sign) {
163 intrinsic = "llvm.ppc.altivec.vminuw";
164 } else {
165 intrinsic = "llvm.ppc.altivec.vminsw";
166 }
167 }
168 }
169
170 if (intrinsic) {
171 /* We need to handle NaNs for floating point numbers. If one of the
172 * inputs is NaN the other should be returned (required by both D3D10+
173 * and OpenCL).
174 * The SSE intrinsics return the second operand in case of NaN by
175 * default, so we need special code to handle those.
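 * For example, min(NaN, x) gives x while min(x, NaN) gives NaN with
 * these intrinsics, hence the isnan/select fixups below depending on
 * the requested nan_behavior.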
176 */
177 if (util_cpu_caps.has_sse && type.floating &&
178 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
179 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
180 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
181 LLVMValueRef isnan, min;
182 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
183 type,
184 intr_size, a, b);
185 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
186 isnan = lp_build_isnan(bld, b);
187 return lp_build_select(bld, isnan, a, min);
188 } else {
189 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
190 isnan = lp_build_isnan(bld, a);
191 return lp_build_select(bld, isnan, a, min);
192 }
193 } else {
194 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
195 type,
196 intr_size, a, b);
197 }
198 }
199
200 if (type.floating) {
201 switch (nan_behavior) {
202 case GALLIVM_NAN_RETURN_NAN: {
203 LLVMValueRef isnan = lp_build_isnan(bld, b);
204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
205 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
206 return lp_build_select(bld, cond, a, b);
207 }
208 break;
209 case GALLIVM_NAN_RETURN_OTHER: {
210 LLVMValueRef isnan = lp_build_isnan(bld, a);
211 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
212 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
213 return lp_build_select(bld, cond, a, b);
214 }
215 break;
216 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
217 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
218 return lp_build_select(bld, cond, a, b);
219 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
220 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
221 return lp_build_select(bld, cond, b, a);
222 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
223 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
224 return lp_build_select(bld, cond, a, b);
225 break;
226 default:
227 assert(0);
228 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
229 return lp_build_select(bld, cond, a, b);
230 }
231 } else {
232 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
233 return lp_build_select(bld, cond, a, b);
234 }
235 }
236
237
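/**
 * Generate a*b + c using the llvm.fmuladd intrinsic: LLVM may emit a fused
 * multiply-add when the target supports one, and falls back to a separate
 * multiply and add otherwise.
 */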
238 LLVMValueRef
239 lp_build_fmuladd(LLVMBuilderRef builder,
240 LLVMValueRef a,
241 LLVMValueRef b,
242 LLVMValueRef c)
243 {
244 LLVMTypeRef type = LLVMTypeOf(a);
245 assert(type == LLVMTypeOf(b));
246 assert(type == LLVMTypeOf(c));
247
248 char intrinsic[32];
249 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
250 LLVMValueRef args[] = { a, b, c };
251 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
252 }
253
254
255 /**
256 * Generate max(a, b)
257 * No checks for the special-case values a or b = 1 or 0 are done.
258 * NaNs are handled according to the behavior specified by the
259 * nan_behavior argument.
260 */
261 static LLVMValueRef
262 lp_build_max_simple(struct lp_build_context *bld,
263 LLVMValueRef a,
264 LLVMValueRef b,
265 enum gallivm_nan_behavior nan_behavior)
266 {
267 const struct lp_type type = bld->type;
268 const char *intrinsic = NULL;
269 unsigned intr_size = 0;
270 LLVMValueRef cond;
271
272 assert(lp_check_value(type, a));
273 assert(lp_check_value(type, b));
274
275 /* TODO: optimize the constant case */
276
277 if (type.floating && util_cpu_caps.has_sse) {
278 if (type.width == 32) {
279 if (type.length == 1) {
280 intrinsic = "llvm.x86.sse.max.ss";
281 intr_size = 128;
282 }
283 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
284 intrinsic = "llvm.x86.sse.max.ps";
285 intr_size = 128;
286 }
287 else {
288 intrinsic = "llvm.x86.avx.max.ps.256";
289 intr_size = 256;
290 }
291 }
292 if (type.width == 64 && util_cpu_caps.has_sse2) {
293 if (type.length == 1) {
294 intrinsic = "llvm.x86.sse2.max.sd";
295 intr_size = 128;
296 }
297 else if (type.length == 2 || !util_cpu_caps.has_avx) {
298 intrinsic = "llvm.x86.sse2.max.pd";
299 intr_size = 128;
300 }
301 else {
302 intrinsic = "llvm.x86.avx.max.pd.256";
303 intr_size = 256;
304 }
305 }
306 }
307 else if (type.floating && util_cpu_caps.has_altivec) {
308 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
309 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
310 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
311 __FUNCTION__);
312 }
313 if (type.width == 32 && type.length == 4) {
314 intrinsic = "llvm.ppc.altivec.vmaxfp";
315 intr_size = 128;
316 }
317 } else if (util_cpu_caps.has_altivec) {
318 intr_size = 128;
319 if (type.width == 8) {
320 if (!type.sign) {
321 intrinsic = "llvm.ppc.altivec.vmaxub";
322 } else {
323 intrinsic = "llvm.ppc.altivec.vmaxsb";
324 }
325 } else if (type.width == 16) {
326 if (!type.sign) {
327 intrinsic = "llvm.ppc.altivec.vmaxuh";
328 } else {
329 intrinsic = "llvm.ppc.altivec.vmaxsh";
330 }
331 } else if (type.width == 32) {
332 if (!type.sign) {
333 intrinsic = "llvm.ppc.altivec.vmaxuw";
334 } else {
335 intrinsic = "llvm.ppc.altivec.vmaxsw";
336 }
337 }
338 }
339
340 if (intrinsic) {
341 if (util_cpu_caps.has_sse && type.floating &&
342 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
343 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
344 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
345 LLVMValueRef isnan, max;
346 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
347 type,
348 intr_size, a, b);
349 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
350 isnan = lp_build_isnan(bld, b);
351 return lp_build_select(bld, isnan, a, max);
352 } else {
353 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
354 isnan = lp_build_isnan(bld, a);
355 return lp_build_select(bld, isnan, a, max);
356 }
357 } else {
358 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
359 type,
360 intr_size, a, b);
361 }
362 }
363
364 if (type.floating) {
365 switch (nan_behavior) {
366 case GALLIVM_NAN_RETURN_NAN: {
367 LLVMValueRef isnan = lp_build_isnan(bld, b);
368 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
369 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
370 return lp_build_select(bld, cond, a, b);
371 }
372 break;
373 case GALLIVM_NAN_RETURN_OTHER: {
374 LLVMValueRef isnan = lp_build_isnan(bld, a);
375 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
376 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
377 return lp_build_select(bld, cond, a, b);
378 }
379 break;
380 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
381 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
382 return lp_build_select(bld, cond, a, b);
383 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
384 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
385 return lp_build_select(bld, cond, b, a);
386 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
387 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
388 return lp_build_select(bld, cond, a, b);
389 break;
390 default:
391 assert(0);
392 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
393 return lp_build_select(bld, cond, a, b);
394 }
395 } else {
396 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
397 return lp_build_select(bld, cond, a, b);
398 }
399 }
400
401
402 /**
403 * Generate 1 - a, or ~a depending on bld->type.
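 * E.g. for an 8-bit unsigned normalized type this is just a bitwise not,
 * since ~a == 255 - a there; for floating point it is a literal 1.0 - a.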
404 */
405 LLVMValueRef
406 lp_build_comp(struct lp_build_context *bld,
407 LLVMValueRef a)
408 {
409 LLVMBuilderRef builder = bld->gallivm->builder;
410 const struct lp_type type = bld->type;
411
412 assert(lp_check_value(type, a));
413
414 if(a == bld->one)
415 return bld->zero;
416 if(a == bld->zero)
417 return bld->one;
418
419 if(type.norm && !type.floating && !type.fixed && !type.sign) {
420 if(LLVMIsConstant(a))
421 return LLVMConstNot(a);
422 else
423 return LLVMBuildNot(builder, a, "");
424 }
425
426 if(LLVMIsConstant(a))
427 if (type.floating)
428 return LLVMConstFSub(bld->one, a);
429 else
430 return LLVMConstSub(bld->one, a);
431 else
432 if (type.floating)
433 return LLVMBuildFSub(builder, bld->one, a, "");
434 else
435 return LLVMBuildSub(builder, bld->one, a, "");
436 }
437
438
439 /**
440 * Generate a + b
441 */
442 LLVMValueRef
443 lp_build_add(struct lp_build_context *bld,
444 LLVMValueRef a,
445 LLVMValueRef b)
446 {
447 LLVMBuilderRef builder = bld->gallivm->builder;
448 const struct lp_type type = bld->type;
449 LLVMValueRef res;
450
451 assert(lp_check_value(type, a));
452 assert(lp_check_value(type, b));
453
454 if (a == bld->zero)
455 return b;
456 if (b == bld->zero)
457 return a;
458 if (a == bld->undef || b == bld->undef)
459 return bld->undef;
460
461 if (type.norm) {
462 const char *intrinsic = NULL;
463
464 if (!type.sign && (a == bld->one || b == bld->one))
465 return bld->one;
466
467 if (!type.floating && !type.fixed) {
468 if (LLVM_VERSION_MAJOR >= 9) {
469 char intrin[32];
470 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
471 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
472 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
473 }
474 if (type.width * type.length == 128) {
475 if (util_cpu_caps.has_sse2) {
476 if (type.width == 8)
477 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
478 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.b" : NULL;
479 if (type.width == 16)
480 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
481 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.w" : NULL;
482 } else if (util_cpu_caps.has_altivec) {
483 if (type.width == 8)
484 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
485 if (type.width == 16)
486 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
487 }
488 }
489 if (type.width * type.length == 256) {
490 if (util_cpu_caps.has_avx2) {
491 if (type.width == 8)
492 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
493 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.b" : NULL;
494 if (type.width == 16)
495 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
496 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.w" : NULL;
497 }
498 }
499 }
500
501 if (intrinsic)
502 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
503 }
504
505 if(type.norm && !type.floating && !type.fixed) {
506 if (type.sign) {
507 uint64_t sign = (uint64_t)1 << (type.width - 1);
508 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
509 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
510 /* a_clamp_max is the maximum a for positive b,
511 a_clamp_min is the minimum a for negative b. */
512 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
513 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
514 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
515 }
516 }
517
518 if(LLVMIsConstant(a) && LLVMIsConstant(b))
519 if (type.floating)
520 res = LLVMConstFAdd(a, b);
521 else
522 res = LLVMConstAdd(a, b);
523 else
524 if (type.floating)
525 res = LLVMBuildFAdd(builder, a, b, "");
526 else
527 res = LLVMBuildAdd(builder, a, b, "");
528
529 /* clamp to ceiling of 1.0 */
530 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
531 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
532
533 if (type.norm && !type.floating && !type.fixed) {
534 if (!type.sign) {
535 /*
536 * newer llvm versions no longer support the intrinsics, but recognize
537 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
538 * code, it is important we match the pattern llvm uses (and pray llvm
539 * doesn't change it - and hope they decide on the same pattern for
540 * all backends supporting it...).
541 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
542 * interfere with llvm's ability to recognize the pattern but seems
543 * a bit brittle.
544 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
545 */
546 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
547 res = lp_build_select(bld, overflowed,
548 LLVMConstAllOnes(bld->int_vec_type), res);
549 }
550 }
551
552 /* XXX clamp to floor of -1 or 0??? */
553
554 return res;
555 }
556
557
558 /** Return the scalar sum of the elements of a.
559 * Callers should avoid this operation whenever possible.
560 */
561 LLVMValueRef
562 lp_build_horizontal_add(struct lp_build_context *bld,
563 LLVMValueRef a)
564 {
565 LLVMBuilderRef builder = bld->gallivm->builder;
566 const struct lp_type type = bld->type;
567 LLVMValueRef index, res;
568 unsigned i, length;
569 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
570 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
571 LLVMValueRef vecres, elem2;
572
573 assert(lp_check_value(type, a));
574
575 if (type.length == 1) {
576 return a;
577 }
578
579 assert(!bld->type.norm);
580
581 /*
582 * For byte vectors we could do much better with psadbw.
583 * We use repeated shuffle/adds here. Note that with multiple vectors
584 * this can be done more efficiently as outlined in the intel
585 * optimization manual.
586 * Note: could cause data rearrangement if used with smaller element
587 * sizes.
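 * As a sketch for a 4-wide vector {a0,a1,a2,a3}: the first pass shuffles out
 * {a0,a1} and {a2,a3} and adds them to give {a0+a2,a1+a3}; the two remaining
 * elements are then extracted and added scalarly.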
588 */
589
590 vecres = a;
591 length = type.length / 2;
592 while (length > 1) {
593 LLVMValueRef vec1, vec2;
594 for (i = 0; i < length; i++) {
595 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
596 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
597 }
598 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
599 LLVMConstVector(shuffles1, length), "");
600 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
601 LLVMConstVector(shuffles2, length), "");
602 if (type.floating) {
603 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
604 }
605 else {
606 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
607 }
608 length = length >> 1;
609 }
610
611 /* always have vector of size 2 here */
612 assert(length == 1);
613
614 index = lp_build_const_int32(bld->gallivm, 0);
615 res = LLVMBuildExtractElement(builder, vecres, index, "");
616 index = lp_build_const_int32(bld->gallivm, 1);
617 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
618
619 if (type.floating)
620 res = LLVMBuildFAdd(builder, res, elem2, "");
621 else
622 res = LLVMBuildAdd(builder, res, elem2, "");
623
624 return res;
625 }
626
627 /**
628 * Return the horizontal sums of 4 float vectors as a float4 vector.
629 * This uses the technique as outlined in Intel Optimization Manual.
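 * I.e. given four 4-wide vectors src[0..3], the result is the 4-wide vector
 * { sum(src[0]), sum(src[1]), sum(src[2]), sum(src[3]) }, built from two
 * levels of shuffles and adds instead of four separate reductions.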
630 */
631 static LLVMValueRef
632 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
633 LLVMValueRef src[4])
634 {
635 struct gallivm_state *gallivm = bld->gallivm;
636 LLVMBuilderRef builder = gallivm->builder;
637 LLVMValueRef shuffles[4];
638 LLVMValueRef tmp[4];
639 LLVMValueRef sumtmp[2], shuftmp[2];
640
641 /* lower half of regs */
642 shuffles[0] = lp_build_const_int32(gallivm, 0);
643 shuffles[1] = lp_build_const_int32(gallivm, 1);
644 shuffles[2] = lp_build_const_int32(gallivm, 4);
645 shuffles[3] = lp_build_const_int32(gallivm, 5);
646 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
647 LLVMConstVector(shuffles, 4), "");
648 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
649 LLVMConstVector(shuffles, 4), "");
650
651 /* upper half of regs */
652 shuffles[0] = lp_build_const_int32(gallivm, 2);
653 shuffles[1] = lp_build_const_int32(gallivm, 3);
654 shuffles[2] = lp_build_const_int32(gallivm, 6);
655 shuffles[3] = lp_build_const_int32(gallivm, 7);
656 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
657 LLVMConstVector(shuffles, 4), "");
658 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
659 LLVMConstVector(shuffles, 4), "");
660
661 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
662 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
663
664 shuffles[0] = lp_build_const_int32(gallivm, 0);
665 shuffles[1] = lp_build_const_int32(gallivm, 2);
666 shuffles[2] = lp_build_const_int32(gallivm, 4);
667 shuffles[3] = lp_build_const_int32(gallivm, 6);
668 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
669 LLVMConstVector(shuffles, 4), "");
670
671 shuffles[0] = lp_build_const_int32(gallivm, 1);
672 shuffles[1] = lp_build_const_int32(gallivm, 3);
673 shuffles[2] = lp_build_const_int32(gallivm, 5);
674 shuffles[3] = lp_build_const_int32(gallivm, 7);
675 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
676 LLVMConstVector(shuffles, 4), "");
677
678 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
679 }
680
681
682 /*
683 * partially horizontally add 2-4 float vectors with length nx4,
684 * i.e. only four adjacent values in each vector will be added,
685 * assuming values are really grouped in 4 which also determines
686 * output order.
687 *
688 * Return a vector of the same length as the initial vectors,
689 * with the excess elements (if any) being undefined.
690 * The element order is independent of number of input vectors.
691 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
692 * the output order thus will be
693 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
694 */
695 LLVMValueRef
696 lp_build_hadd_partial4(struct lp_build_context *bld,
697 LLVMValueRef vectors[],
698 unsigned num_vecs)
699 {
700 struct gallivm_state *gallivm = bld->gallivm;
701 LLVMBuilderRef builder = gallivm->builder;
702 LLVMValueRef ret_vec;
703 LLVMValueRef tmp[4];
704 const char *intrinsic = NULL;
705
706 assert(num_vecs >= 2 && num_vecs <= 4);
707 assert(bld->type.floating);
708
709 /* only use this with at least 2 vectors, as it is sort of expensive
710 * (depending on cpu) and we always need two horizontal adds anyway,
711 * so a shuffle/add approach might be better.
712 */
713
714 tmp[0] = vectors[0];
715 tmp[1] = vectors[1];
716
717 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
718 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
719
720 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
721 bld->type.length == 4) {
722 intrinsic = "llvm.x86.sse3.hadd.ps";
723 }
724 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
725 bld->type.length == 8) {
726 intrinsic = "llvm.x86.avx.hadd.ps.256";
727 }
728 if (intrinsic) {
729 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
730 lp_build_vec_type(gallivm, bld->type),
731 tmp[0], tmp[1]);
732 if (num_vecs > 2) {
733 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
734 lp_build_vec_type(gallivm, bld->type),
735 tmp[2], tmp[3]);
736 }
737 else {
738 tmp[1] = tmp[0];
739 }
740 return lp_build_intrinsic_binary(builder, intrinsic,
741 lp_build_vec_type(gallivm, bld->type),
742 tmp[0], tmp[1]);
743 }
744
745 if (bld->type.length == 4) {
746 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
747 }
748 else {
749 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
750 unsigned j;
751 unsigned num_iter = bld->type.length / 4;
752 struct lp_type parttype = bld->type;
753 parttype.length = 4;
754 for (j = 0; j < num_iter; j++) {
755 LLVMValueRef partsrc[4];
756 unsigned i;
757 for (i = 0; i < 4; i++) {
758 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
759 }
760 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
761 }
762 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
763 }
764 return ret_vec;
765 }
766
767 /**
768 * Generate a - b
769 */
770 LLVMValueRef
771 lp_build_sub(struct lp_build_context *bld,
772 LLVMValueRef a,
773 LLVMValueRef b)
774 {
775 LLVMBuilderRef builder = bld->gallivm->builder;
776 const struct lp_type type = bld->type;
777 LLVMValueRef res;
778
779 assert(lp_check_value(type, a));
780 assert(lp_check_value(type, b));
781
782 if (b == bld->zero)
783 return a;
784 if (a == bld->undef || b == bld->undef)
785 return bld->undef;
786 if (a == b)
787 return bld->zero;
788
789 if (type.norm) {
790 const char *intrinsic = NULL;
791
792 if (!type.sign && b == bld->one)
793 return bld->zero;
794
795 if (!type.floating && !type.fixed) {
796 if (LLVM_VERSION_MAJOR >= 9) {
797 char intrin[32];
798 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
799 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
800 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
801 }
802 if (type.width * type.length == 128) {
803 if (util_cpu_caps.has_sse2) {
804 if (type.width == 8)
805 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
806 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.b" : NULL;
807 if (type.width == 16)
808 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
809 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.w" : NULL;
810 } else if (util_cpu_caps.has_altivec) {
811 if (type.width == 8)
812 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
813 if (type.width == 16)
814 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
815 }
816 }
817 if (type.width * type.length == 256) {
818 if (util_cpu_caps.has_avx2) {
819 if (type.width == 8)
820 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
821 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.b" : NULL;
822 if (type.width == 16)
823 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
824 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.w" : NULL;
825 }
826 }
827 }
828
829 if (intrinsic)
830 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
831 }
832
833 if(type.norm && !type.floating && !type.fixed) {
834 if (type.sign) {
835 uint64_t sign = (uint64_t)1 << (type.width - 1);
836 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
837 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
838 /* a_clamp_max is the maximum a for negative b,
839 a_clamp_min is the minimum a for positive b. */
840 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
841 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
842 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
843 } else {
844 /*
845 * This must match llvm pattern for saturated unsigned sub.
846 * (lp_build_max_simple actually does the job with its current
847 * definition but do it explicitly here.)
848 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
849 * interfere with llvm's ability to recognize the pattern but seems
850 * a bit brittle.
851 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
852 */
853 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
854 a = lp_build_select(bld, no_ov, a, b);
855 }
856 }
857
858 if(LLVMIsConstant(a) && LLVMIsConstant(b))
859 if (type.floating)
860 res = LLVMConstFSub(a, b);
861 else
862 res = LLVMConstSub(a, b);
863 else
864 if (type.floating)
865 res = LLVMBuildFSub(builder, a, b, "");
866 else
867 res = LLVMBuildSub(builder, a, b, "");
868
869 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
870 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
871
872 return res;
873 }
874
875
876
877 /**
878 * Normalized multiplication.
879 *
880 * There are several approaches for (using 8-bit normalized multiplication as
881 * an example):
882 *
883 * - alpha plus one
884 *
885 * makes the following approximation to the division (Sree)
886 *
887 * a*b/255 ~= (a*(b + 1)) >> 8
888 *
889 * which is the fastest method that satisfies the following OpenGL criteria of
890 *
891 * 0*0 = 0 and 255*255 = 255
892 *
893 * - geometric series
894 *
895 * takes the geometric series approximation to the division
896 *
897 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
898 *
899 * in this case just the first two terms to fit in 16bit arithmetic
900 *
901 * t/255 ~= (t + (t >> 8)) >> 8
902 *
903 * note that just by itself it doesn't satisfy the OpenGL criteria, as it gives
904 * 255*255 = 254, so the special case b = 255 must be accounted for or rounding
905 * must be used.
906 *
907 * - geometric series plus rounding
908 *
909 * when using a geometric series division instead of truncating the result
910 * use roundoff in the approximation (Jim Blinn)
911 *
912 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
913 *
914 * achieving exact results.
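 *
 * For example, with a = b = 255: t = 65025 and
 * (65025 + (65025 >> 8) + 0x80) >> 8 = (65025 + 254 + 128) >> 8 = 255,
 * while a = b = 128 gives (16384 + 64 + 128) >> 8 = 64, matching
 * round(128*128/255).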
915 *
916 *
917 *
918 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
919 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
920 * @sa Michael Herf, The "double blend trick", May 2000,
921 * http://www.stereopsis.com/doubleblend.html
922 */
923 LLVMValueRef
924 lp_build_mul_norm(struct gallivm_state *gallivm,
925 struct lp_type wide_type,
926 LLVMValueRef a, LLVMValueRef b)
927 {
928 LLVMBuilderRef builder = gallivm->builder;
929 struct lp_build_context bld;
930 unsigned n;
931 LLVMValueRef half;
932 LLVMValueRef ab;
933
934 assert(!wide_type.floating);
935 assert(lp_check_value(wide_type, a));
936 assert(lp_check_value(wide_type, b));
937
938 lp_build_context_init(&bld, gallivm, wide_type);
939
940 n = wide_type.width / 2;
941 if (wide_type.sign) {
942 --n;
943 }
944
945 /*
946 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
947 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
948 */
949
950 /*
951 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
952 */
953
954 ab = LLVMBuildMul(builder, a, b, "");
955 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
956
957 /*
958 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
959 */
960
961 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
962 if (wide_type.sign) {
963 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
964 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
965 half = lp_build_select(&bld, sign, minus_half, half);
966 }
967 ab = LLVMBuildAdd(builder, ab, half, "");
968
969 /* Final division */
970 ab = lp_build_shr_imm(&bld, ab, n);
971
972 return ab;
973 }
974
975 /**
976 * Generate a * b
977 */
978 LLVMValueRef
979 lp_build_mul(struct lp_build_context *bld,
980 LLVMValueRef a,
981 LLVMValueRef b)
982 {
983 LLVMBuilderRef builder = bld->gallivm->builder;
984 const struct lp_type type = bld->type;
985 LLVMValueRef shift;
986 LLVMValueRef res;
987
988 assert(lp_check_value(type, a));
989 assert(lp_check_value(type, b));
990
991 if(a == bld->zero)
992 return bld->zero;
993 if(a == bld->one)
994 return b;
995 if(b == bld->zero)
996 return bld->zero;
997 if(b == bld->one)
998 return a;
999 if(a == bld->undef || b == bld->undef)
1000 return bld->undef;
1001
1002 if (!type.floating && !type.fixed && type.norm) {
1003 struct lp_type wide_type = lp_wider_type(type);
1004 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1005
1006 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1007 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1008
1009 /* PMULLW, PSRLW, PADDW */
1010 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1011 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1012
1013 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1014
1015 return ab;
1016 }
1017
1018 if(type.fixed)
1019 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1020 else
1021 shift = NULL;
1022
1023 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1024 if (type.floating)
1025 res = LLVMConstFMul(a, b);
1026 else
1027 res = LLVMConstMul(a, b);
1028 if(shift) {
1029 if(type.sign)
1030 res = LLVMConstAShr(res, shift);
1031 else
1032 res = LLVMConstLShr(res, shift);
1033 }
1034 }
1035 else {
1036 if (type.floating)
1037 res = LLVMBuildFMul(builder, a, b, "");
1038 else
1039 res = LLVMBuildMul(builder, a, b, "");
1040 if(shift) {
1041 if(type.sign)
1042 res = LLVMBuildAShr(builder, res, shift, "");
1043 else
1044 res = LLVMBuildLShr(builder, res, shift, "");
1045 }
1046 }
1047
1048 return res;
1049 }
1050
1051 /*
1052 * Widening mul, valid for 32x32 bit -> 64bit only.
1053 * Result is low 32bits, high bits returned in res_hi.
1054 *
1055 * Emits code that is meant to be compiled for the host CPU.
1056 */
1057 LLVMValueRef
1058 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1059 LLVMValueRef a,
1060 LLVMValueRef b,
1061 LLVMValueRef *res_hi)
1062 {
1063 struct gallivm_state *gallivm = bld->gallivm;
1064 LLVMBuilderRef builder = gallivm->builder;
1065
1066 assert(bld->type.width == 32);
1067 assert(bld->type.floating == 0);
1068 assert(bld->type.fixed == 0);
1069 assert(bld->type.norm == 0);
1070
1071 /*
1072 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1073 * for x86 simd is atrocious (even if the high bits weren't required),
1074 * trying to handle real 64bit inputs (which of course can't happen due
1075 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1076 * apparently llvm does not recognize this widening mul). This includes 6
1077 * (instead of 2) pmuludq plus extra adds and shifts
1078 * The same story applies to signed mul, albeit fixing this requires sse41.
1079 * https://llvm.org/bugs/show_bug.cgi?id=30845
1080 * So, whip up our own code, albeit only for length 4 and 8 (which
1081 * should be good enough)...
1082 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1083 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1084 * for signed), which the fallback code does not; without this, llvm
1085 * will likely still produce atrocious code.
1086 */
1087 if (LLVM_VERSION_MAJOR < 7 &&
1088 (bld->type.length == 4 || bld->type.length == 8) &&
1089 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1090 util_cpu_caps.has_sse4_1)) {
1091 const char *intrinsic = NULL;
1092 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1093 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1094 struct lp_type type_wide = lp_wider_type(bld->type);
1095 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1096 unsigned i;
1097 for (i = 0; i < bld->type.length; i += 2) {
1098 shuf[i] = lp_build_const_int32(gallivm, i+1);
1099 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1100 }
1101 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1102 aeven = a;
1103 beven = b;
1104 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1105 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1106
1107 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1108 if (bld->type.sign) {
1109 intrinsic = "llvm.x86.avx2.pmul.dq";
1110 } else {
1111 intrinsic = "llvm.x86.avx2.pmulu.dq";
1112 }
1113 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1114 wider_type, aeven, beven);
1115 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1116 wider_type, aodd, bodd);
1117 }
1118 else {
1119 /* for consistent naming look elsewhere... */
1120 if (bld->type.sign) {
1121 intrinsic = "llvm.x86.sse41.pmuldq";
1122 } else {
1123 intrinsic = "llvm.x86.sse2.pmulu.dq";
1124 }
1125 /*
1126 * XXX If we only have AVX but not AVX2 this is a pain.
1127 * lp_build_intrinsic_binary_anylength() can't handle it
1128 * (due to src and dst type not being identical).
1129 */
1130 if (bld->type.length == 8) {
1131 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1132 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1133 LLVMValueRef muleven2[2], mulodd2[2];
1134 struct lp_type type_wide_half = type_wide;
1135 LLVMTypeRef wtype_half;
1136 type_wide_half.length = 2;
1137 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1138 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1139 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1140 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1141 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1142 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1143 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1144 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1145 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1146 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1147 wtype_half, aevenlo, bevenlo);
1148 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1149 wtype_half, aoddlo, boddlo);
1150 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1151 wtype_half, aevenhi, bevenhi);
1152 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1153 wtype_half, aoddhi, boddhi);
1154 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1155 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1156
1157 }
1158 else {
1159 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1160 wider_type, aeven, beven);
1161 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1162 wider_type, aodd, bodd);
1163 }
1164 }
1165 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1166 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1167
1168 for (i = 0; i < bld->type.length; i += 2) {
1169 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1170 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1171 }
1172 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1173 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1174
1175 for (i = 0; i < bld->type.length; i += 2) {
1176 shuf[i] = lp_build_const_int32(gallivm, i);
1177 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1178 }
1179 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1180 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1181 }
1182 else {
1183 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1184 }
1185 }
1186
1187
1188 /*
1189 * Widening mul, valid for 32x32 bit -> 64bit only.
1190 * Result is low 32bits, high bits returned in res_hi.
1191 *
1192 * Emits generic code.
1193 */
1194 LLVMValueRef
1195 lp_build_mul_32_lohi(struct lp_build_context *bld,
1196 LLVMValueRef a,
1197 LLVMValueRef b,
1198 LLVMValueRef *res_hi)
1199 {
1200 struct gallivm_state *gallivm = bld->gallivm;
1201 LLVMBuilderRef builder = gallivm->builder;
1202 LLVMValueRef tmp, shift, res_lo;
1203 struct lp_type type_tmp;
1204 LLVMTypeRef wide_type, narrow_type;
1205
1206 type_tmp = bld->type;
1207 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1208 type_tmp.width *= 2;
1209 wide_type = lp_build_vec_type(gallivm, type_tmp);
1210 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1211
1212 if (bld->type.sign) {
1213 a = LLVMBuildSExt(builder, a, wide_type, "");
1214 b = LLVMBuildSExt(builder, b, wide_type, "");
1215 } else {
1216 a = LLVMBuildZExt(builder, a, wide_type, "");
1217 b = LLVMBuildZExt(builder, b, wide_type, "");
1218 }
1219 tmp = LLVMBuildMul(builder, a, b, "");
1220
1221 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1222
1223 /* Since we truncate anyway, LShr and AShr are equivalent. */
1224 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1225 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1226
1227 return res_lo;
1228 }
1229
1230
1231 /* a * b + c */
1232 LLVMValueRef
1233 lp_build_mad(struct lp_build_context *bld,
1234 LLVMValueRef a,
1235 LLVMValueRef b,
1236 LLVMValueRef c)
1237 {
1238 const struct lp_type type = bld->type;
1239 if (type.floating) {
1240 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1241 } else {
1242 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1243 }
1244 }
1245
1246
1247 /**
1248 * Small vector x scale multiplication optimization.
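 * E.g. an integer multiply by 4 is emitted as a left shift by 2, and a
 * multiply by -1 as a negate; other factors fall back to a regular multiply.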
1249 */
1250 LLVMValueRef
1251 lp_build_mul_imm(struct lp_build_context *bld,
1252 LLVMValueRef a,
1253 int b)
1254 {
1255 LLVMBuilderRef builder = bld->gallivm->builder;
1256 LLVMValueRef factor;
1257
1258 assert(lp_check_value(bld->type, a));
1259
1260 if(b == 0)
1261 return bld->zero;
1262
1263 if(b == 1)
1264 return a;
1265
1266 if(b == -1)
1267 return lp_build_negate(bld, a);
1268
1269 if(b == 2 && bld->type.floating)
1270 return lp_build_add(bld, a, a);
1271
1272 if(util_is_power_of_two_or_zero(b)) {
1273 unsigned shift = ffs(b) - 1;
1274
1275 if(bld->type.floating) {
1276 #if 0
1277 /*
1278 * Power of two multiplication by directly manipulating the exponent.
1279 *
1280 * XXX: This might not be always faster, it will introduce a small error
1281 * for multiplication by zero, and it will produce wrong results
1282 * for Inf and NaN.
1283 */
1284 unsigned mantissa = lp_mantissa(bld->type);
1285 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1286 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1287 a = LLVMBuildAdd(builder, a, factor, "");
1288 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1289 return a;
1290 #endif
1291 }
1292 else {
1293 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1294 return LLVMBuildShl(builder, a, factor, "");
1295 }
1296 }
1297
1298 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1299 return lp_build_mul(bld, a, factor);
1300 }
1301
1302
1303 /**
1304 * Generate a / b
1305 */
1306 LLVMValueRef
1307 lp_build_div(struct lp_build_context *bld,
1308 LLVMValueRef a,
1309 LLVMValueRef b)
1310 {
1311 LLVMBuilderRef builder = bld->gallivm->builder;
1312 const struct lp_type type = bld->type;
1313
1314 assert(lp_check_value(type, a));
1315 assert(lp_check_value(type, b));
1316
1317 if(a == bld->zero)
1318 return bld->zero;
1319 if(a == bld->one && type.floating)
1320 return lp_build_rcp(bld, b);
1321 if(b == bld->zero)
1322 return bld->undef;
1323 if(b == bld->one)
1324 return a;
1325 if(a == bld->undef || b == bld->undef)
1326 return bld->undef;
1327
1328 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1329 if (type.floating)
1330 return LLVMConstFDiv(a, b);
1331 else if (type.sign)
1332 return LLVMConstSDiv(a, b);
1333 else
1334 return LLVMConstUDiv(a, b);
1335 }
1336
1337 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1338 if(FALSE &&
1339 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1340 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1341 type.floating)
1342 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1343
1344 if (type.floating)
1345 return LLVMBuildFDiv(builder, a, b, "");
1346 else if (type.sign)
1347 return LLVMBuildSDiv(builder, a, b, "");
1348 else
1349 return LLVMBuildUDiv(builder, a, b, "");
1350 }
1351
1352
1353 /**
1354 * Linear interpolation helper.
1355 *
1356 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED indicates we are
1357 * interpolating normalized values, encoded in normalized integers, twice as wide.
1358 *
1359 * @sa http://www.stereopsis.com/doubleblend.html
1360 */
1361 static inline LLVMValueRef
1362 lp_build_lerp_simple(struct lp_build_context *bld,
1363 LLVMValueRef x,
1364 LLVMValueRef v0,
1365 LLVMValueRef v1,
1366 unsigned flags)
1367 {
1368 unsigned half_width = bld->type.width/2;
1369 LLVMBuilderRef builder = bld->gallivm->builder;
1370 LLVMValueRef delta;
1371 LLVMValueRef res;
1372
1373 assert(lp_check_value(bld->type, x));
1374 assert(lp_check_value(bld->type, v0));
1375 assert(lp_check_value(bld->type, v1));
1376
1377 delta = lp_build_sub(bld, v1, v0);
1378
1379 if (bld->type.floating) {
1380 assert(flags == 0);
1381 return lp_build_mad(bld, x, delta, v0);
1382 }
1383
1384 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1385 if (!bld->type.sign) {
1386 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1387 /*
1388 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1389 * most-significant-bit to the lowest-significant-bit, so that
1390 * later we can just divide by 2**n instead of 2**n - 1.
1391 */
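 *
 * For example with n = 8: x = 255 becomes 255 + (255 >> 7) = 256 and
 * x = 0 stays 0, so the subsequent shift by n divides by a full 256.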
1392
1393 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1394 }
1395
1396 /* (x * delta) >> n */
1397 res = lp_build_mul(bld, x, delta);
1398 res = lp_build_shr_imm(bld, res, half_width);
1399 } else {
1400 /*
1401 * The rescaling trick above doesn't work for signed numbers, so
1402 * use the 2**n - 1 division approximation in lp_build_mul_norm
1403 * instead.
1404 */
1405 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1406 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1407 }
1408 } else {
1409 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1410 res = lp_build_mul(bld, x, delta);
1411 }
1412
1413 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1414 /*
1415 * At this point both res and v0 only use the lower half of the bits,
1416 * the rest is zero. Instead of add / mask, do add with half wide type.
1417 */
1418 struct lp_type narrow_type;
1419 struct lp_build_context narrow_bld;
1420
1421 memset(&narrow_type, 0, sizeof narrow_type);
1422 narrow_type.sign = bld->type.sign;
1423 narrow_type.width = bld->type.width/2;
1424 narrow_type.length = bld->type.length*2;
1425
1426 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1427 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1428 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1429 res = lp_build_add(&narrow_bld, v0, res);
1430 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1431 } else {
1432 res = lp_build_add(bld, v0, res);
1433
1434 if (bld->type.fixed) {
1435 /*
1436 * We need to mask out the high order bits when lerping 8-bit
1437 * normalized colors stored in 16 bits
1438 */
1439 /* XXX: This step is necessary for lerping 8-bit colors stored in
1440 * 16 bits, but it will be wrong for true fixed point use cases.
1441 * Basically we need a more powerful lp_type, capable of further
1442 * distinguishing the values interpretation from the value storage.
1443 */
1444 LLVMValueRef low_bits;
1445 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1446 res = LLVMBuildAnd(builder, res, low_bits, "");
1447 }
1448 }
1449
1450 return res;
1451 }
1452
1453
1454 /**
1455 * Linear interpolation.
1456 */
1457 LLVMValueRef
1458 lp_build_lerp(struct lp_build_context *bld,
1459 LLVMValueRef x,
1460 LLVMValueRef v0,
1461 LLVMValueRef v1,
1462 unsigned flags)
1463 {
1464 const struct lp_type type = bld->type;
1465 LLVMValueRef res;
1466
1467 assert(lp_check_value(type, x));
1468 assert(lp_check_value(type, v0));
1469 assert(lp_check_value(type, v1));
1470
1471 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1472
1473 if (type.norm) {
1474 struct lp_type wide_type;
1475 struct lp_build_context wide_bld;
1476 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1477
1478 assert(type.length >= 2);
1479
1480 /*
1481 * Create a wider integer type, enough to hold the
1482 * intermediate result of the multiplication.
1483 */
1484 memset(&wide_type, 0, sizeof wide_type);
1485 wide_type.sign = type.sign;
1486 wide_type.width = type.width*2;
1487 wide_type.length = type.length/2;
1488
1489 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1490
1491 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1492 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1493 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1494
1495 /*
1496 * Lerp both halves.
1497 */
1498
1499 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1500
1501 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1502 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1503
1504 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1505 } else {
1506 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1507 }
1508
1509 return res;
1510 }
1511
1512
1513 /**
1514 * Bilinear interpolation.
1515 *
1516 * Value indices are in v_{yx}.
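 * E.g. v01 is the value at x = 1, y = 0 (first index is y, second is x).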
1517 */
1518 LLVMValueRef
1519 lp_build_lerp_2d(struct lp_build_context *bld,
1520 LLVMValueRef x,
1521 LLVMValueRef y,
1522 LLVMValueRef v00,
1523 LLVMValueRef v01,
1524 LLVMValueRef v10,
1525 LLVMValueRef v11,
1526 unsigned flags)
1527 {
1528 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1529 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1530 return lp_build_lerp(bld, y, v0, v1, flags);
1531 }
1532
1533
1534 LLVMValueRef
1535 lp_build_lerp_3d(struct lp_build_context *bld,
1536 LLVMValueRef x,
1537 LLVMValueRef y,
1538 LLVMValueRef z,
1539 LLVMValueRef v000,
1540 LLVMValueRef v001,
1541 LLVMValueRef v010,
1542 LLVMValueRef v011,
1543 LLVMValueRef v100,
1544 LLVMValueRef v101,
1545 LLVMValueRef v110,
1546 LLVMValueRef v111,
1547 unsigned flags)
1548 {
1549 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1550 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1551 return lp_build_lerp(bld, z, v0, v1, flags);
1552 }
1553
1554
1555 /**
1556 * Generate min(a, b)
1557 * Do checks for special cases but not for NaNs.
1558 */
1559 LLVMValueRef
1560 lp_build_min(struct lp_build_context *bld,
1561 LLVMValueRef a,
1562 LLVMValueRef b)
1563 {
1564 assert(lp_check_value(bld->type, a));
1565 assert(lp_check_value(bld->type, b));
1566
1567 if(a == bld->undef || b == bld->undef)
1568 return bld->undef;
1569
1570 if(a == b)
1571 return a;
1572
1573 if (bld->type.norm) {
1574 if (!bld->type.sign) {
1575 if (a == bld->zero || b == bld->zero) {
1576 return bld->zero;
1577 }
1578 }
1579 if(a == bld->one)
1580 return b;
1581 if(b == bld->one)
1582 return a;
1583 }
1584
1585 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1586 }
1587
1588
1589 /**
1590 * Generate min(a, b)
1591 * NaNs are handled according to the behavior specified by the
1592 * nan_behavior argument.
1593 */
1594 LLVMValueRef
1595 lp_build_min_ext(struct lp_build_context *bld,
1596 LLVMValueRef a,
1597 LLVMValueRef b,
1598 enum gallivm_nan_behavior nan_behavior)
1599 {
1600 assert(lp_check_value(bld->type, a));
1601 assert(lp_check_value(bld->type, b));
1602
1603 if(a == bld->undef || b == bld->undef)
1604 return bld->undef;
1605
1606 if(a == b)
1607 return a;
1608
1609 if (bld->type.norm) {
1610 if (!bld->type.sign) {
1611 if (a == bld->zero || b == bld->zero) {
1612 return bld->zero;
1613 }
1614 }
1615 if(a == bld->one)
1616 return b;
1617 if(b == bld->one)
1618 return a;
1619 }
1620
1621 return lp_build_min_simple(bld, a, b, nan_behavior);
1622 }
1623
1624 /**
1625 * Generate max(a, b)
1626 * Do checks for special cases, but NaN behavior is undefined.
1627 */
1628 LLVMValueRef
1629 lp_build_max(struct lp_build_context *bld,
1630 LLVMValueRef a,
1631 LLVMValueRef b)
1632 {
1633 assert(lp_check_value(bld->type, a));
1634 assert(lp_check_value(bld->type, b));
1635
1636 if(a == bld->undef || b == bld->undef)
1637 return bld->undef;
1638
1639 if(a == b)
1640 return a;
1641
1642 if(bld->type.norm) {
1643 if(a == bld->one || b == bld->one)
1644 return bld->one;
1645 if (!bld->type.sign) {
1646 if (a == bld->zero) {
1647 return b;
1648 }
1649 if (b == bld->zero) {
1650 return a;
1651 }
1652 }
1653 }
1654
1655 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1656 }
1657
1658
1659 /**
1660 * Generate max(a, b)
1661 * Checks for special cases.
1662 * NaNs are handled according to the behavior specified by the
1663 * nan_behavior argument.
1664 */
1665 LLVMValueRef
1666 lp_build_max_ext(struct lp_build_context *bld,
1667 LLVMValueRef a,
1668 LLVMValueRef b,
1669 enum gallivm_nan_behavior nan_behavior)
1670 {
1671 assert(lp_check_value(bld->type, a));
1672 assert(lp_check_value(bld->type, b));
1673
1674 if(a == bld->undef || b == bld->undef)
1675 return bld->undef;
1676
1677 if(a == b)
1678 return a;
1679
1680 if(bld->type.norm) {
1681 if(a == bld->one || b == bld->one)
1682 return bld->one;
1683 if (!bld->type.sign) {
1684 if (a == bld->zero) {
1685 return b;
1686 }
1687 if (b == bld->zero) {
1688 return a;
1689 }
1690 }
1691 }
1692
1693 return lp_build_max_simple(bld, a, b, nan_behavior);
1694 }
1695
1696 /**
1697 * Generate clamp(a, min, max)
1698 * NaN behavior (for any of a, min, max) is undefined.
1699 * Do checks for special cases.
1700 */
1701 LLVMValueRef
1702 lp_build_clamp(struct lp_build_context *bld,
1703 LLVMValueRef a,
1704 LLVMValueRef min,
1705 LLVMValueRef max)
1706 {
1707 assert(lp_check_value(bld->type, a));
1708 assert(lp_check_value(bld->type, min));
1709 assert(lp_check_value(bld->type, max));
1710
1711 a = lp_build_min(bld, a, max);
1712 a = lp_build_max(bld, a, min);
1713 return a;
1714 }
1715
1716
1717 /**
1718 * Generate clamp(a, 0, 1)
1719 * A NaN will get converted to zero.
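 * (The max with GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN below picks the
 * second, non-NaN operand, i.e. zero, when a is NaN.)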
1720 */
1721 LLVMValueRef
1722 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1723 LLVMValueRef a)
1724 {
1725 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1726 a = lp_build_min(bld, a, bld->one);
1727 return a;
1728 }
1729
1730
1731 /**
1732 * Generate abs(a)
1733 */
1734 LLVMValueRef
1735 lp_build_abs(struct lp_build_context *bld,
1736 LLVMValueRef a)
1737 {
1738 LLVMBuilderRef builder = bld->gallivm->builder;
1739 const struct lp_type type = bld->type;
1740 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1741
1742 assert(lp_check_value(type, a));
1743
1744 if(!type.sign)
1745 return a;
1746
1747 if(type.floating) {
1748 char intrinsic[32];
1749 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1750 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1751 }
1752
1753 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1754 switch(type.width) {
1755 case 8:
1756 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1757 case 16:
1758 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1759 case 32:
1760 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1761 }
1762 }
1763 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
1764 switch(type.width) {
1765 case 8:
1766 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1767 case 16:
1768 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1769 case 32:
1770 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1771 }
1772 }
1773
1774 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1775 a, LLVMBuildNeg(builder, a, ""));
1776 }
1777
1778
1779 LLVMValueRef
1780 lp_build_negate(struct lp_build_context *bld,
1781 LLVMValueRef a)
1782 {
1783 LLVMBuilderRef builder = bld->gallivm->builder;
1784
1785 assert(lp_check_value(bld->type, a));
1786
1787 if (bld->type.floating)
1788 a = LLVMBuildFNeg(builder, a, "");
1789 else
1790 a = LLVMBuildNeg(builder, a, "");
1791
1792 return a;
1793 }
1794
1795
1796 /** Return -1, 0 or +1 depending on the sign of a */
1797 LLVMValueRef
1798 lp_build_sgn(struct lp_build_context *bld,
1799 LLVMValueRef a)
1800 {
1801 LLVMBuilderRef builder = bld->gallivm->builder;
1802 const struct lp_type type = bld->type;
1803 LLVMValueRef cond;
1804 LLVMValueRef res;
1805
1806 assert(lp_check_value(type, a));
1807
1808 /* Handle non-zero case */
1809 if(!type.sign) {
1810 /* if not zero then sign must be positive */
1811 res = bld->one;
1812 }
1813 else if(type.floating) {
1814 LLVMTypeRef vec_type;
1815 LLVMTypeRef int_type;
1816 LLVMValueRef mask;
1817 LLVMValueRef sign;
1818 LLVMValueRef one;
1819 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1820
1821 int_type = lp_build_int_vec_type(bld->gallivm, type);
1822 vec_type = lp_build_vec_type(bld->gallivm, type);
1823 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1824
1825 /* Take the sign bit and OR it into the 1.0 constant */
1826 sign = LLVMBuildBitCast(builder, a, int_type, "");
1827 sign = LLVMBuildAnd(builder, sign, mask, "");
1828 one = LLVMConstBitCast(bld->one, int_type);
1829 res = LLVMBuildOr(builder, sign, one, "");
1830 res = LLVMBuildBitCast(builder, res, vec_type, "");
1831 }
1832 else
1833 {
1834 /* signed int/norm/fixed point */
1835 /* could use psign with sse3 and appropriate vectors here */
1836 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1837 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1838 res = lp_build_select(bld, cond, bld->one, minus_one);
1839 }
1840
1841 /* Handle zero */
1842 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1843 res = lp_build_select(bld, cond, bld->zero, res);
1844
1845 return res;
1846 }
1847
1848
1849 /**
1850 * Set the sign of float vector 'a' according to 'sign'.
1851 * If sign==0, return abs(a).
1852 * If sign==1, return -abs(a).
1853 * Other values for sign produce undefined results.
1854 */
1855 LLVMValueRef
1856 lp_build_set_sign(struct lp_build_context *bld,
1857 LLVMValueRef a, LLVMValueRef sign)
1858 {
1859 LLVMBuilderRef builder = bld->gallivm->builder;
1860 const struct lp_type type = bld->type;
1861 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1862 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1863 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1864 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1865 ~((unsigned long long) 1 << (type.width - 1)));
1866 LLVMValueRef val, res;
1867
1868 assert(type.floating);
1869 assert(lp_check_value(type, a));
1870
1871 /* val = reinterpret_cast<int>(a) */
1872 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1873 /* val = val & mask */
1874 val = LLVMBuildAnd(builder, val, mask, "");
1875 /* sign = sign << shift */
1876 sign = LLVMBuildShl(builder, sign, shift, "");
1877 /* res = val | sign */
1878 res = LLVMBuildOr(builder, val, sign, "");
1879 /* res = reinterpret_cast<float>(res) */
1880 res = LLVMBuildBitCast(builder, res, vec_type, "");
1881
1882 return res;
1883 }
1884
1885
1886 /**
1887 * Convert vector of (or scalar) int to vector of (or scalar) float.
1888 */
1889 LLVMValueRef
1890 lp_build_int_to_float(struct lp_build_context *bld,
1891 LLVMValueRef a)
1892 {
1893 LLVMBuilderRef builder = bld->gallivm->builder;
1894 const struct lp_type type = bld->type;
1895 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1896
1897 assert(type.floating);
1898
1899 return LLVMBuildSIToFP(builder, a, vec_type, "");
1900 }
1901
1902 static boolean
1903 arch_rounding_available(const struct lp_type type)
1904 {
1905 if ((util_cpu_caps.has_sse4_1 &&
1906 (type.length == 1 || type.width*type.length == 128)) ||
1907 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1908 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1909 return TRUE;
1910 else if ((util_cpu_caps.has_altivec &&
1911 (type.width == 32 && type.length == 4)))
1912 return TRUE;
1913 else if (util_cpu_caps.has_neon)
1914 return TRUE;
1915
1916 return FALSE;
1917 }
1918
1919 enum lp_build_round_mode
1920 {
1921 LP_BUILD_ROUND_NEAREST = 0,
1922 LP_BUILD_ROUND_FLOOR = 1,
1923 LP_BUILD_ROUND_CEIL = 2,
1924 LP_BUILD_ROUND_TRUNCATE = 3
1925 };
1926
1927 static inline LLVMValueRef
1928 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1929 LLVMValueRef a)
1930 {
1931 LLVMBuilderRef builder = bld->gallivm->builder;
1932 const struct lp_type type = bld->type;
1933 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1934 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1935 const char *intrinsic;
1936 LLVMValueRef res;
1937
1938 assert(type.floating);
1939 /* using the double precision conversions is a bit more complicated */
1940 assert(type.width == 32);
1941
1942 assert(lp_check_value(type, a));
1943 assert(util_cpu_caps.has_sse2);
1944
1945 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1946 if (type.length == 1) {
1947 LLVMTypeRef vec_type;
1948 LLVMValueRef undef;
1949 LLVMValueRef arg;
1950 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1951
1952 vec_type = LLVMVectorType(bld->elem_type, 4);
1953
1954 intrinsic = "llvm.x86.sse.cvtss2si";
1955
1956 undef = LLVMGetUndef(vec_type);
1957
1958 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1959
1960 res = lp_build_intrinsic_unary(builder, intrinsic,
1961 ret_type, arg);
1962 }
1963 else {
1964 if (type.width* type.length == 128) {
1965 intrinsic = "llvm.x86.sse2.cvtps2dq";
1966 }
1967 else {
1968 assert(type.width*type.length == 256);
1969 assert(util_cpu_caps.has_avx);
1970
1971 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1972 }
1973 res = lp_build_intrinsic_unary(builder, intrinsic,
1974 ret_type, a);
1975 }
1976
1977 return res;
1978 }
1979
1980
1981 /* Round a float (vector) using an AltiVec rounding intrinsic for the given mode. */
1983 static inline LLVMValueRef
1984 lp_build_round_altivec(struct lp_build_context *bld,
1985 LLVMValueRef a,
1986 enum lp_build_round_mode mode)
1987 {
1988 LLVMBuilderRef builder = bld->gallivm->builder;
1989 const struct lp_type type = bld->type;
1990 const char *intrinsic = NULL;
1991
1992 assert(type.floating);
1993
1994 assert(lp_check_value(type, a));
1995 assert(util_cpu_caps.has_altivec);
1996
1997 (void)type;
1998
1999 switch (mode) {
2000 case LP_BUILD_ROUND_NEAREST:
2001 intrinsic = "llvm.ppc.altivec.vrfin";
2002 break;
2003 case LP_BUILD_ROUND_FLOOR:
2004 intrinsic = "llvm.ppc.altivec.vrfim";
2005 break;
2006 case LP_BUILD_ROUND_CEIL:
2007 intrinsic = "llvm.ppc.altivec.vrfip";
2008 break;
2009 case LP_BUILD_ROUND_TRUNCATE:
2010 intrinsic = "llvm.ppc.altivec.vrfiz";
2011 break;
2012 }
2013
2014 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2015 }
2016
2017 static inline LLVMValueRef
2018 lp_build_round_arch(struct lp_build_context *bld,
2019 LLVMValueRef a,
2020 enum lp_build_round_mode mode)
2021 {
2022 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2023 LLVMBuilderRef builder = bld->gallivm->builder;
2024 const struct lp_type type = bld->type;
2025 const char *intrinsic_root;
2026 char intrinsic[32];
2027
2028 assert(type.floating);
2029 assert(lp_check_value(type, a));
2030 (void)type;
2031
2032 switch (mode) {
2033 case LP_BUILD_ROUND_NEAREST:
2034 intrinsic_root = "llvm.nearbyint";
2035 break;
2036 case LP_BUILD_ROUND_FLOOR:
2037 intrinsic_root = "llvm.floor";
2038 break;
2039 case LP_BUILD_ROUND_CEIL:
2040 intrinsic_root = "llvm.ceil";
2041 break;
2042 case LP_BUILD_ROUND_TRUNCATE:
2043 intrinsic_root = "llvm.trunc";
2044 break;
2045 }
2046
2047 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2048 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2049 }
2050 else /* (util_cpu_caps.has_altivec) */
2051 return lp_build_round_altivec(bld, a, mode);
2052 }
2053
2054 /**
2055 * Return the integer part of a float (vector) value (== round toward zero).
2056 * The returned value is a float (vector).
2057 * Ex: trunc(-1.5) = -1.0
2058 */
2059 LLVMValueRef
2060 lp_build_trunc(struct lp_build_context *bld,
2061 LLVMValueRef a)
2062 {
2063 LLVMBuilderRef builder = bld->gallivm->builder;
2064 const struct lp_type type = bld->type;
2065
2066 assert(type.floating);
2067 assert(lp_check_value(type, a));
2068
2069 if (arch_rounding_available(type)) {
2070 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2071 }
2072 else {
2073 const struct lp_type type = bld->type;
2074 struct lp_type inttype;
2075 struct lp_build_context intbld;
2076 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2077 LLVMValueRef trunc, res, anosign, mask;
2078 LLVMTypeRef int_vec_type = bld->int_vec_type;
2079 LLVMTypeRef vec_type = bld->vec_type;
2080
2081 assert(type.width == 32); /* might want to handle doubles at some point */
2082
2083 inttype = type;
2084 inttype.floating = 0;
2085 lp_build_context_init(&intbld, bld->gallivm, inttype);
2086
2087 /* round by truncation */
2088 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2089 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2090
2091 /* mask out sign bit */
2092 anosign = lp_build_abs(bld, a);
2093 /*
2094 * mask out all values if anosign > 2^24
2095 * This should work both for large ints (all rounding is no-op for them
2096 * because such floats are always exact) as well as special cases like
2097 * NaNs, Infs (taking advantage of the fact they use max exponent).
2098 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2099 */
2100 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2101 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2102 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2103 return lp_build_select(bld, mask, a, res);
2104 }
2105 }
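/*
 * Informal scalar sketch of the emulation path above (not the actual code):
 * the select keeps 'a' itself whenever |a| > 2^24, and the comparison is
 * done on the float's bit pattern, so NaN/Inf (max exponent) also keep 'a'.
 *
 *    union { float f; int32_t i; } u;
 *    u.f = fabsf(a);
 *    float res = (float)(int32_t)a;        // round toward zero
 *    return u.i > 0x4b800000 ? a : res;    // 0x4b800000 == bit pattern of 2^24
 */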
2106
2107
2108 /**
2109 * Return float (vector) rounded to nearest integer (vector). The returned
2110 * value is a float (vector).
2111 * Ex: round(0.9) = 1.0
2112 * Ex: round(-1.5) = -2.0
2113 */
2114 LLVMValueRef
2115 lp_build_round(struct lp_build_context *bld,
2116 LLVMValueRef a)
2117 {
2118 LLVMBuilderRef builder = bld->gallivm->builder;
2119 const struct lp_type type = bld->type;
2120
2121 assert(type.floating);
2122 assert(lp_check_value(type, a));
2123
2124 if (arch_rounding_available(type)) {
2125 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2126 }
2127 else {
2128 const struct lp_type type = bld->type;
2129 struct lp_type inttype;
2130 struct lp_build_context intbld;
2131 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2132 LLVMValueRef res, anosign, mask;
2133 LLVMTypeRef int_vec_type = bld->int_vec_type;
2134 LLVMTypeRef vec_type = bld->vec_type;
2135
2136 assert(type.width == 32); /* might want to handle doubles at some point */
2137
2138 inttype = type;
2139 inttype.floating = 0;
2140 lp_build_context_init(&intbld, bld->gallivm, inttype);
2141
2142 res = lp_build_iround(bld, a);
2143 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2144
2145 /* mask out sign bit */
2146 anosign = lp_build_abs(bld, a);
2147 /*
2148 * mask out all values if anosign > 2^24
2149 * This should work both for large ints (all rounding is no-op for them
2150 * because such floats are always exact) as well as special cases like
2151 * NaNs, Infs (taking advantage of the fact they use max exponent).
2152 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2153 */
2154 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2155 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2156 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2157 return lp_build_select(bld, mask, a, res);
2158 }
2159 }
2160
2161
2162 /**
2163 * Return floor of float (vector), result is a float (vector)
2164 * Ex: floor(1.1) = 1.0
2165 * Ex: floor(-1.1) = -2.0
2166 */
2167 LLVMValueRef
2168 lp_build_floor(struct lp_build_context *bld,
2169 LLVMValueRef a)
2170 {
2171 LLVMBuilderRef builder = bld->gallivm->builder;
2172 const struct lp_type type = bld->type;
2173
2174 assert(type.floating);
2175 assert(lp_check_value(type, a));
2176
2177 if (arch_rounding_available(type)) {
2178 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2179 }
2180 else {
2181 const struct lp_type type = bld->type;
2182 struct lp_type inttype;
2183 struct lp_build_context intbld;
2184 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2185 LLVMValueRef trunc, res, anosign, mask;
2186 LLVMTypeRef int_vec_type = bld->int_vec_type;
2187 LLVMTypeRef vec_type = bld->vec_type;
2188
2189 if (type.width != 32) {
2190 char intrinsic[32];
2191 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2192 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2193 }
2194
2195 assert(type.width == 32); /* might want to handle doubles at some point */
2196
2197 inttype = type;
2198 inttype.floating = 0;
2199 lp_build_context_init(&intbld, bld->gallivm, inttype);
2200
2201 /* round by truncation */
2202 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2203 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2204
2205 if (type.sign) {
2206 LLVMValueRef tmp;
2207
2208 /*
2209 * fix values if rounding is wrong (for non-special cases)
2210 * - this is the case if trunc > a
2211 */
2212 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2213 /* tmp = trunc > a ? 1.0 : 0.0 */
2214 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2215 tmp = lp_build_and(&intbld, mask, tmp);
2216 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2217 res = lp_build_sub(bld, res, tmp);
2218 }
2219
2220 /* mask out sign bit */
2221 anosign = lp_build_abs(bld, a);
2222 /*
2223 * mask out all values if anosign > 2^24
2224 * This should work both for large ints (all rounding is no-op for them
2225 * because such floats are always exact) as well as special cases like
2226 * NaNs, Infs (taking advantage of the fact they use max exponent).
2227 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2228 */
2229 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2230 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2231 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2232 return lp_build_select(bld, mask, a, res);
2233 }
2234 }
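/*
 * Worked example for the correction step above (illustrative):
 * floor(-1.1): trunc(-1.1) = -1.0, and -1.0 > -1.1, so 1.0 is subtracted,
 * giving -2.0.  For positive or already-integral inputs trunc == floor and
 * the mask is zero, so nothing is subtracted.
 */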
2235
2236
2237 /**
2238 * Return ceiling of float (vector), returning float (vector).
2239 * Ex: ceil( 1.1) = 2.0
2240 * Ex: ceil(-1.1) = -1.0
2241 */
2242 LLVMValueRef
2243 lp_build_ceil(struct lp_build_context *bld,
2244 LLVMValueRef a)
2245 {
2246 LLVMBuilderRef builder = bld->gallivm->builder;
2247 const struct lp_type type = bld->type;
2248
2249 assert(type.floating);
2250 assert(lp_check_value(type, a));
2251
2252 if (arch_rounding_available(type)) {
2253 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2254 }
2255 else {
2256 const struct lp_type type = bld->type;
2257 struct lp_type inttype;
2258 struct lp_build_context intbld;
2259 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2260 LLVMValueRef trunc, res, anosign, mask, tmp;
2261 LLVMTypeRef int_vec_type = bld->int_vec_type;
2262 LLVMTypeRef vec_type = bld->vec_type;
2263
2264 if (type.width != 32) {
2265 char intrinsic[32];
2266 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2267 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2268 }
2269
2270 assert(type.width == 32); /* might want to handle doubles at some point */
2271
2272 inttype = type;
2273 inttype.floating = 0;
2274 lp_build_context_init(&intbld, bld->gallivm, inttype);
2275
2276 /* round by truncation */
2277 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2278 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2279
2280 /*
2281 * fix values if rounding is wrong (for non-special cases)
2282 * - this is the case if trunc < a
2283 */
2284 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2285 /* tmp = trunc < a ? 1.0 : 0.0 */
2286 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2287 tmp = lp_build_and(&intbld, mask, tmp);
2288 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2289 res = lp_build_add(bld, trunc, tmp);
2290
2291 /* mask out sign bit */
2292 anosign = lp_build_abs(bld, a);
2293 /*
2294 * mask out all values if anosign > 2^24
2295 * This should work both for large ints (all rounding is no-op for them
2296 * because such floats are always exact) as well as special cases like
2297 * NaNs, Infs (taking advantage of the fact they use max exponent).
2298 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2299 */
2300 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2301 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2302 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2303 return lp_build_select(bld, mask, a, res);
2304 }
2305 }
2306
2307
2308 /**
2309 * Return fractional part of 'a' computed as a - floor(a)
2310 * Typically used in texture coord arithmetic.
2311 */
2312 LLVMValueRef
2313 lp_build_fract(struct lp_build_context *bld,
2314 LLVMValueRef a)
2315 {
2316 assert(bld->type.floating);
2317 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2318 }
2319
2320
2321 /**
2322 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2323 * against 0.99999(9). (Will also return that value for NaNs.)
2324 */
2325 static inline LLVMValueRef
2326 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2327 {
2328 LLVMValueRef max;
2329
2330 /* this is the largest number smaller than 1.0 representable as float */
2331 max = lp_build_const_vec(bld->gallivm, bld->type,
2332 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2333 return lp_build_min_ext(bld, fract, max,
2334 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2335 }
2336
2337
2338 /**
2339 * Same as lp_build_fract, but guarantees that the result is always smaller
2340 * than one. Will also return the smaller-than-one value for infs, NaNs.
2341 */
2342 LLVMValueRef
2343 lp_build_fract_safe(struct lp_build_context *bld,
2344 LLVMValueRef a)
2345 {
2346 return clamp_fract(bld, lp_build_fract(bld, a));
2347 }
2348
2349
2350 /**
2351 * Return the integer part of a float (vector) value (== round toward zero).
2352 * The returned value is an integer (vector).
2353 * Ex: itrunc(-1.5) = -1
2354 */
2355 LLVMValueRef
2356 lp_build_itrunc(struct lp_build_context *bld,
2357 LLVMValueRef a)
2358 {
2359 LLVMBuilderRef builder = bld->gallivm->builder;
2360 const struct lp_type type = bld->type;
2361 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2362
2363 assert(type.floating);
2364 assert(lp_check_value(type, a));
2365
2366 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2367 }
2368
2369
2370 /**
2371 * Return float (vector) rounded to nearest integer (vector). The returned
2372 * value is an integer (vector).
2373 * Ex: iround(0.9) = 1
2374 * Ex: iround(-1.5) = -2
2375 */
2376 LLVMValueRef
2377 lp_build_iround(struct lp_build_context *bld,
2378 LLVMValueRef a)
2379 {
2380 LLVMBuilderRef builder = bld->gallivm->builder;
2381 const struct lp_type type = bld->type;
2382 LLVMTypeRef int_vec_type = bld->int_vec_type;
2383 LLVMValueRef res;
2384
2385 assert(type.floating);
2386
2387 assert(lp_check_value(type, a));
2388
2389 if ((util_cpu_caps.has_sse2 &&
2390 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2391 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2392 return lp_build_iround_nearest_sse2(bld, a);
2393 }
2394 if (arch_rounding_available(type)) {
2395 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2396 }
2397 else {
2398 LLVMValueRef half;
2399
2400 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2401
2402 if (type.sign) {
2403 LLVMTypeRef vec_type = bld->vec_type;
2404 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2405 (unsigned long long)1 << (type.width - 1));
2406 LLVMValueRef sign;
2407
2408 /* get sign bit */
2409 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2410 sign = LLVMBuildAnd(builder, sign, mask, "");
2411
2412 /* sign * 0.5 */
2413 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2414 half = LLVMBuildOr(builder, sign, half, "");
2415 half = LLVMBuildBitCast(builder, half, vec_type, "");
2416 }
2417
2418 res = LLVMBuildFAdd(builder, a, half, "");
2419 }
2420
2421 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2422
2423 return res;
2424 }
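/*
 * Worked example for the fallback path above (illustrative): for
 * iround(-1.5) the sign bit of 'a' is ORed into half, giving roughly
 * -0.49999997; the float addition -1.5 + -0.49999997 rounds to exactly
 * -2.0, and the final fptosi yields -2.  Using nextafterf(0.5, 0.0)
 * instead of plain 0.5 keeps e.g. iround(0.49999997) == 0 rather than 1.
 */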
2425
2426
2427 /**
2428 * Return floor of float (vector), result is an int (vector)
2429 * Ex: ifloor(1.1) = 1
2430 * Ex: ifloor(-1.1) = -2
2431 */
2432 LLVMValueRef
2433 lp_build_ifloor(struct lp_build_context *bld,
2434 LLVMValueRef a)
2435 {
2436 LLVMBuilderRef builder = bld->gallivm->builder;
2437 const struct lp_type type = bld->type;
2438 LLVMTypeRef int_vec_type = bld->int_vec_type;
2439 LLVMValueRef res;
2440
2441 assert(type.floating);
2442 assert(lp_check_value(type, a));
2443
2444 res = a;
2445 if (type.sign) {
2446 if (arch_rounding_available(type)) {
2447 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2448 }
2449 else {
2450 struct lp_type inttype;
2451 struct lp_build_context intbld;
2452 LLVMValueRef trunc, itrunc, mask;
2453
2454 assert(type.floating);
2455 assert(lp_check_value(type, a));
2456
2457 inttype = type;
2458 inttype.floating = 0;
2459 lp_build_context_init(&intbld, bld->gallivm, inttype);
2460
2461 /* round by truncation */
2462 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2463 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2464
2465 /*
2466 * fix values if rounding is wrong (for non-special cases)
2467 * - this is the case if trunc > a
2468 * The results of doing this with NaNs, very large values etc.
2469 * are undefined but this seems to be the case anyway.
2470 */
2471 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2472 /* cheapie minus one with mask since the mask is minus one / zero */
2473 return lp_build_add(&intbld, itrunc, mask);
2474 }
2475 }
2476
2477 /* round to nearest (toward zero) */
2478 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2479
2480 return res;
2481 }
2482
2483
2484 /**
2485 * Return ceiling of float (vector), returning int (vector).
2486 * Ex: iceil( 1.1) = 2
2487 * Ex: iceil(-1.1) = -1
2488 */
2489 LLVMValueRef
2490 lp_build_iceil(struct lp_build_context *bld,
2491 LLVMValueRef a)
2492 {
2493 LLVMBuilderRef builder = bld->gallivm->builder;
2494 const struct lp_type type = bld->type;
2495 LLVMTypeRef int_vec_type = bld->int_vec_type;
2496 LLVMValueRef res;
2497
2498 assert(type.floating);
2499 assert(lp_check_value(type, a));
2500
2501 if (arch_rounding_available(type)) {
2502 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2503 }
2504 else {
2505 struct lp_type inttype;
2506 struct lp_build_context intbld;
2507 LLVMValueRef trunc, itrunc, mask;
2508
2509 assert(type.floating);
2510 assert(lp_check_value(type, a));
2511
2512 inttype = type;
2513 inttype.floating = 0;
2514 lp_build_context_init(&intbld, bld->gallivm, inttype);
2515
2516 /* round by truncation */
2517 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2518 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2519
2520 /*
2521 * fix values if rounding is wrong (for non-special cases)
2522 * - this is the case if trunc < a
2523 * The results of doing this with NaNs, very large values etc.
2524 * are undefined but this seems to be the case anyway.
2525 */
2526 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2527 /* cheapie plus one with mask since the mask is minus one / zero */
2528 return lp_build_sub(&intbld, itrunc, mask);
2529 }
2530
2531 /* round to nearest (toward zero) */
2532 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2533
2534 return res;
2535 }
2536
2537
2538 /**
2539 * Combined ifloor() & fract().
2540 *
2541 * Preferred to calling the functions separately, as it will ensure that the
2542 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2543 */
2544 void
2545 lp_build_ifloor_fract(struct lp_build_context *bld,
2546 LLVMValueRef a,
2547 LLVMValueRef *out_ipart,
2548 LLVMValueRef *out_fpart)
2549 {
2550 LLVMBuilderRef builder = bld->gallivm->builder;
2551 const struct lp_type type = bld->type;
2552 LLVMValueRef ipart;
2553
2554 assert(type.floating);
2555 assert(lp_check_value(type, a));
2556
2557 if (arch_rounding_available(type)) {
2558 /*
2559 * floor() is easier.
2560 */
2561
2562 ipart = lp_build_floor(bld, a);
2563 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2564 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2565 }
2566 else {
2567 /*
2568 * ifloor() is easier.
2569 */
2570
2571 *out_ipart = lp_build_ifloor(bld, a);
2572 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2573 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2574 }
2575 }
2576
2577
2578 /**
2579 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2580 * always smaller than one.
2581 */
2582 void
2583 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2584 LLVMValueRef a,
2585 LLVMValueRef *out_ipart,
2586 LLVMValueRef *out_fpart)
2587 {
2588 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2589 *out_fpart = clamp_fract(bld, *out_fpart);
2590 }
2591
2592
2593 LLVMValueRef
2594 lp_build_sqrt(struct lp_build_context *bld,
2595 LLVMValueRef a)
2596 {
2597 LLVMBuilderRef builder = bld->gallivm->builder;
2598 const struct lp_type type = bld->type;
2599 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2600 char intrinsic[32];
2601
2602 assert(lp_check_value(type, a));
2603
2604 assert(type.floating);
2605 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2606
2607 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2608 }
2609
2610
2611 /**
2612 * Do one Newton-Raphson step to improve reciprocal precision:
2613 *
2614 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2615 *
2616 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2617 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2618 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2619 * halo. It would be necessary to clamp the argument to prevent this.
2620 *
2621 * See also:
2622 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2623 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2624 */
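/*
 * The two fused multiply-adds below map onto that formula as (sketch):
 *
 *    t  = fma(-a, x_i, 1.0)     -- t  = 1 - a * x_i
 *    x' = fma(t, x_i, x_i)      -- x' = x_i + x_i * t
 */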
2625 static inline LLVMValueRef
2626 lp_build_rcp_refine(struct lp_build_context *bld,
2627 LLVMValueRef a,
2628 LLVMValueRef rcp_a)
2629 {
2630 LLVMBuilderRef builder = bld->gallivm->builder;
2631 LLVMValueRef neg_a;
2632 LLVMValueRef res;
2633
2634 neg_a = LLVMBuildFNeg(builder, a, "");
2635 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2636 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2637
2638 return res;
2639 }
2640
2641
2642 LLVMValueRef
2643 lp_build_rcp(struct lp_build_context *bld,
2644 LLVMValueRef a)
2645 {
2646 LLVMBuilderRef builder = bld->gallivm->builder;
2647 const struct lp_type type = bld->type;
2648
2649 assert(lp_check_value(type, a));
2650
2651 if(a == bld->zero)
2652 return bld->undef;
2653 if(a == bld->one)
2654 return bld->one;
2655 if(a == bld->undef)
2656 return bld->undef;
2657
2658 assert(type.floating);
2659
2660 if(LLVMIsConstant(a))
2661 return LLVMConstFDiv(bld->one, a);
2662
2663 /*
2664 * We don't use RCPPS because:
2665 * - it only has 10 bits of precision
2666 * - it doesn't even get the reciprocal of 1.0 exactly
2667 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2668 * - for recent processors the benefit over DIVPS is marginal and case
2669 * dependent
2670 *
2671 * We could still use it on certain processors if benchmarks show that the
2672 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2673 * particular uses that require fewer workarounds.
2674 */
2675
2676 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2677 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2678 const unsigned num_iterations = 0;
2679 LLVMValueRef res;
2680 unsigned i;
2681 const char *intrinsic = NULL;
2682
2683 if (type.length == 4) {
2684 intrinsic = "llvm.x86.sse.rcp.ps";
2685 }
2686 else {
2687 intrinsic = "llvm.x86.avx.rcp.ps.256";
2688 }
2689
2690 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2691
2692 for (i = 0; i < num_iterations; ++i) {
2693 res = lp_build_rcp_refine(bld, a, res);
2694 }
2695
2696 return res;
2697 }
2698
2699 return LLVMBuildFDiv(builder, bld->one, a, "");
2700 }
2701
2702
2703 /**
2704 * Do one Newton-Raphson step to improve rsqrt precision:
2705 *
2706 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2707 *
2708 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2709 */
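/*
 * Derivation sketch (informal): applying Newton-Raphson to
 * f(x) = 1/x^2 - a gives
 *
 *    x_{i+1} = x_i - f(x_i)/f'(x_i)
 *            = x_i + (1/x_i^2 - a) * x_i^3 / 2
 *            = 0.5 * x_i * (3 - a * x_i^2)
 *
 * which is exactly the sequence of multiplies below.
 */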
2710 static inline LLVMValueRef
2711 lp_build_rsqrt_refine(struct lp_build_context *bld,
2712 LLVMValueRef a,
2713 LLVMValueRef rsqrt_a)
2714 {
2715 LLVMBuilderRef builder = bld->gallivm->builder;
2716 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2717 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2718 LLVMValueRef res;
2719
2720 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2721 res = LLVMBuildFMul(builder, a, res, "");
2722 res = LLVMBuildFSub(builder, three, res, "");
2723 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2724 res = LLVMBuildFMul(builder, half, res, "");
2725
2726 return res;
2727 }
2728
2729
2730 /**
2731 * Generate 1/sqrt(a).
2732 * Result is undefined for values < 0, infinity for +0.
2733 */
2734 LLVMValueRef
2735 lp_build_rsqrt(struct lp_build_context *bld,
2736 LLVMValueRef a)
2737 {
2738 const struct lp_type type = bld->type;
2739
2740 assert(lp_check_value(type, a));
2741
2742 assert(type.floating);
2743
2744 /*
2745 * This should be faster but all denormals will end up as infinity.
2746 */
2747 if (0 && lp_build_fast_rsqrt_available(type)) {
2748 const unsigned num_iterations = 1;
2749 LLVMValueRef res;
2750 unsigned i;
2751
2752 /* rsqrt(1.0) != 1.0 here */
2753 res = lp_build_fast_rsqrt(bld, a);
2754
2755 if (num_iterations) {
2756 /*
2757 * Newton-Raphson will result in NaN instead of infinity for zero,
2758 * and NaN instead of zero for infinity.
2759 * Also, need to ensure rsqrt(1.0) == 1.0.
2760 * All numbers smaller than FLT_MIN will result in +infinity
2761 * (rsqrtps treats all denormals as zero).
2762 */
2763 LLVMValueRef cmp;
2764 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2765 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2766
2767 for (i = 0; i < num_iterations; ++i) {
2768 res = lp_build_rsqrt_refine(bld, a, res);
2769 }
2770 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2771 res = lp_build_select(bld, cmp, inf, res);
2772 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2773 res = lp_build_select(bld, cmp, bld->zero, res);
2774 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2775 res = lp_build_select(bld, cmp, bld->one, res);
2776 }
2777
2778 return res;
2779 }
2780
2781 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2782 }
2783
2784 /**
2785 * If there's a fast (inaccurate) rsqrt instruction available
2786 * (the caller may want to avoid calling rsqrt_fast if it's not available,
2787 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if
2788 * unavailable that would result in sqrt/div/mul, so it is obviously
2789 * much better to just call sqrt, skipping both div and mul).
2790 */
2791 boolean
2792 lp_build_fast_rsqrt_available(struct lp_type type)
2793 {
2794 assert(type.floating);
2795
2796 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2797 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2798 return true;
2799 }
2800 return false;
2801 }
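/*
 * Illustrative caller pattern for the note above (a sketch, not code used
 * anywhere in this file): computing x^0.5 cheaply when the fast path exists.
 *
 *    if (lp_build_fast_rsqrt_available(bld->type))
 *       y = lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));  // x * 1/sqrt(x)
 *    else
 *       y = lp_build_sqrt(bld, x);
 */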
2802
2803
2804 /**
2805 * Generate 1/sqrt(a).
2806 * Result is undefined for values < 0, infinity for +0.
2807 * Precision is limited, only ~10 bits guaranteed
2808 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2809 */
2810 LLVMValueRef
2811 lp_build_fast_rsqrt(struct lp_build_context *bld,
2812 LLVMValueRef a)
2813 {
2814 LLVMBuilderRef builder = bld->gallivm->builder;
2815 const struct lp_type type = bld->type;
2816
2817 assert(lp_check_value(type, a));
2818
2819 if (lp_build_fast_rsqrt_available(type)) {
2820 const char *intrinsic = NULL;
2821
2822 if (type.length == 4) {
2823 intrinsic = "llvm.x86.sse.rsqrt.ps";
2824 }
2825 else {
2826 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2827 }
2828 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2829 }
2830 else {
2831 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2832 }
2833 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2834 }
2835
2836
2837 /**
2838 * Generate sin(a) or cos(a) using polynomial approximation.
2839 * TODO: it might be worth recognizing sin and cos using the same source
2840 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2841 * would be way cheaper than calculating (nearly) everything twice...
2842 * Not sure it's common enough to be worth bothering with, however;
2843 * the scs opcode could also benefit from calculating both.
2844 */
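/*
 * Range-reduction sketch for the code below (informal): with y the octant
 * count computed from |a| * 4/pi (rounded to an even value j),
 *
 *    x' = |a| - y * pi/4      (done in extended precision via DP1..DP3)
 *    result = +/- P_sin(x') or +/- P_cos(x')   picked per lane from j
 *
 * so only two small polynomials on the reduced argument are ever evaluated.
 */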
2845 static LLVMValueRef
2846 lp_build_sin_or_cos(struct lp_build_context *bld,
2847 LLVMValueRef a,
2848 boolean cos)
2849 {
2850 struct gallivm_state *gallivm = bld->gallivm;
2851 LLVMBuilderRef b = gallivm->builder;
2852 struct lp_type int_type = lp_int_type(bld->type);
2853
2854 /*
2855 * take the absolute value,
2856 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2857 */
2858
2859 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2860 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2861
2862 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2863 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2864
2865 /*
2866 * scale by 4/Pi
2867 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2868 */
2869
2870 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2871 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2872
2873 /*
2874 * store the integer part of y in mm0
2875 * emm2 = _mm_cvttps_epi32(y);
2876 */
2877
2878 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2879
2880 /*
2881 * j=(j+1) & (~1) (see the cephes sources)
2882 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2883 */
2884
2885 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2886 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2887 /*
2888 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2889 */
2890 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2891 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2892
2893 /*
2894 * y = _mm_cvtepi32_ps(emm2);
2895 */
2896 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2897
2898 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2899 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2900 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2901 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2902
2903 /*
2904 * Argument used for poly selection and sign bit determination
2905 * is different for sin vs. cos.
2906 */
2907 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2908 emm2_and;
2909
2910 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2911 LLVMBuildNot(b, emm2_2, ""), ""),
2912 const_29, "sign_bit") :
2913 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2914 LLVMBuildShl(b, emm2_add,
2915 const_29, ""), ""),
2916 sign_mask, "sign_bit");
2917
2918 /*
2919 * get the polynomial selection mask
2920 * there is one polynomial for 0 <= x <= Pi/4
2921 * and another one for Pi/4 < x <= Pi/2
2922 * Both branches will be computed.
2923 *
2924 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2925 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2926 */
2927
2928 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2929 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2930 int_type, PIPE_FUNC_EQUAL,
2931 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2932
2933 /*
2934 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2935 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2936 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2937 */
2938 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2939 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2940 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2941
2942 /*
2943 * The magic pass: "Extended precision modular arithmetic"
2944 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2945 */
2946 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2947 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2948 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2949
2950 /*
2951 * Evaluate the first polynomial (0 <= x <= Pi/4)
2952 *
2953 * z = _mm_mul_ps(x,x);
2954 */
2955 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2956
2957 /*
2958 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2959 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2960 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2961 */
2962 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2963 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2964 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2965
2966 /*
2967 * y = *(v4sf*)_ps_coscof_p0;
2968 * y = _mm_mul_ps(y, z);
2969 */
2970 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2971 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2972 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2973 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2974
2975
2976 /*
2977 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2978 * y = _mm_sub_ps(y, tmp);
2979 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2980 */
2981 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2982 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2983 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2984 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2985 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2986
2987 /*
2988 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2989 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2990 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2991 */
2992 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2993 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2994 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2995
2996 /*
2997 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2998 *
2999 * y2 = *(v4sf*)_ps_sincof_p0;
3000 * y2 = _mm_mul_ps(y2, z);
3001 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3002 * y2 = _mm_mul_ps(y2, z);
3003 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3004 * y2 = _mm_mul_ps(y2, z);
3005 * y2 = _mm_mul_ps(y2, x);
3006 * y2 = _mm_add_ps(y2, x);
3007 */
3008
3009 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3010 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3011 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3012 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3013
3014 /*
3015 * select the correct result from the two polynomials
3016 * xmm3 = poly_mask;
3017 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3018 * y = _mm_andnot_ps(xmm3, y);
3019 * y = _mm_or_ps(y,y2);
3020 */
3021 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3022 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3023 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3024 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3025 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3026 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3027
3028 /*
3029 * update the sign
3030 * y = _mm_xor_ps(y, sign_bit);
3031 */
3032 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3033 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3034
3035 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3036
3037 /* clamp output to be within [-1, 1] */
3038 y_result = lp_build_clamp(bld, y_result,
3039 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3040 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3041 /* If a is -inf, inf or NaN then return NaN */
3042 y_result = lp_build_select(bld, isfinite, y_result,
3043 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3044 return y_result;
3045 }
3046
3047
3048 /**
3049 * Generate sin(a)
3050 */
3051 LLVMValueRef
3052 lp_build_sin(struct lp_build_context *bld,
3053 LLVMValueRef a)
3054 {
3055 return lp_build_sin_or_cos(bld, a, FALSE);
3056 }
3057
3058
3059 /**
3060 * Generate cos(a)
3061 */
3062 LLVMValueRef
3063 lp_build_cos(struct lp_build_context *bld,
3064 LLVMValueRef a)
3065 {
3066 return lp_build_sin_or_cos(bld, a, TRUE);
3067 }
3068
3069
3070 /**
3071 * Generate pow(x, y)
3072 */
3073 LLVMValueRef
3074 lp_build_pow(struct lp_build_context *bld,
3075 LLVMValueRef x,
3076 LLVMValueRef y)
3077 {
3078 /* TODO: optimize the constant case */
3079 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3080 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3081 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3082 __FUNCTION__);
3083 }
3084
3085 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3086 }
3087
3088
3089 /**
3090 * Generate exp(x)
3091 */
3092 LLVMValueRef
3093 lp_build_exp(struct lp_build_context *bld,
3094 LLVMValueRef x)
3095 {
3096 /* log2(e) = 1/log(2) */
3097 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3098 1.4426950408889634);
3099
3100 assert(lp_check_value(bld->type, x));
3101
3102 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3103 }
3104
3105
3106 /**
3107 * Generate log(x)
3108 * Behavior is undefined with infs, 0s and nans
3109 */
3110 LLVMValueRef
3111 lp_build_log(struct lp_build_context *bld,
3112 LLVMValueRef x)
3113 {
3114 /* log(2) */
3115 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3116 0.69314718055994529);
3117
3118 assert(lp_check_value(bld->type, x));
3119
3120 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3121 }
3122
3123 /**
3124 * Generate log(x) that handles edge cases (infs, 0s and nans)
3125 */
3126 LLVMValueRef
3127 lp_build_log_safe(struct lp_build_context *bld,
3128 LLVMValueRef x)
3129 {
3130 /* log(2) */
3131 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3132 0.69314718055994529);
3133
3134 assert(lp_check_value(bld->type, x));
3135
3136 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3137 }
3138
3139
3140 /**
3141 * Generate polynomial.
3142 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3143 */
3144 LLVMValueRef
3145 lp_build_polynomial(struct lp_build_context *bld,
3146 LLVMValueRef x,
3147 const double *coeffs,
3148 unsigned num_coeffs)
3149 {
3150 const struct lp_type type = bld->type;
3151 LLVMValueRef even = NULL, odd = NULL;
3152 LLVMValueRef x2;
3153 unsigned i;
3154
3155 assert(lp_check_value(bld->type, x));
3156
3157 /* TODO: optimize the constant case */
3158 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3159 LLVMIsConstant(x)) {
3160 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3161 __FUNCTION__);
3162 }
3163
3164 /*
3165 * Calculate odd and even terms separately to decrease data dependency
3166 * Ex:
3167 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3168 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3169 */
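/*
 * E.g. for num_coeffs == 4 the loop below builds (illustrative sketch):
 *
 *    even = c[0] + x2 * c[2];
 *    odd  = c[1] + x2 * c[3];
 *    res  = even + x * odd;
 *
 * so the even and odd chains can issue independently.
 */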
3170 x2 = lp_build_mul(bld, x, x);
3171
3172 for (i = num_coeffs; i--; ) {
3173 LLVMValueRef coeff;
3174
3175 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3176
3177 if (i % 2 == 0) {
3178 if (even)
3179 even = lp_build_mad(bld, x2, even, coeff);
3180 else
3181 even = coeff;
3182 } else {
3183 if (odd)
3184 odd = lp_build_mad(bld, x2, odd, coeff);
3185 else
3186 odd = coeff;
3187 }
3188 }
3189
3190 if (odd)
3191 return lp_build_mad(bld, odd, x, even);
3192 else if (even)
3193 return even;
3194 else
3195 return bld->undef;
3196 }
3197
3198
3199 /**
3200 * Minimax polynomial fit of 2**x, in range [0, 1[
3201 */
3202 const double lp_build_exp2_polynomial[] = {
3203 #if EXP_POLY_DEGREE == 5
3204 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3205 0.693153073200168932794,
3206 0.240153617044375388211,
3207 0.0558263180532956664775,
3208 0.00898934009049466391101,
3209 0.00187757667519147912699
3210 #elif EXP_POLY_DEGREE == 4
3211 1.00000259337069434683,
3212 0.693003834469974940458,
3213 0.24144275689150793076,
3214 0.0520114606103070150235,
3215 0.0135341679161270268764
3216 #elif EXP_POLY_DEGREE == 3
3217 0.999925218562710312959,
3218 0.695833540494823811697,
3219 0.226067155427249155588,
3220 0.0780245226406372992967
3221 #elif EXP_POLY_DEGREE == 2
3222 1.00172476321474503578,
3223 0.657636275736077639316,
3224 0.33718943461968720704
3225 #else
3226 #error
3227 #endif
3228 };
3229
3230
3231 LLVMValueRef
3232 lp_build_exp2(struct lp_build_context *bld,
3233 LLVMValueRef x)
3234 {
3235 LLVMBuilderRef builder = bld->gallivm->builder;
3236 const struct lp_type type = bld->type;
3237 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3238 LLVMValueRef ipart = NULL;
3239 LLVMValueRef fpart = NULL;
3240 LLVMValueRef expipart = NULL;
3241 LLVMValueRef expfpart = NULL;
3242 LLVMValueRef res = NULL;
3243
3244 assert(lp_check_value(bld->type, x));
3245
3246 /* TODO: optimize the constant case */
3247 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3248 LLVMIsConstant(x)) {
3249 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3250 __FUNCTION__);
3251 }
3252
3253 assert(type.floating && type.width == 32);
3254
3255 /* We want to preserve NaN and make sure that for exp2, if x > 128,
3256 * the result is INF and if it's smaller than -126.9 the result is 0. */
3257 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3258 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3259 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3260 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3261
3262 /* ipart = floor(x) */
3263 /* fpart = x - ipart */
3264 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3265
3266 /* expipart = (float) (1 << ipart) */
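   /*
    * Scalar sketch of this exponent trick (illustrative): placing
    * (ipart + 127) into the IEEE-754 exponent field produces 2^ipart
    * directly, e.g. for ipart == 3:  (3 + 127) << 23 == 0x41000000,
    * whose float interpretation is 8.0.
    */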
3267 expipart = LLVMBuildAdd(builder, ipart,
3268 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3269 expipart = LLVMBuildShl(builder, expipart,
3270 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3271 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3272
3273 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3274 ARRAY_SIZE(lp_build_exp2_polynomial));
3275
3276 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3277
3278 return res;
3279 }
3280
3281
3282
3283 /**
3284 * Extract the exponent of an IEEE-754 floating point value.
3285 *
3286 * Optionally apply an integer bias.
3287 *
3288 * Result is an integer value with
3289 *
3290 * ifloor(log2(x)) + bias
3291 */
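/*
 * E.g. (illustrative): with bias == 0 this yields 3 for x == 8.0 and -1 for
 * x == 0.5, since the stored exponent fields are 130 and 126 and 127 is
 * subtracted.
 */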
3292 LLVMValueRef
3293 lp_build_extract_exponent(struct lp_build_context *bld,
3294 LLVMValueRef x,
3295 int bias)
3296 {
3297 LLVMBuilderRef builder = bld->gallivm->builder;
3298 const struct lp_type type = bld->type;
3299 unsigned mantissa = lp_mantissa(type);
3300 LLVMValueRef res;
3301
3302 assert(type.floating);
3303
3304 assert(lp_check_value(bld->type, x));
3305
3306 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3307
3308 res = LLVMBuildLShr(builder, x,
3309 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3310 res = LLVMBuildAnd(builder, res,
3311 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3312 res = LLVMBuildSub(builder, res,
3313 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3314
3315 return res;
3316 }
3317
3318
3319 /**
3320 * Extract the mantissa of a floating point value.
3321 *
3322 * Result is a floating point value with
3323 *
3324 * x / 2**floor(log2(x))
3325 */
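/*
 * E.g. (illustrative): extract_mantissa(12.0) == 12.0 / 2^3 == 1.5; the
 * result always lies in [1, 2).
 */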
3326 LLVMValueRef
3327 lp_build_extract_mantissa(struct lp_build_context *bld,
3328 LLVMValueRef x)
3329 {
3330 LLVMBuilderRef builder = bld->gallivm->builder;
3331 const struct lp_type type = bld->type;
3332 unsigned mantissa = lp_mantissa(type);
3333 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3334 (1ULL << mantissa) - 1);
3335 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3336 LLVMValueRef res;
3337
3338 assert(lp_check_value(bld->type, x));
3339
3340 assert(type.floating);
3341
3342 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3343
3344 /* res = x / 2**ipart */
3345 res = LLVMBuildAnd(builder, x, mantmask, "");
3346 res = LLVMBuildOr(builder, res, one, "");
3347 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3348
3349 return res;
3350 }
3351
3352
3353
3354 /**
3355 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3356 * These coefficients can be generated with
3357 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3358 */
3359 const double lp_build_log2_polynomial[] = {
3360 #if LOG_POLY_DEGREE == 5
3361 2.88539008148777786488L,
3362 0.961796878841293367824L,
3363 0.577058946784739859012L,
3364 0.412914355135828735411L,
3365 0.308591899232910175289L,
3366 0.352376952300281371868L,
3367 #elif LOG_POLY_DEGREE == 4
3368 2.88539009343309178325L,
3369 0.961791550404184197881L,
3370 0.577440339438736392009L,
3371 0.403343858251329912514L,
3372 0.406718052498846252698L,
3373 #elif LOG_POLY_DEGREE == 3
3374 2.88538959748872753838L,
3375 0.961932915889597772928L,
3376 0.571118517972136195241L,
3377 0.493997535084709500285L,
3378 #else
3379 #error
3380 #endif
3381 };
3382
3383 /**
3384 * See http://www.devmaster.net/forums/showthread.php?p=43580
3385 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3386 * http://www.nezumi.demon.co.uk/consult/logx.htm
3387 *
3388 * If handle_edge_cases is true the function will perform computations
3389 * to match the required D3D10+ behavior for each of the edge cases.
3390 * That means that if input is:
3391 * - less than zero (to and including -inf) then NaN will be returned
3392 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3393 * - +infinity, then +infinity will be returned
3394 * - NaN, then NaN will be returned
3395 *
3396 * Those checks are fairly expensive so if you don't need them make sure
3397 * handle_edge_cases is false.
3398 */
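/*
 * Math sketch behind the code below (informal): write x = 2^e * m with
 * m in [1, 2).  Then log2(x) = e + log2(m), and with y = (m - 1)/(m + 1)
 *
 *    log2(m) = 2/ln(2) * (y + y^3/3 + y^5/5 + ...) ~= y * P(y^2)
 *
 * which is why the leading coefficient of lp_build_log2_polynomial is
 * close to 2/ln(2) ~= 2.885390.
 */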
3399 void
3400 lp_build_log2_approx(struct lp_build_context *bld,
3401 LLVMValueRef x,
3402 LLVMValueRef *p_exp,
3403 LLVMValueRef *p_floor_log2,
3404 LLVMValueRef *p_log2,
3405 boolean handle_edge_cases)
3406 {
3407 LLVMBuilderRef builder = bld->gallivm->builder;
3408 const struct lp_type type = bld->type;
3409 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3410 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3411
3412 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3413 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3414 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3415
3416 LLVMValueRef i = NULL;
3417 LLVMValueRef y = NULL;
3418 LLVMValueRef z = NULL;
3419 LLVMValueRef exp = NULL;
3420 LLVMValueRef mant = NULL;
3421 LLVMValueRef logexp = NULL;
3422 LLVMValueRef p_z = NULL;
3423 LLVMValueRef res = NULL;
3424
3425 assert(lp_check_value(bld->type, x));
3426
3427 if(p_exp || p_floor_log2 || p_log2) {
3428 /* TODO: optimize the constant case */
3429 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3430 LLVMIsConstant(x)) {
3431 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3432 __FUNCTION__);
3433 }
3434
3435 assert(type.floating && type.width == 32);
3436
3437 /*
3438 * We don't explicitly handle denormalized numbers. They will yield a
3439 * result in the neighbourhood of -127, which appears to be adequate.
3441 */
3442
3443 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3444
3445 /* exp = (float) exponent(x) */
3446 exp = LLVMBuildAnd(builder, i, expmask, "");
3447 }
3448
3449 if(p_floor_log2 || p_log2) {
3450 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3451 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3452 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3453 }
3454
3455 if (p_log2) {
3456 /* mant = 1 + (float) mantissa(x) */
3457 mant = LLVMBuildAnd(builder, i, mantmask, "");
3458 mant = LLVMBuildOr(builder, mant, one, "");
3459 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3460
3461 /* y = (mant - 1) / (mant + 1) */
3462 y = lp_build_div(bld,
3463 lp_build_sub(bld, mant, bld->one),
3464 lp_build_add(bld, mant, bld->one)
3465 );
3466
3467 /* z = y^2 */
3468 z = lp_build_mul(bld, y, y);
3469
3470 /* compute P(z) */
3471 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3472 ARRAY_SIZE(lp_build_log2_polynomial));
3473
3474 /* y * P(z) + logexp */
3475 res = lp_build_mad(bld, y, p_z, logexp);
3476
3477 if (type.floating && handle_edge_cases) {
3478 LLVMValueRef negmask, infmask, zmask;
3479 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3480 lp_build_const_vec(bld->gallivm, type, 0.0f));
3481 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3482 lp_build_const_vec(bld->gallivm, type, 0.0f));
3483 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3484 lp_build_const_vec(bld->gallivm, type, INFINITY));
3485
3486 /* If x is equal to inf make sure we return inf */
3487 res = lp_build_select(bld, infmask,
3488 lp_build_const_vec(bld->gallivm, type, INFINITY),
3489 res);
3490 /* If x is equal to 0, return -inf */
3491 res = lp_build_select(bld, zmask,
3492 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3493 res);
3494 /* If x is nan or less than 0, return nan */
3495 res = lp_build_select(bld, negmask,
3496 lp_build_const_vec(bld->gallivm, type, NAN),
3497 res);
3498 }
3499 }
3500
3501 if (p_exp) {
3502 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3503 *p_exp = exp;
3504 }
3505
3506 if (p_floor_log2)
3507 *p_floor_log2 = logexp;
3508
3509 if (p_log2)
3510 *p_log2 = res;
3511 }
3512
3513
3514 /*
3515 * log2 implementation which doesn't have special code to
3516 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3517 * the results for those cases are undefined.
3518 */
3519 LLVMValueRef
3520 lp_build_log2(struct lp_build_context *bld,
3521 LLVMValueRef x)
3522 {
3523 LLVMValueRef res;
3524 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3525 return res;
3526 }
3527
3528 /*
3529 * Version of log2 which handles all edge cases.
3530 * Look at documentation of lp_build_log2_approx for
3531 * description of the behavior for each of the edge cases.
3532 */
3533 LLVMValueRef
3534 lp_build_log2_safe(struct lp_build_context *bld,
3535 LLVMValueRef x)
3536 {
3537 LLVMValueRef res;
3538 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3539 return res;
3540 }
3541
3542
3543 /**
3544 * Faster (and less accurate) log2.
3545 *
3546 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3547 *
3548 * Piece-wise linear approximation, with exact results when x is a
3549 * power of two.
3550 *
3551 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3552 */
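/*
 * Worked examples (illustrative): fast_log2(8.0) = (3 - 1) + 8.0/8.0 = 3.0
 * exactly, while fast_log2(6.0) = (2 - 1) + 6.0/4.0 = 2.5 versus the true
 * value of ~2.585.
 */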
3553 LLVMValueRef
3554 lp_build_fast_log2(struct lp_build_context *bld,
3555 LLVMValueRef x)
3556 {
3557 LLVMBuilderRef builder = bld->gallivm->builder;
3558 LLVMValueRef ipart;
3559 LLVMValueRef fpart;
3560
3561 assert(lp_check_value(bld->type, x));
3562
3563 assert(bld->type.floating);
3564
3565 /* ipart = floor(log2(x)) - 1 */
3566 ipart = lp_build_extract_exponent(bld, x, -1);
3567 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3568
3569 /* fpart = x / 2**ipart */
3570 fpart = lp_build_extract_mantissa(bld, x);
3571
3572 /* ipart + fpart */
3573 return LLVMBuildFAdd(builder, ipart, fpart, "");
3574 }
3575
3576
3577 /**
3578 * Fast implementation of iround(log2(x)).
3579 *
3580 * Not an approximation -- it should give accurate results all the time.
3581 */
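/*
 * Why the sqrt(2) scaling below works (informal sketch):
 *
 *    floor(log2(x * sqrt(2))) = floor(log2(x) + 0.5) = iround(log2(x))
 *
 * e.g. x = 5.0: log2(5) ~= 2.32, +0.5 -> 2.82, floor -> 2 == iround(2.32).
 */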
3582 LLVMValueRef
3583 lp_build_ilog2(struct lp_build_context *bld,
3584 LLVMValueRef x)
3585 {
3586 LLVMBuilderRef builder = bld->gallivm->builder;
3587 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3588 LLVMValueRef ipart;
3589
3590 assert(bld->type.floating);
3591
3592 assert(lp_check_value(bld->type, x));
3593
3594 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3595 x = LLVMBuildFMul(builder, x, sqrt2, "");
3596
3597 /* ipart = floor(log2(x) + 0.5) */
3598 ipart = lp_build_extract_exponent(bld, x, 0);
3599
3600 return ipart;
3601 }
3602
3603 LLVMValueRef
3604 lp_build_mod(struct lp_build_context *bld,
3605 LLVMValueRef x,
3606 LLVMValueRef y)
3607 {
3608 LLVMBuilderRef builder = bld->gallivm->builder;
3609 LLVMValueRef res;
3610 const struct lp_type type = bld->type;
3611
3612 assert(lp_check_value(type, x));
3613 assert(lp_check_value(type, y));
3614
3615 if (type.floating)
3616 res = LLVMBuildFRem(builder, x, y, "");
3617 else if (type.sign)
3618 res = LLVMBuildSRem(builder, x, y, "");
3619 else
3620 res = LLVMBuildURem(builder, x, y, "");
3621 return res;
3622 }
3623
3624
3625 /*
3626  * For floating-point inputs, returns a mask which is
3627  * all 1's for channels of x that are NaN and all 0's
3628  * for channels that are not NaN.
3629 */
3630 LLVMValueRef
3631 lp_build_isnan(struct lp_build_context *bld,
3632 LLVMValueRef x)
3633 {
3634 LLVMValueRef mask;
3635 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3636
3637 assert(bld->type.floating);
3638 assert(lp_check_value(bld->type, x));
3639
3640 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3641 "isnotnan");
3642 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3643 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3644 return mask;
3645 }
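/*
 * Minimal usage sketch (illustrative only, not used elsewhere in this file;
 * the helper name is hypothetical): the mask returned by lp_build_isnan()
 * can be fed straight into lp_build_select() to replace NaN channels,
 * e.g. with zero.
 */
static inline LLVMValueRef
example_zero_out_nans(struct lp_build_context *bld, LLVMValueRef x)
{
   LLVMValueRef nan_mask = lp_build_isnan(bld, x);
   /* pick bld->zero where nan_mask is all 1's, keep x elsewhere */
   return lp_build_select(bld, nan_mask, bld->zero, x);
}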
3646
3647 /* Returns all 1's for channels holding finite floating-point
3648  * numbers and all 0's for channels holding -inf,
3649  * +inf or NaN. */
3650 LLVMValueRef
3651 lp_build_isfinite(struct lp_build_context *bld,
3652 LLVMValueRef x)
3653 {
3654 LLVMBuilderRef builder = bld->gallivm->builder;
3655 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3656 struct lp_type int_type = lp_int_type(bld->type);
3657 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3658 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3659 0x7f800000);
3660
3661 if (!bld->type.floating) {
3662 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3663 }
3664 assert(bld->type.floating);
3665 assert(lp_check_value(bld->type, x));
3666 assert(bld->type.width == 32);
3667
3668 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3669 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3670 intx, infornan32);
3671 }
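/*
 * The test above relies on the IEEE-754 binary32 layout: bits 30..23 hold
 * the biased exponent, and that field is all 1's (0x7f800000 after masking)
 * exactly for +/-inf and NaN.  E.g. 1.0f is 0x3f800000, which masked gives
 * 0x3f800000 != 0x7f800000, so the channel compares as finite; +inf is
 * 0x7f800000, which masked stays 0x7f800000 and compares as not finite.
 */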
3672
3673 /*
3674  * Returns an all-1's mask for channels that are NaN or +/-inf,
3675  * and all 0's otherwise.  The input must be a floating-point vector.
3676 */
3677 LLVMValueRef
3678 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3679 const struct lp_type type,
3680 LLVMValueRef x)
3681 {
3682 LLVMBuilderRef builder = gallivm->builder;
3683 struct lp_type int_type = lp_int_type(type);
3684 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3685 0x7f800000);
3686 LLVMValueRef ret;
3687
3688 assert(type.floating);
3689
3690 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3691 ret = LLVMBuildAnd(builder, ret, const0, "");
3692 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3693 ret, const0);
3694
3695 return ret;
3696 }
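/*
 * Note that this is essentially the complement of the finiteness test in
 * lp_build_isfinite(): both mask off the exponent bits with 0x7f800000, but
 * this one compares for EQUAL (exponent all 1's => inf or NaN) instead of
 * NOTEQUAL.
 */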
3697
3698
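/**
 * Save the current x86 MXCSR control/status register.
 *
 * Emits llvm.x86.sse.stmxcsr to store MXCSR into a freshly allocated i32
 * and returns a pointer to it, suitable for a later lp_build_fpstate_set().
 * Returns 0 when the host has no SSE support.
 */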
3699 LLVMValueRef
3700 lp_build_fpstate_get(struct gallivm_state *gallivm)
3701 {
3702 if (util_cpu_caps.has_sse) {
3703 LLVMBuilderRef builder = gallivm->builder;
3704 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3705 gallivm,
3706 LLVMInt32TypeInContext(gallivm->context),
3707 "mxcsr_ptr");
3708 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3709 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3710 lp_build_intrinsic(builder,
3711 "llvm.x86.sse.stmxcsr",
3712 LLVMVoidTypeInContext(gallivm->context),
3713 &mxcsr_ptr8, 1, 0);
3714 return mxcsr_ptr;
3715 }
3716 return 0;
3717 }
3718
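/**
 * Enable (zero == TRUE) or disable denormal shortcuts in the x86 MXCSR.
 *
 * FTZ (flush-to-zero, bit 15, 0x8000) makes denormal results flush to zero;
 * DAZ (denormals-are-zero, bit 6, 0x0040) makes denormal inputs read as zero
 * and is only toggled on CPUs that advertise it (util_cpu_caps.has_daz).
 * The bits are set or cleared in a saved copy of MXCSR, which is then loaded
 * back with lp_build_fpstate_set().
 */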
3719 void
3720 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3721 boolean zero)
3722 {
3723 if (util_cpu_caps.has_sse) {
3724 /* turn on FTZ (32768) and, if supported, DAZ (64): 32768 | 64 = 32832 */
3725 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3726
3727 LLVMBuilderRef builder = gallivm->builder;
3728 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3729 LLVMValueRef mxcsr =
3730 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3731
3732 if (util_cpu_caps.has_daz) {
3733 /* Enable denormals-are-zero (DAZ) mode */
3734 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3735 }
3736 if (zero) {
3737 mxcsr = LLVMBuildOr(builder, mxcsr,
3738 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3739 } else {
3740 mxcsr = LLVMBuildAnd(builder, mxcsr,
3741 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3742 }
3743
3744 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3745 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3746 }
3747 }
3748
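/**
 * Restore MXCSR from the i32 pointed to by mxcsr_ptr (as previously obtained
 * from lp_build_fpstate_get()), by emitting llvm.x86.sse.ldmxcsr.
 * No-op when the host has no SSE support.
 */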
3749 void
3750 lp_build_fpstate_set(struct gallivm_state *gallivm,
3751 LLVMValueRef mxcsr_ptr)
3752 {
3753 if (util_cpu_caps.has_sse) {
3754 LLVMBuilderRef builder = gallivm->builder;
3755 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3756 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3757 lp_build_intrinsic(builder,
3758 "llvm.x86.sse.ldmxcsr",
3759 LLVMVoidTypeInContext(gallivm->context),
3760 &mxcsr_ptr, 1, 0);
3761 }
3762 }
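/*
 * Minimal usage sketch (illustrative only; the function name is
 * hypothetical): the three helpers above are typically used as a
 * save / modify / restore sequence around code that benefits from
 * denormals being treated as zero.
 */
static void
example_with_denorms_as_zero(struct gallivm_state *gallivm)
{
   /* save the current MXCSR and turn on FTZ/DAZ */
   LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
   lp_build_fpstate_set_denorms_zero(gallivm, TRUE);

   /* ... emit the denormal-sensitive code here ... */

   /* restore the caller's MXCSR */
   lp_build_fpstate_set(gallivm, fpstate);
}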