gallivm: fix saturated signed add / sub with llvm 9
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for special case values of a or b = 1 or 0 are done.
86 * NaN's are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle nan's for floating point numbers. If one of the
213 * inputs is nan the other should be returned (required by both D3D10+
214 * and OpenCL).
215        * The SSE intrinsics return the second operand in case of nan by
216        * default, so we need special code to handle those.
217 */
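       /*
        * For illustration (assuming the SSE semantics described above, where
        * min/max return the second operand if either input is a NaN): with
        * GALLIVM_NAN_RETURN_OTHER and b == NaN, min(a, b) yields b (NaN), so
        * the isnan(b) select below restores a; with GALLIVM_NAN_RETURN_NAN
        * and a == NaN, min(a, b) yields b, so the isnan(a) select restores
        * the NaN held in a.
        */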
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289    /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290     * not supported, and instead it falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303 * No checks for special case values of a or b = 1 or 0 are done.
304 * NaN's are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359       if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
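   /*
    * For unsigned normalized integers 1.0 is represented by all ones, so
    * 1 - a is just the bitwise complement: e.g. with 8-bit unorm,
    * a = 0x40 (64/255) gives ~a = 0xbf (191/255), and 64 + 191 == 255.
    */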
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if (a == bld->zero)
545 return b;
546 if (b == bld->zero)
547 return a;
548 if (a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if (type.norm) {
552 const char *intrinsic = NULL;
553
554 if (!type.sign && (a == bld->one || b == bld->one))
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (HAVE_LLVM >= 0x0900) {
559 char intrin[32];
560 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
561 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
562 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
563 }
564 if (type.width * type.length == 128) {
565 if (util_cpu_caps.has_sse2) {
566 if (type.width == 8)
567 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
568 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
569 if (type.width == 16)
570 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
571 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
572 } else if (util_cpu_caps.has_altivec) {
573 if (type.width == 8)
574 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
575 if (type.width == 16)
576 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
577 }
578 }
579 if (type.width * type.length == 256) {
580 if (util_cpu_caps.has_avx2) {
581 if (type.width == 8)
582 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
583 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
584 if (type.width == 16)
585 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
586 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
587 }
588 }
589 }
590
591 if (intrinsic)
592 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
593 }
594
595 if(type.norm && !type.floating && !type.fixed) {
596 if (type.sign) {
597 uint64_t sign = (uint64_t)1 << (type.width - 1);
598 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
599 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
600 /* a_clamp_max is the maximum a for positive b,
601 a_clamp_min is the minimum a for negative b. */
602 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
603 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
604 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
605 }
606 }
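   /*
    * Worked example of the clamping above (assuming 8-bit signed norm):
    * a = 100, b = 50: since b > 0, a is clamped to a_clamp_max =
    * min(100, 127 - 50) = 77, so the add below yields 77 + 50 = 127,
    * the saturated result instead of a wrapped one.
    */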
607
608 if(LLVMIsConstant(a) && LLVMIsConstant(b))
609 if (type.floating)
610 res = LLVMConstFAdd(a, b);
611 else
612 res = LLVMConstAdd(a, b);
613 else
614 if (type.floating)
615 res = LLVMBuildFAdd(builder, a, b, "");
616 else
617 res = LLVMBuildAdd(builder, a, b, "");
618
619 /* clamp to ceiling of 1.0 */
620 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
621 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
622
623 if (type.norm && !type.floating && !type.fixed) {
624 if (!type.sign) {
625 /*
626 * newer llvm versions no longer support the intrinsics, but recognize
627 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
628 * code, it is important we match the pattern llvm uses (and pray llvm
629 * doesn't change it - and hope they decide on the same pattern for
630 * all backends supporting it...).
631 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
632 * interfere with llvm's ability to recognize the pattern but seems
633 * a bit brittle.
634 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
635 */
636 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
637 res = lp_build_select(bld, overflowed,
638 LLVMConstAllOnes(bld->int_vec_type), res);
639 }
640 }
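   /*
    * Worked example of the overflow pattern above (assuming 8-bit unsigned
    * norm): a = 200, b = 100: the add above wraps to res = 44, the a > res
    * compare detects the overflow, and the select replaces res with all
    * ones (255), the saturated result.
    */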
641
642 /* XXX clamp to floor of -1 or 0??? */
643
644 return res;
645 }
646
647
648 /** Return the scalar sum of the elements of a.
649 * Should avoid this operation whenever possible.
650 */
651 LLVMValueRef
652 lp_build_horizontal_add(struct lp_build_context *bld,
653 LLVMValueRef a)
654 {
655 LLVMBuilderRef builder = bld->gallivm->builder;
656 const struct lp_type type = bld->type;
657 LLVMValueRef index, res;
658 unsigned i, length;
659 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
660 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
661 LLVMValueRef vecres, elem2;
662
663 assert(lp_check_value(type, a));
664
665 if (type.length == 1) {
666 return a;
667 }
668
669 assert(!bld->type.norm);
670
671 /*
672     * For byte vectors we could do much better with psadbw.
673     * Using repeated shuffle/adds here. Note that with multiple vectors
674     * this can be done more efficiently as outlined in the Intel
675     * optimization manual.
676 * Note: could cause data rearrangement if used with smaller element
677 * sizes.
678 */
679
680 vecres = a;
681 length = type.length / 2;
682 while (length > 1) {
683 LLVMValueRef vec1, vec2;
684 for (i = 0; i < length; i++) {
685 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
686 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
687 }
688 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
689 LLVMConstVector(shuffles1, length), "");
690 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
691 LLVMConstVector(shuffles2, length), "");
692 if (type.floating) {
693 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
694 }
695 else {
696 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
697 }
698 length = length >> 1;
699 }
700
701 /* always have vector of size 2 here */
702 assert(length == 1);
703
704 index = lp_build_const_int32(bld->gallivm, 0);
705 res = LLVMBuildExtractElement(builder, vecres, index, "");
706 index = lp_build_const_int32(bld->gallivm, 1);
707 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
708
709 if (type.floating)
710 res = LLVMBuildFAdd(builder, res, elem2, "");
711 else
712 res = LLVMBuildAdd(builder, res, elem2, "");
713
714 return res;
715 }
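/*
 * Illustrative scalar equivalent of the reduction above for a 4-wide float
 * vector (a sketch only, not part of the gallivm API):
 *
 *    float horizontal_add4_ref(const float v[4])
 *    {
 *       float lo = v[0] + v[2];   // first shuffle/add pass
 *       float hi = v[1] + v[3];
 *       return lo + hi;           // final extract and add of the two elements
 *    }
 */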
716
717 /**
718 * Return the horizontal sums of 4 float vectors as a float4 vector.
719  * This uses the technique outlined in the Intel Optimization Manual.
720 */
721 static LLVMValueRef
722 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
723 LLVMValueRef src[4])
724 {
725 struct gallivm_state *gallivm = bld->gallivm;
726 LLVMBuilderRef builder = gallivm->builder;
727 LLVMValueRef shuffles[4];
728 LLVMValueRef tmp[4];
729 LLVMValueRef sumtmp[2], shuftmp[2];
730
731 /* lower half of regs */
732 shuffles[0] = lp_build_const_int32(gallivm, 0);
733 shuffles[1] = lp_build_const_int32(gallivm, 1);
734 shuffles[2] = lp_build_const_int32(gallivm, 4);
735 shuffles[3] = lp_build_const_int32(gallivm, 5);
736 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
737 LLVMConstVector(shuffles, 4), "");
738 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
739 LLVMConstVector(shuffles, 4), "");
740
741 /* upper half of regs */
742 shuffles[0] = lp_build_const_int32(gallivm, 2);
743 shuffles[1] = lp_build_const_int32(gallivm, 3);
744 shuffles[2] = lp_build_const_int32(gallivm, 6);
745 shuffles[3] = lp_build_const_int32(gallivm, 7);
746 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
747 LLVMConstVector(shuffles, 4), "");
748 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
749 LLVMConstVector(shuffles, 4), "");
750
751 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
752 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
753
754 shuffles[0] = lp_build_const_int32(gallivm, 0);
755 shuffles[1] = lp_build_const_int32(gallivm, 2);
756 shuffles[2] = lp_build_const_int32(gallivm, 4);
757 shuffles[3] = lp_build_const_int32(gallivm, 6);
758 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
759 LLVMConstVector(shuffles, 4), "");
760
761 shuffles[0] = lp_build_const_int32(gallivm, 1);
762 shuffles[1] = lp_build_const_int32(gallivm, 3);
763 shuffles[2] = lp_build_const_int32(gallivm, 5);
764 shuffles[3] = lp_build_const_int32(gallivm, 7);
765 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
766 LLVMConstVector(shuffles, 4), "");
767
768 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
769 }
770
771
772 /*
773 * partially horizontally add 2-4 float vectors with length nx4,
774 * i.e. only four adjacent values in each vector will be added,
775 * assuming values are really grouped in 4 which also determines
776 * output order.
777 *
778 * Return a vector of the same length as the initial vectors,
779 * with the excess elements (if any) being undefined.
780 * The element order is independent of number of input vectors.
781 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
782 * the output order thus will be
783  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
784 */
785 LLVMValueRef
786 lp_build_hadd_partial4(struct lp_build_context *bld,
787 LLVMValueRef vectors[],
788 unsigned num_vecs)
789 {
790 struct gallivm_state *gallivm = bld->gallivm;
791 LLVMBuilderRef builder = gallivm->builder;
792 LLVMValueRef ret_vec;
793 LLVMValueRef tmp[4];
794 const char *intrinsic = NULL;
795
796 assert(num_vecs >= 2 && num_vecs <= 4);
797 assert(bld->type.floating);
798
799 /* only use this with at least 2 vectors, as it is sort of expensive
800 * (depending on cpu) and we always need two horizontal adds anyway,
801 * so a shuffle/add approach might be better.
802 */
803
804 tmp[0] = vectors[0];
805 tmp[1] = vectors[1];
806
807 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
808 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
809
810 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
811 bld->type.length == 4) {
812 intrinsic = "llvm.x86.sse3.hadd.ps";
813 }
814 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
815 bld->type.length == 8) {
816 intrinsic = "llvm.x86.avx.hadd.ps.256";
817 }
818 if (intrinsic) {
819 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
820 lp_build_vec_type(gallivm, bld->type),
821 tmp[0], tmp[1]);
822 if (num_vecs > 2) {
823 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
824 lp_build_vec_type(gallivm, bld->type),
825 tmp[2], tmp[3]);
826 }
827 else {
828 tmp[1] = tmp[0];
829 }
830 return lp_build_intrinsic_binary(builder, intrinsic,
831 lp_build_vec_type(gallivm, bld->type),
832 tmp[0], tmp[1]);
833 }
834
835 if (bld->type.length == 4) {
836 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
837 }
838 else {
839 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
840 unsigned j;
841 unsigned num_iter = bld->type.length / 4;
842 struct lp_type parttype = bld->type;
843 parttype.length = 4;
844 for (j = 0; j < num_iter; j++) {
845 LLVMValueRef partsrc[4];
846 unsigned i;
847 for (i = 0; i < 4; i++) {
848 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
849 }
850 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
851 }
852 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
853 }
854 return ret_vec;
855 }
856
857 /**
858 * Generate a - b
859 */
860 LLVMValueRef
861 lp_build_sub(struct lp_build_context *bld,
862 LLVMValueRef a,
863 LLVMValueRef b)
864 {
865 LLVMBuilderRef builder = bld->gallivm->builder;
866 const struct lp_type type = bld->type;
867 LLVMValueRef res;
868
869 assert(lp_check_value(type, a));
870 assert(lp_check_value(type, b));
871
872 if (b == bld->zero)
873 return a;
874 if (a == bld->undef || b == bld->undef)
875 return bld->undef;
876 if (a == b)
877 return bld->zero;
878
879 if (type.norm) {
880 const char *intrinsic = NULL;
881
882 if (!type.sign && b == bld->one)
883 return bld->zero;
884
885 if (!type.floating && !type.fixed) {
886 if (HAVE_LLVM >= 0x0900) {
887 char intrin[32];
888 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
889 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
890 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
891 }
892 if (type.width * type.length == 128) {
893 if (util_cpu_caps.has_sse2) {
894 if (type.width == 8)
895 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
896 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
897 if (type.width == 16)
898 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
899 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
900 } else if (util_cpu_caps.has_altivec) {
901 if (type.width == 8)
902 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
903 if (type.width == 16)
904 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
905 }
906 }
907 if (type.width * type.length == 256) {
908 if (util_cpu_caps.has_avx2) {
909 if (type.width == 8)
910 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
911 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
912 if (type.width == 16)
913 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
914 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
915 }
916 }
917 }
918
919 if (intrinsic)
920 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
921 }
922
923 if(type.norm && !type.floating && !type.fixed) {
924 if (type.sign) {
925 uint64_t sign = (uint64_t)1 << (type.width - 1);
926 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
927 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
928 /* a_clamp_max is the maximum a for negative b,
929 a_clamp_min is the minimum a for positive b. */
930 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
931 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
932 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
933 } else {
934 /*
935 * This must match llvm pattern for saturated unsigned sub.
936 * (lp_build_max_simple actually does the job with its current
937 * definition but do it explicitly here.)
938 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
939 * interfere with llvm's ability to recognize the pattern but seems
940 * a bit brittle.
941 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
942 */
943 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
944 a = lp_build_select(bld, no_ov, a, b);
945 }
946 }
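   /*
    * Worked examples of the clamping above (assuming 8-bit norm):
    * signed:   a = -100, b = 60: b > 0, so a is clamped to a_clamp_min =
    *           max(-100, -128 + 60) = -68, and -68 - 60 = -128 (saturated).
    * unsigned: a = 10, b = 20: a > b is false, so a is replaced by b and the
    *           subtraction below yields 0 (saturated).
    */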
947
948 if(LLVMIsConstant(a) && LLVMIsConstant(b))
949 if (type.floating)
950 res = LLVMConstFSub(a, b);
951 else
952 res = LLVMConstSub(a, b);
953 else
954 if (type.floating)
955 res = LLVMBuildFSub(builder, a, b, "");
956 else
957 res = LLVMBuildSub(builder, a, b, "");
958
959 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
960 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
961
962 return res;
963 }
964
965
966
967 /**
968 * Normalized multiplication.
969 *
970  * There are several approaches (using 8-bit normalized multiplication as
971  * an example):
972 *
973 * - alpha plus one
974 *
975 * makes the following approximation to the division (Sree)
976 *
977  *     a*b/255 ~= (a*(b + 1)) >> 8
978 *
979 * which is the fastest method that satisfies the following OpenGL criteria of
980 *
981 * 0*0 = 0 and 255*255 = 255
982 *
983 * - geometric series
984 *
985 * takes the geometric series approximation to the division
986 *
987 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
988 *
989 * in this case just the first two terms to fit in 16bit arithmetic
990 *
991 * t/255 ~= (t + (t >> 8)) >> 8
992 *
993  * note that just by itself it doesn't satisfy the OpenGL criteria, as
994  * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
995  * must be used.
996 *
997 * - geometric series plus rounding
998 *
999 * when using a geometric series division instead of truncating the result
1000 * use roundoff in the approximation (Jim Blinn)
1001 *
1002 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
1003 *
1004 * achieving the exact results.
1005 *
1006 *
1007 *
1008 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
1009 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
1010 * @sa Michael Herf, The "double blend trick", May 2000,
1011 * http://www.stereopsis.com/doubleblend.html
1012 */
1013 LLVMValueRef
1014 lp_build_mul_norm(struct gallivm_state *gallivm,
1015 struct lp_type wide_type,
1016 LLVMValueRef a, LLVMValueRef b)
1017 {
1018 LLVMBuilderRef builder = gallivm->builder;
1019 struct lp_build_context bld;
1020 unsigned n;
1021 LLVMValueRef half;
1022 LLVMValueRef ab;
1023
1024 assert(!wide_type.floating);
1025 assert(lp_check_value(wide_type, a));
1026 assert(lp_check_value(wide_type, b));
1027
1028 lp_build_context_init(&bld, gallivm, wide_type);
1029
1030 n = wide_type.width / 2;
1031 if (wide_type.sign) {
1032 --n;
1033 }
1034
1035 /*
1036 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1037 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1038 */
1039
1040 /*
1041 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1042 */
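   /*
    * Numeric check (8-bit unsigned case, n = 8): for a = b = 255,
    * a*b = 65025, adding (65025 >> 8) = 254 and half = 0x80 gives 65407,
    * and 65407 >> 8 = 255; for a*b = 0 the result is 0, so the OpenGL end
    * conditions (0*0 = 0, 255*255 = 255) hold exactly.
    */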
1043
1044 ab = LLVMBuildMul(builder, a, b, "");
1045 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1046
1047 /*
1048 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1049 */
1050
1051 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1052 if (wide_type.sign) {
1053 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1054 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1055 half = lp_build_select(&bld, sign, minus_half, half);
1056 }
1057 ab = LLVMBuildAdd(builder, ab, half, "");
1058
1059 /* Final division */
1060 ab = lp_build_shr_imm(&bld, ab, n);
1061
1062 return ab;
1063 }
1064
1065 /**
1066 * Generate a * b
1067 */
1068 LLVMValueRef
1069 lp_build_mul(struct lp_build_context *bld,
1070 LLVMValueRef a,
1071 LLVMValueRef b)
1072 {
1073 LLVMBuilderRef builder = bld->gallivm->builder;
1074 const struct lp_type type = bld->type;
1075 LLVMValueRef shift;
1076 LLVMValueRef res;
1077
1078 assert(lp_check_value(type, a));
1079 assert(lp_check_value(type, b));
1080
1081 if(a == bld->zero)
1082 return bld->zero;
1083 if(a == bld->one)
1084 return b;
1085 if(b == bld->zero)
1086 return bld->zero;
1087 if(b == bld->one)
1088 return a;
1089 if(a == bld->undef || b == bld->undef)
1090 return bld->undef;
1091
1092 if (!type.floating && !type.fixed && type.norm) {
1093 struct lp_type wide_type = lp_wider_type(type);
1094 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1095
1096 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1097 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1098
1099 /* PMULLW, PSRLW, PADDW */
1100 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1101 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1102
1103 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1104
1105 return ab;
1106 }
1107
1108 if(type.fixed)
1109 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1110 else
1111 shift = NULL;
1112
1113 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1114 if (type.floating)
1115 res = LLVMConstFMul(a, b);
1116 else
1117 res = LLVMConstMul(a, b);
1118 if(shift) {
1119 if(type.sign)
1120 res = LLVMConstAShr(res, shift);
1121 else
1122 res = LLVMConstLShr(res, shift);
1123 }
1124 }
1125 else {
1126 if (type.floating)
1127 res = LLVMBuildFMul(builder, a, b, "");
1128 else
1129 res = LLVMBuildMul(builder, a, b, "");
1130 if(shift) {
1131 if(type.sign)
1132 res = LLVMBuildAShr(builder, res, shift, "");
1133 else
1134 res = LLVMBuildLShr(builder, res, shift, "");
1135 }
1136 }
1137
1138 return res;
1139 }
1140
1141 /*
1142 * Widening mul, valid for 32x32 bit -> 64bit only.
1143 * Result is low 32bits, high bits returned in res_hi.
1144 *
1145 * Emits code that is meant to be compiled for the host CPU.
1146 */
1147 LLVMValueRef
1148 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1149 LLVMValueRef a,
1150 LLVMValueRef b,
1151 LLVMValueRef *res_hi)
1152 {
1153 struct gallivm_state *gallivm = bld->gallivm;
1154 LLVMBuilderRef builder = gallivm->builder;
1155
1156 assert(bld->type.width == 32);
1157 assert(bld->type.floating == 0);
1158 assert(bld->type.fixed == 0);
1159 assert(bld->type.norm == 0);
1160
1161 /*
1162 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1163 * for x86 simd is atrocious (even if the high bits weren't required),
1164 * trying to handle real 64bit inputs (which of course can't happen due
1165 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1166 * apparently llvm does not recognize this widening mul). This includes 6
1167     * (instead of 2) pmuludq plus extra adds and shifts.
1168 * The same story applies to signed mul, albeit fixing this requires sse41.
1169 * https://llvm.org/bugs/show_bug.cgi?id=30845
1170 * So, whip up our own code, albeit only for length 4 and 8 (which
1171 * should be good enough)...
1172 */
1173 if ((bld->type.length == 4 || bld->type.length == 8) &&
1174 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1175 util_cpu_caps.has_sse4_1)) {
1176 const char *intrinsic = NULL;
1177 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1178 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1179 struct lp_type type_wide = lp_wider_type(bld->type);
1180 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1181 unsigned i;
1182 for (i = 0; i < bld->type.length; i += 2) {
1183 shuf[i] = lp_build_const_int32(gallivm, i+1);
1184 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1185 }
1186 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1187 aeven = a;
1188 beven = b;
1189 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1190 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1191
1192 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1193 if (bld->type.sign) {
1194 intrinsic = "llvm.x86.avx2.pmul.dq";
1195 } else {
1196 intrinsic = "llvm.x86.avx2.pmulu.dq";
1197 }
1198 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1199 wider_type, aeven, beven);
1200 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1201 wider_type, aodd, bodd);
1202 }
1203 else {
1204 /* for consistent naming look elsewhere... */
1205 if (bld->type.sign) {
1206 intrinsic = "llvm.x86.sse41.pmuldq";
1207 } else {
1208 intrinsic = "llvm.x86.sse2.pmulu.dq";
1209 }
1210 /*
1211 * XXX If we only have AVX but not AVX2 this is a pain.
1212 * lp_build_intrinsic_binary_anylength() can't handle it
1213 * (due to src and dst type not being identical).
1214 */
1215 if (bld->type.length == 8) {
1216 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1217 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1218 LLVMValueRef muleven2[2], mulodd2[2];
1219 struct lp_type type_wide_half = type_wide;
1220 LLVMTypeRef wtype_half;
1221 type_wide_half.length = 2;
1222 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1223 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1224 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1225 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1226 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1227 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1228 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1229 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1230 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1231 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1232 wtype_half, aevenlo, bevenlo);
1233 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1234 wtype_half, aoddlo, boddlo);
1235 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1236 wtype_half, aevenhi, bevenhi);
1237 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1238 wtype_half, aoddhi, boddhi);
1239 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1240 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1241
1242 }
1243 else {
1244 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1245 wider_type, aeven, beven);
1246 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1247 wider_type, aodd, bodd);
1248 }
1249 }
1250 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1251 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1252
1253 for (i = 0; i < bld->type.length; i += 2) {
1254 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1255 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1256 }
1257 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1258 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1259
1260 for (i = 0; i < bld->type.length; i += 2) {
1261 shuf[i] = lp_build_const_int32(gallivm, i);
1262 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1263 }
1264 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1265 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1266 }
1267 else {
1268 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1269 }
1270 }
1271
1272
1273 /*
1274 * Widening mul, valid for 32x32 bit -> 64bit only.
1275 * Result is low 32bits, high bits returned in res_hi.
1276 *
1277 * Emits generic code.
1278 */
1279 LLVMValueRef
1280 lp_build_mul_32_lohi(struct lp_build_context *bld,
1281 LLVMValueRef a,
1282 LLVMValueRef b,
1283 LLVMValueRef *res_hi)
1284 {
1285 struct gallivm_state *gallivm = bld->gallivm;
1286 LLVMBuilderRef builder = gallivm->builder;
1287 LLVMValueRef tmp, shift, res_lo;
1288 struct lp_type type_tmp;
1289 LLVMTypeRef wide_type, narrow_type;
1290
1291 type_tmp = bld->type;
1292 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1293 type_tmp.width *= 2;
1294 wide_type = lp_build_vec_type(gallivm, type_tmp);
1295 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1296
1297 if (bld->type.sign) {
1298 a = LLVMBuildSExt(builder, a, wide_type, "");
1299 b = LLVMBuildSExt(builder, b, wide_type, "");
1300 } else {
1301 a = LLVMBuildZExt(builder, a, wide_type, "");
1302 b = LLVMBuildZExt(builder, b, wide_type, "");
1303 }
1304 tmp = LLVMBuildMul(builder, a, b, "");
1305
1306 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1307
1308 /* Since we truncate anyway, LShr and AShr are equivalent. */
1309 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1310 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1311
1312 return res_lo;
1313 }
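/*
 * Illustrative scalar equivalent of the generic path above (a sketch only,
 * not part of the gallivm API), shown for the unsigned case; the signed case
 * only differs in sign-extending the inputs:
 *
 *    uint32_t mul_32_lohi_ref(uint32_t a, uint32_t b, uint32_t *hi)
 *    {
 *       uint64_t prod = (uint64_t)a * b;   // zero-extend and multiply
 *       *hi = (uint32_t)(prod >> 32);      // high 32 bits
 *       return (uint32_t)prod;             // low 32 bits
 *    }
 */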
1314
1315
1316 /* a * b + c */
1317 LLVMValueRef
1318 lp_build_mad(struct lp_build_context *bld,
1319 LLVMValueRef a,
1320 LLVMValueRef b,
1321 LLVMValueRef c)
1322 {
1323 const struct lp_type type = bld->type;
1324 if (type.floating) {
1325 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1326 } else {
1327 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1328 }
1329 }
1330
1331
1332 /**
1333 * Small vector x scale multiplication optimization.
1334 */
1335 LLVMValueRef
1336 lp_build_mul_imm(struct lp_build_context *bld,
1337 LLVMValueRef a,
1338 int b)
1339 {
1340 LLVMBuilderRef builder = bld->gallivm->builder;
1341 LLVMValueRef factor;
1342
1343 assert(lp_check_value(bld->type, a));
1344
1345 if(b == 0)
1346 return bld->zero;
1347
1348 if(b == 1)
1349 return a;
1350
1351 if(b == -1)
1352 return lp_build_negate(bld, a);
1353
1354 if(b == 2 && bld->type.floating)
1355 return lp_build_add(bld, a, a);
1356
1357 if(util_is_power_of_two_or_zero(b)) {
1358 unsigned shift = ffs(b) - 1;
1359
1360 if(bld->type.floating) {
1361 #if 0
1362 /*
1363 * Power of two multiplication by directly manipulating the exponent.
1364 *
1365     * XXX: This might not always be faster, it will introduce a small error
1366 * for multiplication by zero, and it will produce wrong results
1367 * for Inf and NaN.
1368 */
1369 unsigned mantissa = lp_mantissa(bld->type);
1370 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1371 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1372 a = LLVMBuildAdd(builder, a, factor, "");
1373 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1374 return a;
1375 #endif
1376 }
1377 else {
1378 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1379 return LLVMBuildShl(builder, a, factor, "");
1380 }
1381 }
1382
1383 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1384 return lp_build_mul(bld, a, factor);
1385 }
1386
1387
1388 /**
1389 * Generate a / b
1390 */
1391 LLVMValueRef
1392 lp_build_div(struct lp_build_context *bld,
1393 LLVMValueRef a,
1394 LLVMValueRef b)
1395 {
1396 LLVMBuilderRef builder = bld->gallivm->builder;
1397 const struct lp_type type = bld->type;
1398
1399 assert(lp_check_value(type, a));
1400 assert(lp_check_value(type, b));
1401
1402 if(a == bld->zero)
1403 return bld->zero;
1404 if(a == bld->one && type.floating)
1405 return lp_build_rcp(bld, b);
1406 if(b == bld->zero)
1407 return bld->undef;
1408 if(b == bld->one)
1409 return a;
1410 if(a == bld->undef || b == bld->undef)
1411 return bld->undef;
1412
1413 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1414 if (type.floating)
1415 return LLVMConstFDiv(a, b);
1416 else if (type.sign)
1417 return LLVMConstSDiv(a, b);
1418 else
1419 return LLVMConstUDiv(a, b);
1420 }
1421
1422 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1423 if(FALSE &&
1424 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1425 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1426 type.floating)
1427 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1428
1429 if (type.floating)
1430 return LLVMBuildFDiv(builder, a, b, "");
1431 else if (type.sign)
1432 return LLVMBuildSDiv(builder, a, b, "");
1433 else
1434 return LLVMBuildUDiv(builder, a, b, "");
1435 }
1436
1437
1438 /**
1439 * Linear interpolation helper.
1440 *
1441  * @param flags  LP_BLD_LERP_x flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1442  *               interpolating normalized values, encoded in integers twice as wide.
1443 *
1444 * @sa http://www.stereopsis.com/doubleblend.html
1445 */
1446 static inline LLVMValueRef
1447 lp_build_lerp_simple(struct lp_build_context *bld,
1448 LLVMValueRef x,
1449 LLVMValueRef v0,
1450 LLVMValueRef v1,
1451 unsigned flags)
1452 {
1453 unsigned half_width = bld->type.width/2;
1454 LLVMBuilderRef builder = bld->gallivm->builder;
1455 LLVMValueRef delta;
1456 LLVMValueRef res;
1457
1458 assert(lp_check_value(bld->type, x));
1459 assert(lp_check_value(bld->type, v0));
1460 assert(lp_check_value(bld->type, v1));
1461
1462 delta = lp_build_sub(bld, v1, v0);
1463
1464 if (bld->type.floating) {
1465 assert(flags == 0);
1466 return lp_build_mad(bld, x, delta, v0);
1467 }
1468
1469 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1470 if (!bld->type.sign) {
1471 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1472 /*
1473 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1474                * most-significant-bit to the least-significant-bit, so that
1475 * later we can just divide by 2**n instead of 2**n - 1.
1476 */
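             /*
              * E.g. with 8-bit weights widened to 16 bits (half_width = 8):
              * x = 255 becomes 255 + (255 >> 7) = 256 = 2**8, so the shift by
              * half_width below divides by 256 exactly at the endpoints.
              */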
1477
1478 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1479 }
1480
1481 /* (x * delta) >> n */
1482 res = lp_build_mul(bld, x, delta);
1483 res = lp_build_shr_imm(bld, res, half_width);
1484 } else {
1485 /*
1486 * The rescaling trick above doesn't work for signed numbers, so
1487           * use the 2**n - 1 division approximation in lp_build_mul_norm
1488 * instead.
1489 */
1490 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1491 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1492 }
1493 } else {
1494 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1495 res = lp_build_mul(bld, x, delta);
1496 }
1497
1498 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1499 /*
1500 * At this point both res and v0 only use the lower half of the bits,
1501 * the rest is zero. Instead of add / mask, do add with half wide type.
1502 */
1503 struct lp_type narrow_type;
1504 struct lp_build_context narrow_bld;
1505
1506 memset(&narrow_type, 0, sizeof narrow_type);
1507 narrow_type.sign = bld->type.sign;
1508 narrow_type.width = bld->type.width/2;
1509 narrow_type.length = bld->type.length*2;
1510
1511 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1512 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1513 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1514 res = lp_build_add(&narrow_bld, v0, res);
1515 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1516 } else {
1517 res = lp_build_add(bld, v0, res);
1518
1519 if (bld->type.fixed) {
1520 /*
1521 * We need to mask out the high order bits when lerping 8bit
1522 * normalized colors stored on 16bits
1523 */
1524 /* XXX: This step is necessary for lerping 8bit colors stored on
1525 * 16bits, but it will be wrong for true fixed point use cases.
1526 * Basically we need a more powerful lp_type, capable of further
1527 * distinguishing the values interpretation from the value storage.
1528 */
1529 LLVMValueRef low_bits;
1530 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1531 res = LLVMBuildAnd(builder, res, low_bits, "");
1532 }
1533 }
1534
1535 return res;
1536 }
1537
1538
1539 /**
1540 * Linear interpolation.
1541 */
1542 LLVMValueRef
1543 lp_build_lerp(struct lp_build_context *bld,
1544 LLVMValueRef x,
1545 LLVMValueRef v0,
1546 LLVMValueRef v1,
1547 unsigned flags)
1548 {
1549 const struct lp_type type = bld->type;
1550 LLVMValueRef res;
1551
1552 assert(lp_check_value(type, x));
1553 assert(lp_check_value(type, v0));
1554 assert(lp_check_value(type, v1));
1555
1556 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1557
1558 if (type.norm) {
1559 struct lp_type wide_type;
1560 struct lp_build_context wide_bld;
1561 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1562
1563 assert(type.length >= 2);
1564
1565 /*
1566 * Create a wider integer type, enough to hold the
1567 * intermediate result of the multiplication.
1568 */
1569 memset(&wide_type, 0, sizeof wide_type);
1570 wide_type.sign = type.sign;
1571 wide_type.width = type.width*2;
1572 wide_type.length = type.length/2;
1573
1574 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1575
1576 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1577 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1578 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1579
1580 /*
1581 * Lerp both halves.
1582 */
1583
1584 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1585
1586 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1587 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1588
1589 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1590 } else {
1591 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1592 }
1593
1594 return res;
1595 }
1596
1597
1598 /**
1599 * Bilinear interpolation.
1600 *
1601  * Value indices are in v_{yx}.
1602 */
1603 LLVMValueRef
1604 lp_build_lerp_2d(struct lp_build_context *bld,
1605 LLVMValueRef x,
1606 LLVMValueRef y,
1607 LLVMValueRef v00,
1608 LLVMValueRef v01,
1609 LLVMValueRef v10,
1610 LLVMValueRef v11,
1611 unsigned flags)
1612 {
1613 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1614 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1615 return lp_build_lerp(bld, y, v0, v1, flags);
1616 }
1617
1618
1619 LLVMValueRef
1620 lp_build_lerp_3d(struct lp_build_context *bld,
1621 LLVMValueRef x,
1622 LLVMValueRef y,
1623 LLVMValueRef z,
1624 LLVMValueRef v000,
1625 LLVMValueRef v001,
1626 LLVMValueRef v010,
1627 LLVMValueRef v011,
1628 LLVMValueRef v100,
1629 LLVMValueRef v101,
1630 LLVMValueRef v110,
1631 LLVMValueRef v111,
1632 unsigned flags)
1633 {
1634 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1635 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1636 return lp_build_lerp(bld, z, v0, v1, flags);
1637 }
1638
1639
1640 /**
1641 * Generate min(a, b)
1642 * Do checks for special cases but not for nans.
1643 */
1644 LLVMValueRef
1645 lp_build_min(struct lp_build_context *bld,
1646 LLVMValueRef a,
1647 LLVMValueRef b)
1648 {
1649 assert(lp_check_value(bld->type, a));
1650 assert(lp_check_value(bld->type, b));
1651
1652 if(a == bld->undef || b == bld->undef)
1653 return bld->undef;
1654
1655 if(a == b)
1656 return a;
1657
1658 if (bld->type.norm) {
1659 if (!bld->type.sign) {
1660 if (a == bld->zero || b == bld->zero) {
1661 return bld->zero;
1662 }
1663 }
1664 if(a == bld->one)
1665 return b;
1666 if(b == bld->one)
1667 return a;
1668 }
1669
1670 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1671 }
1672
1673
1674 /**
1675 * Generate min(a, b)
1676 * NaN's are handled according to the behavior specified by the
1677 * nan_behavior argument.
1678 */
1679 LLVMValueRef
1680 lp_build_min_ext(struct lp_build_context *bld,
1681 LLVMValueRef a,
1682 LLVMValueRef b,
1683 enum gallivm_nan_behavior nan_behavior)
1684 {
1685 assert(lp_check_value(bld->type, a));
1686 assert(lp_check_value(bld->type, b));
1687
1688 if(a == bld->undef || b == bld->undef)
1689 return bld->undef;
1690
1691 if(a == b)
1692 return a;
1693
1694 if (bld->type.norm) {
1695 if (!bld->type.sign) {
1696 if (a == bld->zero || b == bld->zero) {
1697 return bld->zero;
1698 }
1699 }
1700 if(a == bld->one)
1701 return b;
1702 if(b == bld->one)
1703 return a;
1704 }
1705
1706 return lp_build_min_simple(bld, a, b, nan_behavior);
1707 }
1708
1709 /**
1710 * Generate max(a, b)
1711 * Do checks for special cases, but NaN behavior is undefined.
1712 */
1713 LLVMValueRef
1714 lp_build_max(struct lp_build_context *bld,
1715 LLVMValueRef a,
1716 LLVMValueRef b)
1717 {
1718 assert(lp_check_value(bld->type, a));
1719 assert(lp_check_value(bld->type, b));
1720
1721 if(a == bld->undef || b == bld->undef)
1722 return bld->undef;
1723
1724 if(a == b)
1725 return a;
1726
1727 if(bld->type.norm) {
1728 if(a == bld->one || b == bld->one)
1729 return bld->one;
1730 if (!bld->type.sign) {
1731 if (a == bld->zero) {
1732 return b;
1733 }
1734 if (b == bld->zero) {
1735 return a;
1736 }
1737 }
1738 }
1739
1740 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1741 }
1742
1743
1744 /**
1745 * Generate max(a, b)
1746 * Checks for special cases.
1747 * NaN's are handled according to the behavior specified by the
1748 * nan_behavior argument.
1749 */
1750 LLVMValueRef
1751 lp_build_max_ext(struct lp_build_context *bld,
1752 LLVMValueRef a,
1753 LLVMValueRef b,
1754 enum gallivm_nan_behavior nan_behavior)
1755 {
1756 assert(lp_check_value(bld->type, a));
1757 assert(lp_check_value(bld->type, b));
1758
1759 if(a == bld->undef || b == bld->undef)
1760 return bld->undef;
1761
1762 if(a == b)
1763 return a;
1764
1765 if(bld->type.norm) {
1766 if(a == bld->one || b == bld->one)
1767 return bld->one;
1768 if (!bld->type.sign) {
1769 if (a == bld->zero) {
1770 return b;
1771 }
1772 if (b == bld->zero) {
1773 return a;
1774 }
1775 }
1776 }
1777
1778 return lp_build_max_simple(bld, a, b, nan_behavior);
1779 }
1780
1781 /**
1782 * Generate clamp(a, min, max)
1783 * NaN behavior (for any of a, min, max) is undefined.
1784 * Do checks for special cases.
1785 */
1786 LLVMValueRef
1787 lp_build_clamp(struct lp_build_context *bld,
1788 LLVMValueRef a,
1789 LLVMValueRef min,
1790 LLVMValueRef max)
1791 {
1792 assert(lp_check_value(bld->type, a));
1793 assert(lp_check_value(bld->type, min));
1794 assert(lp_check_value(bld->type, max));
1795
1796 a = lp_build_min(bld, a, max);
1797 a = lp_build_max(bld, a, min);
1798 return a;
1799 }
1800
1801
1802 /**
1803 * Generate clamp(a, 0, 1)
1804 * A NaN will get converted to zero.
1805 */
1806 LLVMValueRef
1807 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1808 LLVMValueRef a)
1809 {
1810 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1811 a = lp_build_min(bld, a, bld->one);
1812 return a;
1813 }
1814
1815
1816 /**
1817 * Generate abs(a)
1818 */
1819 LLVMValueRef
1820 lp_build_abs(struct lp_build_context *bld,
1821 LLVMValueRef a)
1822 {
1823 LLVMBuilderRef builder = bld->gallivm->builder;
1824 const struct lp_type type = bld->type;
1825 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1826
1827 assert(lp_check_value(type, a));
1828
1829 if(!type.sign)
1830 return a;
1831
1832 if(type.floating) {
1833 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1834 /* Workaround llvm.org/PR27332 */
1835 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1836 unsigned long long absMask = ~(1ULL << (type.width - 1));
1837 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1838 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1839 a = LLVMBuildAnd(builder, a, mask, "");
1840 a = LLVMBuildBitCast(builder, a, vec_type, "");
1841 return a;
1842 } else {
1843 char intrinsic[32];
1844 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1845 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1846 }
1847 }
1848
1849 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1850 switch(type.width) {
1851 case 8:
1852 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1853 case 16:
1854 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1855 case 32:
1856 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1857 }
1858 }
1859 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1860 switch(type.width) {
1861 case 8:
1862 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1863 case 16:
1864 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1865 case 32:
1866 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1867 }
1868 }
1869
1870 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1871 a, LLVMBuildNeg(builder, a, ""));
1872 }
1873
1874
1875 LLVMValueRef
1876 lp_build_negate(struct lp_build_context *bld,
1877 LLVMValueRef a)
1878 {
1879 LLVMBuilderRef builder = bld->gallivm->builder;
1880
1881 assert(lp_check_value(bld->type, a));
1882
1883 if (bld->type.floating)
1884 a = LLVMBuildFNeg(builder, a, "");
1885 else
1886 a = LLVMBuildNeg(builder, a, "");
1887
1888 return a;
1889 }
1890
1891
1892 /** Return -1, 0 or +1 depending on the sign of a */
1893 LLVMValueRef
1894 lp_build_sgn(struct lp_build_context *bld,
1895 LLVMValueRef a)
1896 {
1897 LLVMBuilderRef builder = bld->gallivm->builder;
1898 const struct lp_type type = bld->type;
1899 LLVMValueRef cond;
1900 LLVMValueRef res;
1901
1902 assert(lp_check_value(type, a));
1903
1904 /* Handle non-zero case */
1905 if(!type.sign) {
1906 /* if not zero then sign must be positive */
1907 res = bld->one;
1908 }
1909 else if(type.floating) {
1910 LLVMTypeRef vec_type;
1911 LLVMTypeRef int_type;
1912 LLVMValueRef mask;
1913 LLVMValueRef sign;
1914 LLVMValueRef one;
1915 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1916
1917 int_type = lp_build_int_vec_type(bld->gallivm, type);
1918 vec_type = lp_build_vec_type(bld->gallivm, type);
1919 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1920
1921 /* Take the sign bit and or it into the constant 1.0 */
1922 sign = LLVMBuildBitCast(builder, a, int_type, "");
1923 sign = LLVMBuildAnd(builder, sign, mask, "");
1924 one = LLVMConstBitCast(bld->one, int_type);
1925 res = LLVMBuildOr(builder, sign, one, "");
1926 res = LLVMBuildBitCast(builder, res, vec_type, "");
1927 }
1928 else
1929 {
1930 /* signed int/norm/fixed point */
1931 /* could use psign with ssse3 and appropriate vectors here */
1932 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1933 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1934 res = lp_build_select(bld, cond, bld->one, minus_one);
1935 }
1936
1937 /* Handle zero */
1938 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1939 res = lp_build_select(bld, cond, bld->zero, res);
1940
1941 return res;
1942 }
1943
1944
1945 /**
1946 * Set the sign of float vector 'a' according to 'sign'.
1947 * If sign==0, return abs(a).
1948 * If sign==1, return -abs(a);
1949 * Other values for sign produce undefined results.
1950 */
1951 LLVMValueRef
1952 lp_build_set_sign(struct lp_build_context *bld,
1953 LLVMValueRef a, LLVMValueRef sign)
1954 {
1955 LLVMBuilderRef builder = bld->gallivm->builder;
1956 const struct lp_type type = bld->type;
1957 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1958 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1959 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1960 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1961 ~((unsigned long long) 1 << (type.width - 1)));
1962 LLVMValueRef val, res;
1963
1964 assert(type.floating);
1965 assert(lp_check_value(type, a));
1966
1967 /* val = reinterpret_cast<int>(a) */
1968 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1969 /* val = val & mask */
1970 val = LLVMBuildAnd(builder, val, mask, "");
1971 /* sign = sign << shift */
1972 sign = LLVMBuildShl(builder, sign, shift, "");
1973 /* res = val | sign */
1974 res = LLVMBuildOr(builder, val, sign, "");
1975 /* res = reinterpret_cast<float>(res) */
1976 res = LLVMBuildBitCast(builder, res, vec_type, "");
1977
1978 return res;
1979 }
1980
1981
1982 /**
1983 * Convert vector of (or scalar) int to vector of (or scalar) float.
1984 */
1985 LLVMValueRef
1986 lp_build_int_to_float(struct lp_build_context *bld,
1987 LLVMValueRef a)
1988 {
1989 LLVMBuilderRef builder = bld->gallivm->builder;
1990 const struct lp_type type = bld->type;
1991 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1992
1993 assert(type.floating);
1994
1995 return LLVMBuildSIToFP(builder, a, vec_type, "");
1996 }
1997
1998 static boolean
1999 arch_rounding_available(const struct lp_type type)
2000 {
2001 if ((util_cpu_caps.has_sse4_1 &&
2002 (type.length == 1 || type.width*type.length == 128)) ||
2003 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
2004 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
2005 return TRUE;
2006 else if ((util_cpu_caps.has_altivec &&
2007 (type.width == 32 && type.length == 4)))
2008 return TRUE;
2009 else if (util_cpu_caps.has_neon)
2010 return TRUE;
2011
2012 return FALSE;
2013 }
2014
2015 enum lp_build_round_mode
2016 {
2017 LP_BUILD_ROUND_NEAREST = 0,
2018 LP_BUILD_ROUND_FLOOR = 1,
2019 LP_BUILD_ROUND_CEIL = 2,
2020 LP_BUILD_ROUND_TRUNCATE = 3
2021 };
2022
2023 static inline LLVMValueRef
2024 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2025 LLVMValueRef a)
2026 {
2027 LLVMBuilderRef builder = bld->gallivm->builder;
2028 const struct lp_type type = bld->type;
2029 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2030 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2031 const char *intrinsic;
2032 LLVMValueRef res;
2033
2034 assert(type.floating);
2035 /* using the double precision conversions is a bit more complicated */
2036 assert(type.width == 32);
2037
2038 assert(lp_check_value(type, a));
2039 assert(util_cpu_caps.has_sse2);
2040
2041 /* This is relying on MXCSR rounding mode, which should always be nearest. */
2042 if (type.length == 1) {
2043 LLVMTypeRef vec_type;
2044 LLVMValueRef undef;
2045 LLVMValueRef arg;
2046 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2047
2048 vec_type = LLVMVectorType(bld->elem_type, 4);
2049
2050 intrinsic = "llvm.x86.sse.cvtss2si";
2051
2052 undef = LLVMGetUndef(vec_type);
2053
2054 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2055
2056 res = lp_build_intrinsic_unary(builder, intrinsic,
2057 ret_type, arg);
2058 }
2059 else {
2060 if (type.width* type.length == 128) {
2061 intrinsic = "llvm.x86.sse2.cvtps2dq";
2062 }
2063 else {
2064 assert(type.width*type.length == 256);
2065 assert(util_cpu_caps.has_avx);
2066
2067 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2068 }
2069 res = lp_build_intrinsic_unary(builder, intrinsic,
2070 ret_type, a);
2071 }
2072
2073 return res;
2074 }
2075
2076
2077 /* Round a float (vector) to an integral value using the AltiVec
2078 * vrfi{n,m,p,z} intrinsics, selected by the requested rounding mode. */
2079 static inline LLVMValueRef
2080 lp_build_round_altivec(struct lp_build_context *bld,
2081 LLVMValueRef a,
2082 enum lp_build_round_mode mode)
2083 {
2084 LLVMBuilderRef builder = bld->gallivm->builder;
2085 const struct lp_type type = bld->type;
2086 const char *intrinsic = NULL;
2087
2088 assert(type.floating);
2089
2090 assert(lp_check_value(type, a));
2091 assert(util_cpu_caps.has_altivec);
2092
2093 (void)type;
2094
2095 switch (mode) {
2096 case LP_BUILD_ROUND_NEAREST:
2097 intrinsic = "llvm.ppc.altivec.vrfin";
2098 break;
2099 case LP_BUILD_ROUND_FLOOR:
2100 intrinsic = "llvm.ppc.altivec.vrfim";
2101 break;
2102 case LP_BUILD_ROUND_CEIL:
2103 intrinsic = "llvm.ppc.altivec.vrfip";
2104 break;
2105 case LP_BUILD_ROUND_TRUNCATE:
2106 intrinsic = "llvm.ppc.altivec.vrfiz";
2107 break;
2108 }
2109
2110 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2111 }
2112
2113 static inline LLVMValueRef
2114 lp_build_round_arch(struct lp_build_context *bld,
2115 LLVMValueRef a,
2116 enum lp_build_round_mode mode)
2117 {
2118 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2119 LLVMBuilderRef builder = bld->gallivm->builder;
2120 const struct lp_type type = bld->type;
2121 const char *intrinsic_root;
2122 char intrinsic[32];
2123
2124 assert(type.floating);
2125 assert(lp_check_value(type, a));
2126 (void)type;
2127
2128 switch (mode) {
2129 case LP_BUILD_ROUND_NEAREST:
2130 intrinsic_root = "llvm.nearbyint";
2131 break;
2132 case LP_BUILD_ROUND_FLOOR:
2133 intrinsic_root = "llvm.floor";
2134 break;
2135 case LP_BUILD_ROUND_CEIL:
2136 intrinsic_root = "llvm.ceil";
2137 break;
2138 case LP_BUILD_ROUND_TRUNCATE:
2139 intrinsic_root = "llvm.trunc";
2140 break;
2141 }
2142
2143 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2144 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2145 }
2146 else /* (util_cpu_caps.has_altivec) */
2147 return lp_build_round_altivec(bld, a, mode);
2148 }
2149
2150 /**
2151 * Return the integer part of a float (vector) value (== round toward zero).
2152 * The returned value is a float (vector).
2153 * Ex: trunc(-1.5) = -1.0
2154 */
2155 LLVMValueRef
2156 lp_build_trunc(struct lp_build_context *bld,
2157 LLVMValueRef a)
2158 {
2159 LLVMBuilderRef builder = bld->gallivm->builder;
2160 const struct lp_type type = bld->type;
2161
2162 assert(type.floating);
2163 assert(lp_check_value(type, a));
2164
2165 if (arch_rounding_available(type)) {
2166 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2167 }
2168 else {
2169 const struct lp_type type = bld->type;
2170 struct lp_type inttype;
2171 struct lp_build_context intbld;
2172 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2173 LLVMValueRef trunc, res, anosign, mask;
2174 LLVMTypeRef int_vec_type = bld->int_vec_type;
2175 LLVMTypeRef vec_type = bld->vec_type;
2176
2177 assert(type.width == 32); /* might want to handle doubles at some point */
2178
2179 inttype = type;
2180 inttype.floating = 0;
2181 lp_build_context_init(&intbld, bld->gallivm, inttype);
2182
2183 /* round by truncation */
2184 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2185 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2186
2187 /* mask out sign bit */
2188 anosign = lp_build_abs(bld, a);
2189 /*
2190 * mask out all values if anosign > 2^24
2191 * This should work both for large ints (all rounding is no-op for them
2192 * because such floats are always exact) as well as special cases like
2193 * NaNs, Infs (taking advantage of the fact they use max exponent).
2194 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2195 */
2196 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2197 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2198 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2199 return lp_build_select(bld, mask, a, res);
2200 }
2201 }
2202
2203
2204 /**
2205 * Return float (vector) rounded to nearest integer (vector). The returned
2206 * value is a float (vector).
2207 * Ex: round(0.9) = 1.0
2208 * Ex: round(-1.5) = -2.0
2209 */
2210 LLVMValueRef
2211 lp_build_round(struct lp_build_context *bld,
2212 LLVMValueRef a)
2213 {
2214 LLVMBuilderRef builder = bld->gallivm->builder;
2215 const struct lp_type type = bld->type;
2216
2217 assert(type.floating);
2218 assert(lp_check_value(type, a));
2219
2220 if (arch_rounding_available(type)) {
2221 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2222 }
2223 else {
2224 const struct lp_type type = bld->type;
2225 struct lp_type inttype;
2226 struct lp_build_context intbld;
2227 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2228 LLVMValueRef res, anosign, mask;
2229 LLVMTypeRef int_vec_type = bld->int_vec_type;
2230 LLVMTypeRef vec_type = bld->vec_type;
2231
2232 assert(type.width == 32); /* might want to handle doubles at some point */
2233
2234 inttype = type;
2235 inttype.floating = 0;
2236 lp_build_context_init(&intbld, bld->gallivm, inttype);
2237
2238 res = lp_build_iround(bld, a);
2239 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2240
2241 /* mask out sign bit */
2242 anosign = lp_build_abs(bld, a);
2243 /*
2244 * mask out all values if anosign > 2^24
2245 * This should work both for large ints (all rounding is no-op for them
2246 * because such floats are always exact) as well as special cases like
2247 * NaNs, Infs (taking advantage of the fact they use max exponent).
2248 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2249 */
2250 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2251 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2252 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2253 return lp_build_select(bld, mask, a, res);
2254 }
2255 }
2256
2257
2258 /**
2259 * Return floor of float (vector), result is a float (vector)
2260 * Ex: floor(1.1) = 1.0
2261 * Ex: floor(-1.1) = -2.0
2262 */
2263 LLVMValueRef
2264 lp_build_floor(struct lp_build_context *bld,
2265 LLVMValueRef a)
2266 {
2267 LLVMBuilderRef builder = bld->gallivm->builder;
2268 const struct lp_type type = bld->type;
2269
2270 assert(type.floating);
2271 assert(lp_check_value(type, a));
2272
2273 if (arch_rounding_available(type)) {
2274 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2275 }
2276 else {
2277 const struct lp_type type = bld->type;
2278 struct lp_type inttype;
2279 struct lp_build_context intbld;
2280 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2281 LLVMValueRef trunc, res, anosign, mask;
2282 LLVMTypeRef int_vec_type = bld->int_vec_type;
2283 LLVMTypeRef vec_type = bld->vec_type;
2284
2285 if (type.width != 32) {
2286 char intrinsic[32];
2287 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2288 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2289 }
2290
2291 assert(type.width == 32); /* might want to handle doubles at some point */
2292
2293 inttype = type;
2294 inttype.floating = 0;
2295 lp_build_context_init(&intbld, bld->gallivm, inttype);
2296
2297 /* round by truncation */
2298 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2299 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2300
2301 if (type.sign) {
2302 LLVMValueRef tmp;
2303
2304 /*
2305 * fix values if rounding is wrong (for non-special cases)
2306 * - this is the case if trunc > a
2307 */
2308 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2309 /* tmp = trunc > a ? 1.0 : 0.0 */
2310 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2311 tmp = lp_build_and(&intbld, mask, tmp);
2312 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2313 res = lp_build_sub(bld, res, tmp);
2314 }
2315
2316 /* mask out sign bit */
2317 anosign = lp_build_abs(bld, a);
2318 /*
2319 * mask out all values if anosign > 2^24
2320 * This should work both for large ints (all rounding is no-op for them
2321 * because such floats are always exact) as well as special cases like
2322 * NaNs, Infs (taking advantage of the fact they use max exponent).
2323 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2324 */
2325 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2326 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2327 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2328 return lp_build_select(bld, mask, a, res);
2329 }
2330 }
2331
2332
2333 /**
2334 * Return ceiling of float (vector), returning float (vector).
2335 * Ex: ceil( 1.1) = 2.0
2336 * Ex: ceil(-1.1) = -1.0
2337 */
2338 LLVMValueRef
2339 lp_build_ceil(struct lp_build_context *bld,
2340 LLVMValueRef a)
2341 {
2342 LLVMBuilderRef builder = bld->gallivm->builder;
2343 const struct lp_type type = bld->type;
2344
2345 assert(type.floating);
2346 assert(lp_check_value(type, a));
2347
2348 if (arch_rounding_available(type)) {
2349 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2350 }
2351 else {
2352 const struct lp_type type = bld->type;
2353 struct lp_type inttype;
2354 struct lp_build_context intbld;
2355 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2356 LLVMValueRef trunc, res, anosign, mask, tmp;
2357 LLVMTypeRef int_vec_type = bld->int_vec_type;
2358 LLVMTypeRef vec_type = bld->vec_type;
2359
2360 if (type.width != 32) {
2361 char intrinsic[32];
2362 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2363 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2364 }
2365
2366 assert(type.width == 32); /* might want to handle doubles at some point */
2367
2368 inttype = type;
2369 inttype.floating = 0;
2370 lp_build_context_init(&intbld, bld->gallivm, inttype);
2371
2372 /* round by truncation */
2373 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2374 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2375
2376 /*
2377 * fix values if rounding is wrong (for non-special cases)
2378 * - this is the case if trunc < a
2379 */
2380 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2381 /* tmp = trunc < a ? 1.0 : 0.0 */
2382 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2383 tmp = lp_build_and(&intbld, mask, tmp);
2384 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2385 res = lp_build_add(bld, trunc, tmp);
2386
2387 /* mask out sign bit */
2388 anosign = lp_build_abs(bld, a);
2389 /*
2390 * mask out all values if anosign > 2^24
2391 * This should work both for large ints (all rounding is no-op for them
2392 * because such floats are always exact) as well as special cases like
2393 * NaNs, Infs (taking advantage of the fact they use max exponent).
2394 * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2395 */
2396 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2397 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2398 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2399 return lp_build_select(bld, mask, a, res);
2400 }
2401 }
2402
2403
2404 /**
2405 * Return fractional part of 'a' computed as a - floor(a)
2406 * Typically used in texture coord arithmetic.
2407 */
2408 LLVMValueRef
2409 lp_build_fract(struct lp_build_context *bld,
2410 LLVMValueRef a)
2411 {
2412 assert(bld->type.floating);
2413 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2414 }
2415
2416
2417 /**
2418 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2419 * against 0.99999(9). (Will also return that value for NaNs.)
2420 */
2421 static inline LLVMValueRef
2422 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2423 {
2424 LLVMValueRef max;
2425
2426 /* this is the largest number smaller than 1.0 representable as float */
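/*
* (For 32-bit floats lp_mantissa() is 23, so this evaluates to
* 1.0 - 2^-24 = 0.99999994..., i.e. nextafterf(1.0f, 0.0f).)
*/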
2427 max = lp_build_const_vec(bld->gallivm, bld->type,
2428 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2429 return lp_build_min_ext(bld, fract, max,
2430 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2431 }
2432
2433
2434 /**
2435 * Same as lp_build_fract, but guarantees that the result is always smaller
2436 * than one. Will also return the smaller-than-one value for infs, NaNs.
2437 */
2438 LLVMValueRef
2439 lp_build_fract_safe(struct lp_build_context *bld,
2440 LLVMValueRef a)
2441 {
2442 return clamp_fract(bld, lp_build_fract(bld, a));
2443 }
2444
2445
2446 /**
2447 * Return the integer part of a float (vector) value (== round toward zero).
2448 * The returned value is an integer (vector).
2449 * Ex: itrunc(-1.5) = -1
2450 */
2451 LLVMValueRef
2452 lp_build_itrunc(struct lp_build_context *bld,
2453 LLVMValueRef a)
2454 {
2455 LLVMBuilderRef builder = bld->gallivm->builder;
2456 const struct lp_type type = bld->type;
2457 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2458
2459 assert(type.floating);
2460 assert(lp_check_value(type, a));
2461
2462 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2463 }
2464
2465
2466 /**
2467 * Return float (vector) rounded to nearest integer (vector). The returned
2468 * value is an integer (vector).
2469 * Ex: iround(0.9) = 1
2470 * Ex: iround(-1.5) = -2
2471 */
2472 LLVMValueRef
2473 lp_build_iround(struct lp_build_context *bld,
2474 LLVMValueRef a)
2475 {
2476 LLVMBuilderRef builder = bld->gallivm->builder;
2477 const struct lp_type type = bld->type;
2478 LLVMTypeRef int_vec_type = bld->int_vec_type;
2479 LLVMValueRef res;
2480
2481 assert(type.floating);
2482
2483 assert(lp_check_value(type, a));
2484
2485 if ((util_cpu_caps.has_sse2 &&
2486 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2487 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2488 return lp_build_iround_nearest_sse2(bld, a);
2489 }
2490 if (arch_rounding_available(type)) {
2491 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2492 }
2493 else {
2494 LLVMValueRef half;
2495
2496 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2497
2498 if (type.sign) {
2499 LLVMTypeRef vec_type = bld->vec_type;
2500 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2501 (unsigned long long)1 << (type.width - 1));
2502 LLVMValueRef sign;
2503
2504 /* get sign bit */
2505 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2506 sign = LLVMBuildAnd(builder, sign, mask, "");
2507
2508 /* sign * 0.5 */
2509 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2510 half = LLVMBuildOr(builder, sign, half, "");
2511 half = LLVMBuildBitCast(builder, half, vec_type, "");
2512 }
2513
2514 res = LLVMBuildFAdd(builder, a, half, "");
2515 }
2516
2517 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2518
2519 return res;
2520 }
2521
2522
2523 /**
2524 * Return floor of float (vector), result is an int (vector)
2525 * Ex: ifloor(1.1) = 1
2526 * Ex: ifloor(-1.1) = -2
2527 */
2528 LLVMValueRef
2529 lp_build_ifloor(struct lp_build_context *bld,
2530 LLVMValueRef a)
2531 {
2532 LLVMBuilderRef builder = bld->gallivm->builder;
2533 const struct lp_type type = bld->type;
2534 LLVMTypeRef int_vec_type = bld->int_vec_type;
2535 LLVMValueRef res;
2536
2537 assert(type.floating);
2538 assert(lp_check_value(type, a));
2539
2540 res = a;
2541 if (type.sign) {
2542 if (arch_rounding_available(type)) {
2543 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2544 }
2545 else {
2546 struct lp_type inttype;
2547 struct lp_build_context intbld;
2548 LLVMValueRef trunc, itrunc, mask;
2549
2550 assert(type.floating);
2551 assert(lp_check_value(type, a));
2552
2553 inttype = type;
2554 inttype.floating = 0;
2555 lp_build_context_init(&intbld, bld->gallivm, inttype);
2556
2557 /* round by truncation */
2558 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2559 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2560
2561 /*
2562 * fix values if rounding is wrong (for non-special cases)
2563 * - this is the case if trunc > a
2564 * The results of doing this with NaNs, very large values etc.
2565 * are undefined but this seems to be the case anyway.
2566 */
2567 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2568 /* cheapie minus one with mask since the mask is minus one / zero */
2569 return lp_build_add(&intbld, itrunc, mask);
2570 }
2571 }
2572
2573 /* round to nearest (toward zero) */
2574 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2575
2576 return res;
2577 }
2578
2579
2580 /**
2581 * Return ceiling of float (vector), returning int (vector).
2582 * Ex: iceil( 1.1) = 2
2583 * Ex: iceil(-1.1) = -1
2584 */
2585 LLVMValueRef
2586 lp_build_iceil(struct lp_build_context *bld,
2587 LLVMValueRef a)
2588 {
2589 LLVMBuilderRef builder = bld->gallivm->builder;
2590 const struct lp_type type = bld->type;
2591 LLVMTypeRef int_vec_type = bld->int_vec_type;
2592 LLVMValueRef res;
2593
2594 assert(type.floating);
2595 assert(lp_check_value(type, a));
2596
2597 if (arch_rounding_available(type)) {
2598 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2599 }
2600 else {
2601 struct lp_type inttype;
2602 struct lp_build_context intbld;
2603 LLVMValueRef trunc, itrunc, mask;
2604
2605 assert(type.floating);
2606 assert(lp_check_value(type, a));
2607
2608 inttype = type;
2609 inttype.floating = 0;
2610 lp_build_context_init(&intbld, bld->gallivm, inttype);
2611
2612 /* round by truncation */
2613 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2614 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2615
2616 /*
2617 * fix values if rounding is wrong (for non-special cases)
2618 * - this is the case if trunc < a
2619 * The results of doing this with NaNs, very large values etc.
2620 * are undefined but this seems to be the case anyway.
2621 */
2622 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2623 /* cheapie plus one with mask since the mask is minus one / zero */
2624 return lp_build_sub(&intbld, itrunc, mask);
2625 }
2626
2627 /* round to nearest (toward zero) */
2628 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2629
2630 return res;
2631 }
2632
2633
2634 /**
2635 * Combined ifloor() & fract().
2636 *
2637 * Preferable to calling the functions separately, as it will ensure that the
2638 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2639 */
2640 void
2641 lp_build_ifloor_fract(struct lp_build_context *bld,
2642 LLVMValueRef a,
2643 LLVMValueRef *out_ipart,
2644 LLVMValueRef *out_fpart)
2645 {
2646 LLVMBuilderRef builder = bld->gallivm->builder;
2647 const struct lp_type type = bld->type;
2648 LLVMValueRef ipart;
2649
2650 assert(type.floating);
2651 assert(lp_check_value(type, a));
2652
2653 if (arch_rounding_available(type)) {
2654 /*
2655 * floor() is easier.
2656 */
2657
2658 ipart = lp_build_floor(bld, a);
2659 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2660 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2661 }
2662 else {
2663 /*
2664 * ifloor() is easier.
2665 */
2666
2667 *out_ipart = lp_build_ifloor(bld, a);
2668 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2669 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2670 }
2671 }
2672
2673
2674 /**
2675 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2676 * always smaller than one.
2677 */
2678 void
2679 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2680 LLVMValueRef a,
2681 LLVMValueRef *out_ipart,
2682 LLVMValueRef *out_fpart)
2683 {
2684 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2685 *out_fpart = clamp_fract(bld, *out_fpart);
2686 }
2687
2688
2689 LLVMValueRef
2690 lp_build_sqrt(struct lp_build_context *bld,
2691 LLVMValueRef a)
2692 {
2693 LLVMBuilderRef builder = bld->gallivm->builder;
2694 const struct lp_type type = bld->type;
2695 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2696 char intrinsic[32];
2697
2698 assert(lp_check_value(type, a));
2699
2700 assert(type.floating);
2701 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2702
2703 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2704 }
2705
2706
2707 /**
2708 * Do one Newton-Raphson step to improve reciprocal precision:
2709 *
2710 * x_{i+1} = x_i * (2 - a * x_i)
2711 *
2712 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2713 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2714 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2715 * halo. It would be necessary to clamp the argument to prevent this.
2716 *
2717 * See also:
2718 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2719 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2720 */
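/*
* As an illustrative scalar sketch (not part of the generated IR), one step
* amounts to:
*
*    rcp_a = rcp_a * (2.0f - a * rcp_a);
*
* e.g. for a = 3.0 and an initial estimate rcp_a = 0.33 this yields
* 0.33 * (2 - 0.99) = 0.3333, roughly squaring the relative error.
*/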
2721 static inline LLVMValueRef
2722 lp_build_rcp_refine(struct lp_build_context *bld,
2723 LLVMValueRef a,
2724 LLVMValueRef rcp_a)
2725 {
2726 LLVMBuilderRef builder = bld->gallivm->builder;
2727 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2728 LLVMValueRef res;
2729
2730 res = LLVMBuildFMul(builder, a, rcp_a, "");
2731 res = LLVMBuildFSub(builder, two, res, "");
2732 res = LLVMBuildFMul(builder, rcp_a, res, "");
2733
2734 return res;
2735 }
2736
2737
2738 LLVMValueRef
2739 lp_build_rcp(struct lp_build_context *bld,
2740 LLVMValueRef a)
2741 {
2742 LLVMBuilderRef builder = bld->gallivm->builder;
2743 const struct lp_type type = bld->type;
2744
2745 assert(lp_check_value(type, a));
2746
2747 if(a == bld->zero)
2748 return bld->undef;
2749 if(a == bld->one)
2750 return bld->one;
2751 if(a == bld->undef)
2752 return bld->undef;
2753
2754 assert(type.floating);
2755
2756 if(LLVMIsConstant(a))
2757 return LLVMConstFDiv(bld->one, a);
2758
2759 /*
2760 * We don't use RCPPS because:
2761 * - it only has 10 bits of precision
2762 * - it doesn't even get the reciprocal of 1.0 exactly
2763 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2764 * - for recent processors the benefit over DIVPS is marginal, and case
2765 * dependent
2766 *
2767 * We could still use it on certain processors if benchmarks show that the
2768 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2769 * particular uses that require fewer workarounds.
2770 */
2771
2772 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2773 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2774 const unsigned num_iterations = 0;
2775 LLVMValueRef res;
2776 unsigned i;
2777 const char *intrinsic = NULL;
2778
2779 if (type.length == 4) {
2780 intrinsic = "llvm.x86.sse.rcp.ps";
2781 }
2782 else {
2783 intrinsic = "llvm.x86.avx.rcp.ps.256";
2784 }
2785
2786 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2787
2788 for (i = 0; i < num_iterations; ++i) {
2789 res = lp_build_rcp_refine(bld, a, res);
2790 }
2791
2792 return res;
2793 }
2794
2795 return LLVMBuildFDiv(builder, bld->one, a, "");
2796 }
2797
2798
2799 /**
2800 * Do one Newton-Raphson step to improve rsqrt precision:
2801 *
2802 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2803 *
2804 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2805 */
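/*
* Illustrative scalar example: for a = 4.0 and an initial estimate
* rsqrt_a = 0.51, one step gives
* 0.5 * 0.51 * (3.0 - 4.0 * 0.51 * 0.51) = 0.4997, converging towards the
* exact 1/sqrt(4.0) = 0.5 with roughly squared error per step.
*/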
2806 static inline LLVMValueRef
2807 lp_build_rsqrt_refine(struct lp_build_context *bld,
2808 LLVMValueRef a,
2809 LLVMValueRef rsqrt_a)
2810 {
2811 LLVMBuilderRef builder = bld->gallivm->builder;
2812 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2813 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2814 LLVMValueRef res;
2815
2816 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2817 res = LLVMBuildFMul(builder, a, res, "");
2818 res = LLVMBuildFSub(builder, three, res, "");
2819 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2820 res = LLVMBuildFMul(builder, half, res, "");
2821
2822 return res;
2823 }
2824
2825
2826 /**
2827 * Generate 1/sqrt(a).
2828 * Result is undefined for values < 0, infinity for +0.
2829 */
2830 LLVMValueRef
2831 lp_build_rsqrt(struct lp_build_context *bld,
2832 LLVMValueRef a)
2833 {
2834 const struct lp_type type = bld->type;
2835
2836 assert(lp_check_value(type, a));
2837
2838 assert(type.floating);
2839
2840 /*
2841 * This should be faster but all denormals will end up as infinity.
2842 */
2843 if (0 && lp_build_fast_rsqrt_available(type)) {
2844 const unsigned num_iterations = 1;
2845 LLVMValueRef res;
2846 unsigned i;
2847
2848 /* rsqrt(1.0) != 1.0 here */
2849 res = lp_build_fast_rsqrt(bld, a);
2850
2851 if (num_iterations) {
2852 /*
2853 * Newton-Raphson will result in NaN instead of infinity for zero,
2854 * and NaN instead of zero for infinity.
2855 * Also, need to ensure rsqrt(1.0) == 1.0.
2856 * All numbers smaller than FLT_MIN will result in +infinity
2857 * (rsqrtps treats all denormals as zero).
2858 */
2859 LLVMValueRef cmp;
2860 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2861 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2862
2863 for (i = 0; i < num_iterations; ++i) {
2864 res = lp_build_rsqrt_refine(bld, a, res);
2865 }
2866 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2867 res = lp_build_select(bld, cmp, inf, res);
2868 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2869 res = lp_build_select(bld, cmp, bld->zero, res);
2870 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2871 res = lp_build_select(bld, cmp, bld->one, res);
2872 }
2873
2874 return res;
2875 }
2876
2877 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2878 }
2879
2880 /**
2881 * Returns true if a fast (but inaccurate) rsqrt instruction is available.
2882 * (The caller may want to avoid calling rsqrt_fast if it's not available;
2883 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2884 * unavailable it would result in sqrt/div/mul, so it is obviously
2885 * much better to just call sqrt, skipping both div and mul.)
2886 */
2887 boolean
2888 lp_build_fast_rsqrt_available(struct lp_type type)
2889 {
2890 assert(type.floating);
2891
2892 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2893 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2894 return true;
2895 }
2896 return false;
2897 }
2898
2899
2900 /**
2901 * Generate 1/sqrt(a).
2902 * Result is undefined for values < 0, infinity for +0.
2903 * Precision is limited, only ~10 bits guaranteed
2904 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2905 */
2906 LLVMValueRef
2907 lp_build_fast_rsqrt(struct lp_build_context *bld,
2908 LLVMValueRef a)
2909 {
2910 LLVMBuilderRef builder = bld->gallivm->builder;
2911 const struct lp_type type = bld->type;
2912
2913 assert(lp_check_value(type, a));
2914
2915 if (lp_build_fast_rsqrt_available(type)) {
2916 const char *intrinsic = NULL;
2917
2918 if (type.length == 4) {
2919 intrinsic = "llvm.x86.sse.rsqrt.ps";
2920 }
2921 else {
2922 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2923 }
2924 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2925 }
2926 else {
2927 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2928 }
2929 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2930 }
2931
2932
2933 /**
2934 * Generate sin(a) or cos(a) using polynomial approximation.
2935 * TODO: it might be worth recognizing sin and cos of the same source
2936 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2937 * would be way cheaper than calculating (nearly) everything twice...
2938 * Not sure it's common enough to be worth bothering with, however; the scs
2939 * opcode could also benefit from calculating both.
2940 */
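/*
* Rough outline (following the classic cephes/sse_mathfun approach visible
* below): take |a|, scale by 4/Pi to get an integer octant index, subtract
* the corresponding multiple of Pi/4 using the extended-precision constants
* DP1..DP3, evaluate either the cosine or the sine minimax polynomial on the
* reduced argument depending on the octant, and finally patch the sign bit
* back into the result.
*/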
2941 static LLVMValueRef
2942 lp_build_sin_or_cos(struct lp_build_context *bld,
2943 LLVMValueRef a,
2944 boolean cos)
2945 {
2946 struct gallivm_state *gallivm = bld->gallivm;
2947 LLVMBuilderRef b = gallivm->builder;
2948 struct lp_type int_type = lp_int_type(bld->type);
2949
2950 /*
2951 * take the absolute value,
2952 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2953 */
2954
2955 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2956 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2957
2958 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2959 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2960
2961 /*
2962 * scale by 4/Pi
2963 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2964 */
2965
2966 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2967 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2968
2969 /*
2970 * store the integer part of y in mm0
2971 * emm2 = _mm_cvttps_epi32(y);
2972 */
2973
2974 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2975
2976 /*
2977 * j=(j+1) & (~1) (see the cephes sources)
2978 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2979 */
2980
2981 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2982 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2983 /*
2984 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2985 */
2986 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2987 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2988
2989 /*
2990 * y = _mm_cvtepi32_ps(emm2);
2991 */
2992 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2993
2994 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2995 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2996 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2997 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2998
2999 /*
3000 * Argument used for poly selection and sign bit determination
3001 * is different for sin vs. cos.
3002 */
3003 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
3004 emm2_and;
3005
3006 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
3007 LLVMBuildNot(b, emm2_2, ""), ""),
3008 const_29, "sign_bit") :
3009 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
3010 LLVMBuildShl(b, emm2_add,
3011 const_29, ""), ""),
3012 sign_mask, "sign_bit");
3013
3014 /*
3015 * get the polynomial selection mask
3016 * there is one polynomial for 0 <= x <= Pi/4
3017 * and another one for Pi/4 < x <= Pi/2
3018 * Both branches will be computed.
3019 *
3020 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3021 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3022 */
3023
3024 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3025 LLVMValueRef poly_mask = lp_build_compare(gallivm,
3026 int_type, PIPE_FUNC_EQUAL,
3027 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3028
3029 /*
3030 * _PS_CONST(minus_cephes_DP1, -0.78515625);
3031 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3032 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3033 */
3034 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3035 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3036 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3037
3038 /*
3039 * The magic pass: "Extended precision modular arithmetic"
3040 * x = ((x - y * DP1) - y * DP2) - y * DP3;
3041 */
3042 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3043 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3044 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3045
3046 /*
3047 * Evaluate the first polynomial (0 <= x <= Pi/4)
3048 *
3049 * z = _mm_mul_ps(x,x);
3050 */
3051 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3052
3053 /*
3054 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3055 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3056 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3057 */
3058 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3059 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3060 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3061
3062 /*
3063 * y = *(v4sf*)_ps_coscof_p0;
3064 * y = _mm_mul_ps(y, z);
3065 */
3066 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3067 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3068 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3069 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3070
3071
3072 /*
3073 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3074 * y = _mm_sub_ps(y, tmp);
3075 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3076 */
3077 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3078 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3079 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3080 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3081 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3082
3083 /*
3084 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3085 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3086 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3087 */
3088 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3089 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3090 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3091
3092 /*
3093 * Evaluate the second polynomial (the sine approximation)
3094 *
3095 * y2 = *(v4sf*)_ps_sincof_p0;
3096 * y2 = _mm_mul_ps(y2, z);
3097 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3098 * y2 = _mm_mul_ps(y2, z);
3099 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3100 * y2 = _mm_mul_ps(y2, z);
3101 * y2 = _mm_mul_ps(y2, x);
3102 * y2 = _mm_add_ps(y2, x);
3103 */
3104
3105 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3106 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3107 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3108 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3109
3110 /*
3111 * select the correct result from the two polynomials
3112 * xmm3 = poly_mask;
3113 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3114 * y = _mm_andnot_ps(xmm3, y);
3115 * y = _mm_or_ps(y,y2);
3116 */
3117 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3118 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3119 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3120 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3121 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3122 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3123
3124 /*
3125 * update the sign
3126 * y = _mm_xor_ps(y, sign_bit);
3127 */
3128 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3129 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3130
3131 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3132
3133 /* clamp output to be within [-1, 1] */
3134 y_result = lp_build_clamp(bld, y_result,
3135 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3136 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3137 /* If a is -inf, inf or NaN then return NaN */
3138 y_result = lp_build_select(bld, isfinite, y_result,
3139 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3140 return y_result;
3141 }
3142
3143
3144 /**
3145 * Generate sin(a)
3146 */
3147 LLVMValueRef
3148 lp_build_sin(struct lp_build_context *bld,
3149 LLVMValueRef a)
3150 {
3151 return lp_build_sin_or_cos(bld, a, FALSE);
3152 }
3153
3154
3155 /**
3156 * Generate cos(a)
3157 */
3158 LLVMValueRef
3159 lp_build_cos(struct lp_build_context *bld,
3160 LLVMValueRef a)
3161 {
3162 return lp_build_sin_or_cos(bld, a, TRUE);
3163 }
3164
3165
3166 /**
3167 * Generate pow(x, y)
3168 */
3169 LLVMValueRef
3170 lp_build_pow(struct lp_build_context *bld,
3171 LLVMValueRef x,
3172 LLVMValueRef y)
3173 {
3174 /* TODO: optimize the constant case */
3175 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3176 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3177 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3178 __FUNCTION__);
3179 }
3180
3181 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3182 }
3183
3184
3185 /**
3186 * Generate exp(x)
3187 */
3188 LLVMValueRef
3189 lp_build_exp(struct lp_build_context *bld,
3190 LLVMValueRef x)
3191 {
3192 /* log2(e) = 1/log(2) */
3193 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3194 1.4426950408889634);
3195
3196 assert(lp_check_value(bld->type, x));
3197
3198 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3199 }
3200
3201
3202 /**
3203 * Generate log(x)
3204 * Behavior is undefined with infs, 0s and nans
3205 */
3206 LLVMValueRef
3207 lp_build_log(struct lp_build_context *bld,
3208 LLVMValueRef x)
3209 {
3210 /* log(2) */
3211 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3212 0.69314718055994529);
3213
3214 assert(lp_check_value(bld->type, x));
3215
3216 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3217 }
3218
3219 /**
3220 * Generate log(x) that handles edge cases (infs, 0s and nans)
3221 */
3222 LLVMValueRef
3223 lp_build_log_safe(struct lp_build_context *bld,
3224 LLVMValueRef x)
3225 {
3226 /* log(2) */
3227 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3228 0.69314718055994529);
3229
3230 assert(lp_check_value(bld->type, x));
3231
3232 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3233 }
3234
3235
3236 /**
3237 * Generate polynomial.
3238 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3239 */
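/*
* For example, with num_coeffs == 4 the loop below effectively computes,
* using x2 = x*x:
*
*    (coeffs[0] + x2 * coeffs[2]) + x * (coeffs[1] + x2 * coeffs[3])
*
* i.e. the even and odd terms are accumulated independently and combined
* with a single multiply-add at the end.
*/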
3240 LLVMValueRef
3241 lp_build_polynomial(struct lp_build_context *bld,
3242 LLVMValueRef x,
3243 const double *coeffs,
3244 unsigned num_coeffs)
3245 {
3246 const struct lp_type type = bld->type;
3247 LLVMValueRef even = NULL, odd = NULL;
3248 LLVMValueRef x2;
3249 unsigned i;
3250
3251 assert(lp_check_value(bld->type, x));
3252
3253 /* TODO: optimize the constant case */
3254 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3255 LLVMIsConstant(x)) {
3256 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3257 __FUNCTION__);
3258 }
3259
3260 /*
3261 * Calculate odd and even terms separately to decrease data dependency
3262 * Ex:
3263 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3264 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3265 */
3266 x2 = lp_build_mul(bld, x, x);
3267
3268 for (i = num_coeffs; i--; ) {
3269 LLVMValueRef coeff;
3270
3271 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3272
3273 if (i % 2 == 0) {
3274 if (even)
3275 even = lp_build_mad(bld, x2, even, coeff);
3276 else
3277 even = coeff;
3278 } else {
3279 if (odd)
3280 odd = lp_build_mad(bld, x2, odd, coeff);
3281 else
3282 odd = coeff;
3283 }
3284 }
3285
3286 if (odd)
3287 return lp_build_mad(bld, odd, x, even);
3288 else if (even)
3289 return even;
3290 else
3291 return bld->undef;
3292 }
3293
3294
3295 /**
3296 * Minimax polynomial fit of 2**x, in range [0, 1[
3297 */
3298 const double lp_build_exp2_polynomial[] = {
3299 #if EXP_POLY_DEGREE == 5
3300 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3301 0.693153073200168932794,
3302 0.240153617044375388211,
3303 0.0558263180532956664775,
3304 0.00898934009049466391101,
3305 0.00187757667519147912699
3306 #elif EXP_POLY_DEGREE == 4
3307 1.00000259337069434683,
3308 0.693003834469974940458,
3309 0.24144275689150793076,
3310 0.0520114606103070150235,
3311 0.0135341679161270268764
3312 #elif EXP_POLY_DEGREE == 3
3313 0.999925218562710312959,
3314 0.695833540494823811697,
3315 0.226067155427249155588,
3316 0.0780245226406372992967
3317 #elif EXP_POLY_DEGREE == 2
3318 1.00172476321474503578,
3319 0.657636275736077639316,
3320 0.33718943461968720704
3321 #else
3322 #error
3323 #endif
3324 };
3325
3326
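/**
* Generate 2^x.
*
* Computed as 2^x = 2^ipart * 2^fpart with ipart = floor(x) and
* fpart = x - ipart in [0, 1[: 2^ipart is constructed by integer
* manipulation of the IEEE-754 exponent field, 2^fpart via the minimax
* polynomial above.
*/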
3327 LLVMValueRef
3328 lp_build_exp2(struct lp_build_context *bld,
3329 LLVMValueRef x)
3330 {
3331 LLVMBuilderRef builder = bld->gallivm->builder;
3332 const struct lp_type type = bld->type;
3333 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3334 LLVMValueRef ipart = NULL;
3335 LLVMValueRef fpart = NULL;
3336 LLVMValueRef expipart = NULL;
3337 LLVMValueRef expfpart = NULL;
3338 LLVMValueRef res = NULL;
3339
3340 assert(lp_check_value(bld->type, x));
3341
3342 /* TODO: optimize the constant case */
3343 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3344 LLVMIsConstant(x)) {
3345 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3346 __FUNCTION__);
3347 }
3348
3349 assert(type.floating && type.width == 32);
3350
3351 /* We want to preserve NaN and make sure that for exp2, if x > 128
3352 * the result is INF, and if it's smaller than -126.9 the result is 0 */
3353 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3354 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3355 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3356 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3357
3358 /* ipart = floor(x) */
3359 /* fpart = x - ipart */
3360 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3361
3362 /* expipart = (float) (1 << ipart) */
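/*
* (Built directly in the IEEE-754 encoding: add the exponent bias 127 and
* shift into the exponent field, bits 30:23; e.g. ipart = 3 gives
* (3 + 127) << 23, which bitcasts to 8.0f.)
*/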
3363 expipart = LLVMBuildAdd(builder, ipart,
3364 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3365 expipart = LLVMBuildShl(builder, expipart,
3366 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3367 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3368
3369 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3370 ARRAY_SIZE(lp_build_exp2_polynomial));
3371
3372 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3373
3374 return res;
3375 }
3376
3377
3378
3379 /**
3380 * Extract the exponent of an IEEE-754 floating point value.
3381 *
3382 * Optionally apply an integer bias.
3383 *
3384 * Result is an integer value with
3385 *
3386 * ifloor(log2(x)) + bias
3387 */
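/*
* E.g. for x = 8.0 the stored exponent field is 130, so with bias = 0 the
* result is 130 - 127 = 3 == ifloor(log2(8.0)).
*/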
3388 LLVMValueRef
3389 lp_build_extract_exponent(struct lp_build_context *bld,
3390 LLVMValueRef x,
3391 int bias)
3392 {
3393 LLVMBuilderRef builder = bld->gallivm->builder;
3394 const struct lp_type type = bld->type;
3395 unsigned mantissa = lp_mantissa(type);
3396 LLVMValueRef res;
3397
3398 assert(type.floating);
3399
3400 assert(lp_check_value(bld->type, x));
3401
3402 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3403
3404 res = LLVMBuildLShr(builder, x,
3405 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3406 res = LLVMBuildAnd(builder, res,
3407 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3408 res = LLVMBuildSub(builder, res,
3409 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3410
3411 return res;
3412 }
3413
3414
3415 /**
3416 * Extract the mantissa of a floating point value.
3417 *
3418 * Result is a floating point value with
3419 *
3420 * x / 2**floor(log2(x))
3421 */
3422 LLVMValueRef
3423 lp_build_extract_mantissa(struct lp_build_context *bld,
3424 LLVMValueRef x)
3425 {
3426 LLVMBuilderRef builder = bld->gallivm->builder;
3427 const struct lp_type type = bld->type;
3428 unsigned mantissa = lp_mantissa(type);
3429 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3430 (1ULL << mantissa) - 1);
3431 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3432 LLVMValueRef res;
3433
3434 assert(lp_check_value(bld->type, x));
3435
3436 assert(type.floating);
3437
3438 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3439
3440 /* res = x / 2**ipart */
3441 res = LLVMBuildAnd(builder, x, mantmask, "");
3442 res = LLVMBuildOr(builder, res, one, "");
3443 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3444
3445 return res;
3446 }
3447
3448
3449
3450 /**
3451 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3452 * These coefficients can be generated with
3453 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3454 */
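/*
* The underlying identity (see lp_build_log2_approx below) is, with
* y = (m - 1) / (m + 1) for the mantissa m in [1, 2):
*
*    log2(m) = 2/ln(2) * atanh(y) = 2/ln(2) * (y + y^3/3 + y^5/5 + ...)
*
* hence the leading coefficient close to 2/ln(2) = 2.885390...
*/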
3455 const double lp_build_log2_polynomial[] = {
3456 #if LOG_POLY_DEGREE == 5
3457 2.88539008148777786488L,
3458 0.961796878841293367824L,
3459 0.577058946784739859012L,
3460 0.412914355135828735411L,
3461 0.308591899232910175289L,
3462 0.352376952300281371868L,
3463 #elif LOG_POLY_DEGREE == 4
3464 2.88539009343309178325L,
3465 0.961791550404184197881L,
3466 0.577440339438736392009L,
3467 0.403343858251329912514L,
3468 0.406718052498846252698L,
3469 #elif LOG_POLY_DEGREE == 3
3470 2.88538959748872753838L,
3471 0.961932915889597772928L,
3472 0.571118517972136195241L,
3473 0.493997535084709500285L,
3474 #else
3475 #error
3476 #endif
3477 };
3478
3479 /**
3480 * See http://www.devmaster.net/forums/showthread.php?p=43580
3481 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3482 * http://www.nezumi.demon.co.uk/consult/logx.htm
3483 *
3484 * If handle_edge_cases is true the function will perform computations
3485 * to match the required D3D10+ behavior for each of the edge cases.
3486 * That means that if input is:
3487 * - less than zero (down to and including -inf), then NaN will be returned
3488 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3489 * - +infinity, then +infinity will be returned
3490 * - NaN, then NaN will be returned
3491 *
3492 * Those checks are fairly expensive so if you don't need them make sure
3493 * handle_edge_cases is false.
3494 */
3495 void
3496 lp_build_log2_approx(struct lp_build_context *bld,
3497 LLVMValueRef x,
3498 LLVMValueRef *p_exp,
3499 LLVMValueRef *p_floor_log2,
3500 LLVMValueRef *p_log2,
3501 boolean handle_edge_cases)
3502 {
3503 LLVMBuilderRef builder = bld->gallivm->builder;
3504 const struct lp_type type = bld->type;
3505 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3506 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3507
3508 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3509 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3510 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3511
3512 LLVMValueRef i = NULL;
3513 LLVMValueRef y = NULL;
3514 LLVMValueRef z = NULL;
3515 LLVMValueRef exp = NULL;
3516 LLVMValueRef mant = NULL;
3517 LLVMValueRef logexp = NULL;
3518 LLVMValueRef p_z = NULL;
3519 LLVMValueRef res = NULL;
3520
3521 assert(lp_check_value(bld->type, x));
3522
3523 if(p_exp || p_floor_log2 || p_log2) {
3524 /* TODO: optimize the constant case */
3525 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3526 LLVMIsConstant(x)) {
3527 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3528 __FUNCTION__);
3529 }
3530
3531 assert(type.floating && type.width == 32);
3532
3533 /*
3534 * We don't explicitly handle denormalized numbers. They will yield a
3535 * result in the neighbourhood of -127, which appears to be adequate
3536 * enough.
3537 */
3538
3539 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3540
3541 /* exp = (float) exponent(x) */
3542 exp = LLVMBuildAnd(builder, i, expmask, "");
3543 }
3544
3545 if(p_floor_log2 || p_log2) {
3546 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3547 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3548 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3549 }
3550
3551 if (p_log2) {
3552 /* mant = 1 + (float) mantissa(x) */
3553 mant = LLVMBuildAnd(builder, i, mantmask, "");
3554 mant = LLVMBuildOr(builder, mant, one, "");
3555 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3556
3557 /* y = (mant - 1) / (mant + 1) */
3558 y = lp_build_div(bld,
3559 lp_build_sub(bld, mant, bld->one),
3560 lp_build_add(bld, mant, bld->one)
3561 );
3562
3563 /* z = y^2 */
3564 z = lp_build_mul(bld, y, y);
3565
3566 /* compute P(z) */
3567 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3568 ARRAY_SIZE(lp_build_log2_polynomial));
3569
3570 /* y * P(z) + logexp */
3571 res = lp_build_mad(bld, y, p_z, logexp);
3572
3573 if (type.floating && handle_edge_cases) {
3574 LLVMValueRef negmask, infmask, zmask;
3575 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3576 lp_build_const_vec(bld->gallivm, type, 0.0f));
3577 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3578 lp_build_const_vec(bld->gallivm, type, 0.0f));
3579 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3580 lp_build_const_vec(bld->gallivm, type, INFINITY));
3581
3582 /* If x is equal to inf make sure we return inf */
3583 res = lp_build_select(bld, infmask,
3584 lp_build_const_vec(bld->gallivm, type, INFINITY),
3585 res);
3586 /* If x is equal to 0, return -inf */
3587 res = lp_build_select(bld, zmask,
3588 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3589 res);
3590 /* If x is nan or less than 0, return nan */
3591 res = lp_build_select(bld, negmask,
3592 lp_build_const_vec(bld->gallivm, type, NAN),
3593 res);
3594 }
3595 }
3596
3597 if (p_exp) {
3598 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3599 *p_exp = exp;
3600 }
3601
3602 if (p_floor_log2)
3603 *p_floor_log2 = logexp;
3604
3605 if (p_log2)
3606 *p_log2 = res;
3607 }
3608
3609
3610 /*
3611 * log2 implementation which doesn't have special code to
3612 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3613 * the results for those cases are undefined.
3614 */
3615 LLVMValueRef
3616 lp_build_log2(struct lp_build_context *bld,
3617 LLVMValueRef x)
3618 {
3619 LLVMValueRef res;
3620 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3621 return res;
3622 }
3623
3624 /*
3625 * Version of log2 which handles all edge cases.
3626 * Look at documentation of lp_build_log2_approx for
3627 * description of the behavior for each of the edge cases.
3628 */
3629 LLVMValueRef
3630 lp_build_log2_safe(struct lp_build_context *bld,
3631 LLVMValueRef x)
3632 {
3633 LLVMValueRef res;
3634 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3635 return res;
3636 }
3637
3638
3639 /**
3640 * Faster (and less accurate) log2.
3641 *
3642 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3643 *
3644 * Piece-wise linear approximation, with exact results when x is a
3645 * power of two.
3646 *
3647 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3648 */
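/*
 * Worked example: for x = 6.0, floor(log2(6)) = 2, so ipart = 1.0 and
 * fpart = 6/4 = 1.5, giving 2.5 versus the exact log2(6) ~= 2.585;
 * for x = 8.0 the result is exactly 2.0 + 1.0 = 3.0.
 */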
3649 LLVMValueRef
3650 lp_build_fast_log2(struct lp_build_context *bld,
3651 LLVMValueRef x)
3652 {
3653 LLVMBuilderRef builder = bld->gallivm->builder;
3654 LLVMValueRef ipart;
3655 LLVMValueRef fpart;
3656
3657 assert(lp_check_value(bld->type, x));
3658
3659 assert(bld->type.floating);
3660
3661 /* ipart = floor(log2(x)) - 1 */
3662 ipart = lp_build_extract_exponent(bld, x, -1);
3663 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3664
3665    /* fpart = x / 2**floor(log2(x)), i.e. the mantissa in [1, 2) */
3666 fpart = lp_build_extract_mantissa(bld, x);
3667
3668 /* ipart + fpart */
3669 return LLVMBuildFAdd(builder, ipart, fpart, "");
3670 }
3671
3672
3673 /**
3674 * Fast implementation of iround(log2(x)).
3675 *
3676 * Not an approximation -- it should give accurate results all the time.
3677 */
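/*
 * The sqrt(2) scaling below is what turns truncation into rounding:
 * floor(log2(x * sqrt(2))) = floor(log2(x) + 0.5) = iround(log2(x)).
 * E.g. x = 5 (log2 ~= 2.32) gives 2, while x = 6 (log2 ~= 2.58) gives 3.
 */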
3678 LLVMValueRef
3679 lp_build_ilog2(struct lp_build_context *bld,
3680 LLVMValueRef x)
3681 {
3682 LLVMBuilderRef builder = bld->gallivm->builder;
3683 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3684 LLVMValueRef ipart;
3685
3686 assert(bld->type.floating);
3687
3688 assert(lp_check_value(bld->type, x));
3689
3690    /* x * 2^(0.5), i.e. add 0.5 to log2(x) */
3691 x = LLVMBuildFMul(builder, x, sqrt2, "");
3692
3693 /* ipart = floor(log2(x) + 0.5) */
3694 ipart = lp_build_extract_exponent(bld, x, 0);
3695
3696 return ipart;
3697 }
3698
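/*
 * Remainder of x / y.  Maps directly onto LLVM's frem / srem / urem:
 * for the floating point and signed cases the result takes the sign of
 * the dividend x (fmod-like), and y == 0 is undefined behaviour for the
 * integer variants.
 */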
3699 LLVMValueRef
3700 lp_build_mod(struct lp_build_context *bld,
3701 LLVMValueRef x,
3702 LLVMValueRef y)
3703 {
3704 LLVMBuilderRef builder = bld->gallivm->builder;
3705 LLVMValueRef res;
3706 const struct lp_type type = bld->type;
3707
3708 assert(lp_check_value(type, x));
3709 assert(lp_check_value(type, y));
3710
3711 if (type.floating)
3712 res = LLVMBuildFRem(builder, x, y, "");
3713 else if (type.sign)
3714 res = LLVMBuildSRem(builder, x, y, "");
3715 else
3716 res = LLVMBuildURem(builder, x, y, "");
3717 return res;
3718 }
3719
3720
3721 /*
3722 * For floating point inputs, returns a mask which is
3723 * all 1's for channels that are NaN and
3724 * all 0's for channels that are not NaN.
3725 */
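/*
 * Implementation note: NaN is the only value that compares unequal to
 * itself, so an ordered x == x compare is false exactly for NaN lanes;
 * the result is then inverted and sign-extended to a full-width mask.
 */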
3726 LLVMValueRef
3727 lp_build_isnan(struct lp_build_context *bld,
3728 LLVMValueRef x)
3729 {
3730 LLVMValueRef mask;
3731 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3732
3733 assert(bld->type.floating);
3734 assert(lp_check_value(bld->type, x));
3735
3736 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3737 "isnotnan");
3738 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3739 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3740 return mask;
3741 }
3742
3743 /* Returns all 1's for floating point numbers that are
3744 * finite, and all zeros for -inf, +inf and NaN
3745 * (non-floating types yield an all-zero vector). */
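/*
 * Implementation note: a binary32 value is finite iff its exponent field
 * is not all ones; 0x7f800000 masks exactly those bits, so the AND plus
 * not-equal compare below is true for every finite lane.
 */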
3746 LLVMValueRef
3747 lp_build_isfinite(struct lp_build_context *bld,
3748 LLVMValueRef x)
3749 {
3750 LLVMBuilderRef builder = bld->gallivm->builder;
3751 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3752 struct lp_type int_type = lp_int_type(bld->type);
3753 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3754 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3755 0x7f800000);
3756
3757 if (!bld->type.floating) {
3758 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3759 }
3760 assert(bld->type.floating);
3761 assert(lp_check_value(bld->type, x));
3762 assert(bld->type.width == 32);
3763
3764 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3765 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3766 intx, infornan32);
3767 }
3768
3769 /*
3770 * Returns true if the number is nan or inf and false otherwise.
3771 * The input has to be a floating point vector.
3772 */
3773 LLVMValueRef
3774 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3775 const struct lp_type type,
3776 LLVMValueRef x)
3777 {
3778 LLVMBuilderRef builder = gallivm->builder;
3779 struct lp_type int_type = lp_int_type(type);
3780 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3781 0x7f800000);
3782 LLVMValueRef ret;
3783
3784 assert(type.floating);
3785
3786 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3787 ret = LLVMBuildAnd(builder, ret, const0, "");
3788 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3789 ret, const0);
3790
3791 return ret;
3792 }
3793
3794
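/*
 * Typical usage sketch for the three fpstate helpers below (hypothetical
 * caller, shown for illustration only): save MXCSR, force denormals to
 * zero around a sensitive section, then restore the saved state:
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ...emit the code that needs FTZ/DAZ...
 *    lp_build_fpstate_set(gallivm, saved);
 */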
3795 LLVMValueRef
3796 lp_build_fpstate_get(struct gallivm_state *gallivm)
3797 {
3798 if (util_cpu_caps.has_sse) {
3799 LLVMBuilderRef builder = gallivm->builder;
3800 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3801 gallivm,
3802 LLVMInt32TypeInContext(gallivm->context),
3803 "mxcsr_ptr");
3804 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3805 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3806 lp_build_intrinsic(builder,
3807 "llvm.x86.sse.stmxcsr",
3808 LLVMVoidTypeInContext(gallivm->context),
3809 &mxcsr_ptr8, 1, 0);
3810 return mxcsr_ptr;
3811 }
3812 return 0;
3813 }
3814
3815 void
3816 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3817 boolean zero)
3818 {
3819 if (util_cpu_caps.has_sse) {
3820 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3821 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3822
3823 LLVMBuilderRef builder = gallivm->builder;
3824 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3825 LLVMValueRef mxcsr =
3826 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3827
3828 if (util_cpu_caps.has_daz) {
3829          /* Enable denormals-are-zero mode */
3830 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3831 }
3832 if (zero) {
3833 mxcsr = LLVMBuildOr(builder, mxcsr,
3834 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3835 } else {
3836 mxcsr = LLVMBuildAnd(builder, mxcsr,
3837 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3838 }
3839
3840 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3841 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3842 }
3843 }
3844
3845 void
3846 lp_build_fpstate_set(struct gallivm_state *gallivm,
3847 LLVMValueRef mxcsr_ptr)
3848 {
3849 if (util_cpu_caps.has_sse) {
3850 LLVMBuilderRef builder = gallivm->builder;
3851 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3852 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3853 lp_build_intrinsic(builder,
3854 "llvm.x86.sse.ldmxcsr",
3855 LLVMVoidTypeInContext(gallivm->context),
3856 &mxcsr_ptr, 1, 0);
3857 }
3858 }