gallivm: Return true from arch_rounding_available() if NEON is available
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36  * resort to machine-specific intrinsics directly. The functions here hide all
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
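/*
 * Illustrative usage sketch (not part of this file): callers typically set up
 * an lp_build_context for the type they operate on and then use the
 * lp_build_* helpers below instead of emitting raw LLVM IR themselves:
 *
 *    struct lp_build_context bld;
 *    lp_build_context_init(&bld, gallivm, lp_type_float_vec(32, 128));
 *    LLVMValueRef ab      = lp_build_mul(&bld, a, b);
 *    LLVMValueRef clamped = lp_build_min(&bld, ab, bld.one);
 *
 * The type-constructor name used here is an assumption for illustration only;
 * see lp_bld_type.h for the actual helpers.
 */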
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85  * No checks are done for the special case values a or b = 0 or 1.
86  * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212       /* We need to handle NaNs for floating point numbers. If one of the
213        * inputs is NaN the other should be returned (required by both D3D10+
214        * and OpenCL).
215        * The sse intrinsics return the second operand in case of NaN by
216        * default, so we need special code to handle those cases.
217 */
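      /*
       * Illustrative note on the SSE semantics assumed above: min(a, b) is
       * computed per lane as a < b ? a : b, so min(NaN, x) yields x while
       * min(x, NaN) yields NaN; the second operand is returned whenever the
       * comparison is false, hence the explicit isnan/select fixup below.
       */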
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289       /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290        * not supported, and instead it falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303  * No checks are done for the special case values a or b = 0 or 1.
304  * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359       if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if (a == bld->zero)
545 return b;
546 if (b == bld->zero)
547 return a;
548 if (a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if (type.norm) {
552 const char *intrinsic = NULL;
553
554 if (!type.sign && (a == bld->one || b == bld->one))
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if (util_cpu_caps.has_sse2) {
560 if (type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
562 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
563 if (type.width == 16)
564 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
565 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
566 } else if (util_cpu_caps.has_altivec) {
567 if (type.width == 8)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
569 if (type.width == 16)
570 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
571 }
572 }
573 if (type.width * type.length == 256) {
574 if (util_cpu_caps.has_avx2) {
575 if (type.width == 8)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
577 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
578 if (type.width == 16)
579 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
580 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
581 }
582 }
583 }
584
585 if (intrinsic)
586 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
587 }
588
589 if(type.norm && !type.floating && !type.fixed) {
590 if (type.sign) {
591 uint64_t sign = (uint64_t)1 << (type.width - 1);
592 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
593 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
594 /* a_clamp_max is the maximum a for positive b,
595 a_clamp_min is the minimum a for negative b. */
596 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
598 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
599 }
600 }
601
602 if(LLVMIsConstant(a) && LLVMIsConstant(b))
603 if (type.floating)
604 res = LLVMConstFAdd(a, b);
605 else
606 res = LLVMConstAdd(a, b);
607 else
608 if (type.floating)
609 res = LLVMBuildFAdd(builder, a, b, "");
610 else
611 res = LLVMBuildAdd(builder, a, b, "");
612
613 /* clamp to ceiling of 1.0 */
614 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
615 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
616
617 if (type.norm && !type.floating && !type.fixed) {
618 if (!type.sign) {
619 /*
620 * newer llvm versions no longer support the intrinsics, but recognize
621 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
622 * code, it is important we match the pattern llvm uses (and pray llvm
623 * doesn't change it - and hope they decide on the same pattern for
624 * all backends supporting it...).
625 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
626 * interfere with llvm's ability to recognize the pattern but seems
627 * a bit brittle.
628           */
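         /*
          * Worked example (illustrative, 8-bit unorm): a = 200, b = 100 wraps
          * to res = 44; since a > res the addition overflowed, and the select
          * below replaces res with all-ones (255), i.e. the saturated value.
          */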
629 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
630 res = lp_build_select(bld, overflowed,
631 LLVMConstAllOnes(bld->int_vec_type), res);
632 }
633 }
634
635 /* XXX clamp to floor of -1 or 0??? */
636
637 return res;
638 }
639
640
641 /** Return the scalar sum of the elements of a.
642 * Should avoid this operation whenever possible.
643 */
644 LLVMValueRef
645 lp_build_horizontal_add(struct lp_build_context *bld,
646 LLVMValueRef a)
647 {
648 LLVMBuilderRef builder = bld->gallivm->builder;
649 const struct lp_type type = bld->type;
650 LLVMValueRef index, res;
651 unsigned i, length;
652 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
653 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
654 LLVMValueRef vecres, elem2;
655
656 assert(lp_check_value(type, a));
657
658 if (type.length == 1) {
659 return a;
660 }
661
662 assert(!bld->type.norm);
663
664 /*
665     * for byte vectors we could do much better with psadbw.
666 * Using repeated shuffle/adds here. Note with multiple vectors
667 * this can be done more efficiently as outlined in the intel
668 * optimization manual.
669 * Note: could cause data rearrangement if used with smaller element
670 * sizes.
671 */
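   /*
    * Illustrative reduction (added note) for a length-4 vector [a b c d]:
    * the first pass adds [a b] + [c d] = [a+c b+d], and the final extract/add
    * below produces the scalar (a+c) + (b+d).
    */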
672
673 vecres = a;
674 length = type.length / 2;
675 while (length > 1) {
676 LLVMValueRef vec1, vec2;
677 for (i = 0; i < length; i++) {
678 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
679 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
680 }
681 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
682 LLVMConstVector(shuffles1, length), "");
683 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
684 LLVMConstVector(shuffles2, length), "");
685 if (type.floating) {
686 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
687 }
688 else {
689 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
690 }
691 length = length >> 1;
692 }
693
694 /* always have vector of size 2 here */
695 assert(length == 1);
696
697 index = lp_build_const_int32(bld->gallivm, 0);
698 res = LLVMBuildExtractElement(builder, vecres, index, "");
699 index = lp_build_const_int32(bld->gallivm, 1);
700 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
701
702 if (type.floating)
703 res = LLVMBuildFAdd(builder, res, elem2, "");
704 else
705 res = LLVMBuildAdd(builder, res, elem2, "");
706
707 return res;
708 }
709
710 /**
711 * Return the horizontal sums of 4 float vectors as a float4 vector.
712  * This uses the technique as outlined in the Intel Optimization Manual.
713 */
714 static LLVMValueRef
715 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
716 LLVMValueRef src[4])
717 {
718 struct gallivm_state *gallivm = bld->gallivm;
719 LLVMBuilderRef builder = gallivm->builder;
720 LLVMValueRef shuffles[4];
721 LLVMValueRef tmp[4];
722 LLVMValueRef sumtmp[2], shuftmp[2];
723
724 /* lower half of regs */
725 shuffles[0] = lp_build_const_int32(gallivm, 0);
726 shuffles[1] = lp_build_const_int32(gallivm, 1);
727 shuffles[2] = lp_build_const_int32(gallivm, 4);
728 shuffles[3] = lp_build_const_int32(gallivm, 5);
729 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
730 LLVMConstVector(shuffles, 4), "");
731 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
732 LLVMConstVector(shuffles, 4), "");
733
734 /* upper half of regs */
735 shuffles[0] = lp_build_const_int32(gallivm, 2);
736 shuffles[1] = lp_build_const_int32(gallivm, 3);
737 shuffles[2] = lp_build_const_int32(gallivm, 6);
738 shuffles[3] = lp_build_const_int32(gallivm, 7);
739 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
740 LLVMConstVector(shuffles, 4), "");
741 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
742 LLVMConstVector(shuffles, 4), "");
743
744 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
745 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
746
747 shuffles[0] = lp_build_const_int32(gallivm, 0);
748 shuffles[1] = lp_build_const_int32(gallivm, 2);
749 shuffles[2] = lp_build_const_int32(gallivm, 4);
750 shuffles[3] = lp_build_const_int32(gallivm, 6);
751 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
752 LLVMConstVector(shuffles, 4), "");
753
754 shuffles[0] = lp_build_const_int32(gallivm, 1);
755 shuffles[1] = lp_build_const_int32(gallivm, 3);
756 shuffles[2] = lp_build_const_int32(gallivm, 5);
757 shuffles[3] = lp_build_const_int32(gallivm, 7);
758 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
759 LLVMConstVector(shuffles, 4), "");
760
761 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
762 }
763
764
765 /*
766 * partially horizontally add 2-4 float vectors with length nx4,
767 * i.e. only four adjacent values in each vector will be added,
768 * assuming values are really grouped in 4 which also determines
769 * output order.
770 *
771 * Return a vector of the same length as the initial vectors,
772 * with the excess elements (if any) being undefined.
773 * The element order is independent of number of input vectors.
774 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
775 * the output order thus will be
776  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
777 */
778 LLVMValueRef
779 lp_build_hadd_partial4(struct lp_build_context *bld,
780 LLVMValueRef vectors[],
781 unsigned num_vecs)
782 {
783 struct gallivm_state *gallivm = bld->gallivm;
784 LLVMBuilderRef builder = gallivm->builder;
785 LLVMValueRef ret_vec;
786 LLVMValueRef tmp[4];
787 const char *intrinsic = NULL;
788
789 assert(num_vecs >= 2 && num_vecs <= 4);
790 assert(bld->type.floating);
791
792 /* only use this with at least 2 vectors, as it is sort of expensive
793 * (depending on cpu) and we always need two horizontal adds anyway,
794 * so a shuffle/add approach might be better.
795 */
796
797 tmp[0] = vectors[0];
798 tmp[1] = vectors[1];
799
800 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
801 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
802
803 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
804 bld->type.length == 4) {
805 intrinsic = "llvm.x86.sse3.hadd.ps";
806 }
807 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
808 bld->type.length == 8) {
809 intrinsic = "llvm.x86.avx.hadd.ps.256";
810 }
811 if (intrinsic) {
812 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
813 lp_build_vec_type(gallivm, bld->type),
814 tmp[0], tmp[1]);
815 if (num_vecs > 2) {
816 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
817 lp_build_vec_type(gallivm, bld->type),
818 tmp[2], tmp[3]);
819 }
820 else {
821 tmp[1] = tmp[0];
822 }
823 return lp_build_intrinsic_binary(builder, intrinsic,
824 lp_build_vec_type(gallivm, bld->type),
825 tmp[0], tmp[1]);
826 }
827
828 if (bld->type.length == 4) {
829 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
830 }
831 else {
832 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
833 unsigned j;
834 unsigned num_iter = bld->type.length / 4;
835 struct lp_type parttype = bld->type;
836 parttype.length = 4;
837 for (j = 0; j < num_iter; j++) {
838 LLVMValueRef partsrc[4];
839 unsigned i;
840 for (i = 0; i < 4; i++) {
841 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
842 }
843 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
844 }
845 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
846 }
847 return ret_vec;
848 }
849
850 /**
851 * Generate a - b
852 */
853 LLVMValueRef
854 lp_build_sub(struct lp_build_context *bld,
855 LLVMValueRef a,
856 LLVMValueRef b)
857 {
858 LLVMBuilderRef builder = bld->gallivm->builder;
859 const struct lp_type type = bld->type;
860 LLVMValueRef res;
861
862 assert(lp_check_value(type, a));
863 assert(lp_check_value(type, b));
864
865 if (b == bld->zero)
866 return a;
867 if (a == bld->undef || b == bld->undef)
868 return bld->undef;
869 if (a == b)
870 return bld->zero;
871
872 if (type.norm) {
873 const char *intrinsic = NULL;
874
875 if (!type.sign && b == bld->one)
876 return bld->zero;
877
878 if (!type.floating && !type.fixed) {
879 if (type.width * type.length == 128) {
880 if (util_cpu_caps.has_sse2) {
881 if (type.width == 8)
882 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
883 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
884 if (type.width == 16)
885 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
886 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
887 } else if (util_cpu_caps.has_altivec) {
888 if (type.width == 8)
889 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
890 if (type.width == 16)
891 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
892 }
893 }
894 if (type.width * type.length == 256) {
895 if (util_cpu_caps.has_avx2) {
896 if (type.width == 8)
897 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
898 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
899 if (type.width == 16)
900 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
901 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
902 }
903 }
904 }
905
906 if (intrinsic)
907 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
908 }
909
910 if(type.norm && !type.floating && !type.fixed) {
911 if (type.sign) {
912 uint64_t sign = (uint64_t)1 << (type.width - 1);
913 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
914 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
915 /* a_clamp_max is the maximum a for negative b,
916 a_clamp_min is the minimum a for positive b. */
917 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
918 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
919 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
920 } else {
921 /*
922 * This must match llvm pattern for saturated unsigned sub.
923 * (lp_build_max_simple actually does the job with its current
924 * definition but do it explicitly here.)
925 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
926 * interfere with llvm's ability to recognize the pattern but seems
927 * a bit brittle.
928 */
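         /*
          * Worked example (illustrative, 8-bit unorm): a = 100, b = 200 fails
          * the a > b test, so a is replaced by b and the subtraction below
          * yields 0, i.e. the result saturates at zero instead of wrapping.
          */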
929 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
930 a = lp_build_select(bld, no_ov, a, b);
931 }
932 }
933
934 if(LLVMIsConstant(a) && LLVMIsConstant(b))
935 if (type.floating)
936 res = LLVMConstFSub(a, b);
937 else
938 res = LLVMConstSub(a, b);
939 else
940 if (type.floating)
941 res = LLVMBuildFSub(builder, a, b, "");
942 else
943 res = LLVMBuildSub(builder, a, b, "");
944
945 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
946 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
947
948 return res;
949 }
950
951
952
953 /**
954 * Normalized multiplication.
955 *
956 * There are several approaches for (using 8-bit normalized multiplication as
957 * an example):
958 *
959 * - alpha plus one
960 *
961 * makes the following approximation to the division (Sree)
962 *
963  *   a*b/255 ~= (a*(b + 1)) >> 8
964 *
965 * which is the fastest method that satisfies the following OpenGL criteria of
966 *
967 * 0*0 = 0 and 255*255 = 255
968 *
969 * - geometric series
970 *
971 * takes the geometric series approximation to the division
972 *
973 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
974 *
975 * in this case just the first two terms to fit in 16bit arithmetic
976 *
977 * t/255 ~= (t + (t >> 8)) >> 8
978 *
979  * note that just by itself it doesn't satisfy the OpenGL criteria, as it
980  * gives 255*255 = 254, so the special case b = 255 must be accounted for, or
981  * rounding must be used.
982 *
983 * - geometric series plus rounding
984 *
985  * when using the geometric series division, instead of truncating the result,
986  * use rounding in the approximation (Jim Blinn)
987  *
988  *   t/255 ~= (t + (t >> 8) + 0x80) >> 8
989  *
990  * which achieves exact results.
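 *
 * Worked check (added for illustration): for a = b = 255, t = 65025 and
 * t >> 8 = 254, so (65025 + 254 + 0x80) >> 8 = 65407 >> 8 = 255, whereas
 * plain truncation, (65025 + 254) >> 8, would give 254.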
991 *
992 *
993 *
994 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
995 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
996 * @sa Michael Herf, The "double blend trick", May 2000,
997 * http://www.stereopsis.com/doubleblend.html
998 */
999 LLVMValueRef
1000 lp_build_mul_norm(struct gallivm_state *gallivm,
1001 struct lp_type wide_type,
1002 LLVMValueRef a, LLVMValueRef b)
1003 {
1004 LLVMBuilderRef builder = gallivm->builder;
1005 struct lp_build_context bld;
1006 unsigned n;
1007 LLVMValueRef half;
1008 LLVMValueRef ab;
1009
1010 assert(!wide_type.floating);
1011 assert(lp_check_value(wide_type, a));
1012 assert(lp_check_value(wide_type, b));
1013
1014 lp_build_context_init(&bld, gallivm, wide_type);
1015
1016 n = wide_type.width / 2;
1017 if (wide_type.sign) {
1018 --n;
1019 }
1020
1021 /*
1022 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1023 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1024 */
1025
1026 /*
1027 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1028 */
1029
1030 ab = LLVMBuildMul(builder, a, b, "");
1031 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1032
1033 /*
1034 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1035 */
1036
1037 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1038 if (wide_type.sign) {
1039 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1040 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1041 half = lp_build_select(&bld, sign, minus_half, half);
1042 }
1043 ab = LLVMBuildAdd(builder, ab, half, "");
1044
1045 /* Final division */
1046 ab = lp_build_shr_imm(&bld, ab, n);
1047
1048 return ab;
1049 }
1050
1051 /**
1052 * Generate a * b
1053 */
1054 LLVMValueRef
1055 lp_build_mul(struct lp_build_context *bld,
1056 LLVMValueRef a,
1057 LLVMValueRef b)
1058 {
1059 LLVMBuilderRef builder = bld->gallivm->builder;
1060 const struct lp_type type = bld->type;
1061 LLVMValueRef shift;
1062 LLVMValueRef res;
1063
1064 assert(lp_check_value(type, a));
1065 assert(lp_check_value(type, b));
1066
1067 if(a == bld->zero)
1068 return bld->zero;
1069 if(a == bld->one)
1070 return b;
1071 if(b == bld->zero)
1072 return bld->zero;
1073 if(b == bld->one)
1074 return a;
1075 if(a == bld->undef || b == bld->undef)
1076 return bld->undef;
1077
1078 if (!type.floating && !type.fixed && type.norm) {
1079 struct lp_type wide_type = lp_wider_type(type);
1080 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1081
1082 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1083 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1084
1085 /* PMULLW, PSRLW, PADDW */
1086 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1087 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1088
1089 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1090
1091 return ab;
1092 }
1093
1094 if(type.fixed)
1095 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1096 else
1097 shift = NULL;
1098
1099 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1100 if (type.floating)
1101 res = LLVMConstFMul(a, b);
1102 else
1103 res = LLVMConstMul(a, b);
1104 if(shift) {
1105 if(type.sign)
1106 res = LLVMConstAShr(res, shift);
1107 else
1108 res = LLVMConstLShr(res, shift);
1109 }
1110 }
1111 else {
1112 if (type.floating)
1113 res = LLVMBuildFMul(builder, a, b, "");
1114 else
1115 res = LLVMBuildMul(builder, a, b, "");
1116 if(shift) {
1117 if(type.sign)
1118 res = LLVMBuildAShr(builder, res, shift, "");
1119 else
1120 res = LLVMBuildLShr(builder, res, shift, "");
1121 }
1122 }
1123
1124 return res;
1125 }
1126
1127 /*
1128 * Widening mul, valid for 32x32 bit -> 64bit only.
1129 * Result is low 32bits, high bits returned in res_hi.
1130 *
1131 * Emits code that is meant to be compiled for the host CPU.
1132 */
1133 LLVMValueRef
1134 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1135 LLVMValueRef a,
1136 LLVMValueRef b,
1137 LLVMValueRef *res_hi)
1138 {
1139 struct gallivm_state *gallivm = bld->gallivm;
1140 LLVMBuilderRef builder = gallivm->builder;
1141
1142 assert(bld->type.width == 32);
1143 assert(bld->type.floating == 0);
1144 assert(bld->type.fixed == 0);
1145 assert(bld->type.norm == 0);
1146
1147 /*
1148 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1149 * for x86 simd is atrocious (even if the high bits weren't required),
1150 * trying to handle real 64bit inputs (which of course can't happen due
1151 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1152 * apparently llvm does not recognize this widening mul). This includes 6
1153 * (instead of 2) pmuludq plus extra adds and shifts
1154 * The same story applies to signed mul, albeit fixing this requires sse41.
1155 * https://llvm.org/bugs/show_bug.cgi?id=30845
1156 * So, whip up our own code, albeit only for length 4 and 8 (which
1157 * should be good enough)...
1158 */
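   /*
    * Sketch of the lane layout used below (illustrative): for a length-4
    * vector a = <a0 a1 a2 a3>, aodd becomes <a1 undef a3 undef>. pmuludq
    * (and its signed sse41 counterpart pmuldq) multiply the even 32-bit
    * lanes of their operands into 64-bit products, so muleven holds a0*b0
    * and a2*b2 while mulodd holds a1*b1 and a3*b3; the final shuffles then
    * re-interleave the low and high 32-bit halves of those products.
    */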
1159 if ((bld->type.length == 4 || bld->type.length == 8) &&
1160 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1161 util_cpu_caps.has_sse4_1)) {
1162 const char *intrinsic = NULL;
1163 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1164 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1165 struct lp_type type_wide = lp_wider_type(bld->type);
1166 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1167 unsigned i;
1168 for (i = 0; i < bld->type.length; i += 2) {
1169 shuf[i] = lp_build_const_int32(gallivm, i+1);
1170 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1171 }
1172 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1173 aeven = a;
1174 beven = b;
1175 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1176 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1177
1178 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1179 if (bld->type.sign) {
1180 intrinsic = "llvm.x86.avx2.pmul.dq";
1181 } else {
1182 intrinsic = "llvm.x86.avx2.pmulu.dq";
1183 }
1184 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1185 wider_type, aeven, beven);
1186 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1187 wider_type, aodd, bodd);
1188 }
1189 else {
1190 /* for consistent naming look elsewhere... */
1191 if (bld->type.sign) {
1192 intrinsic = "llvm.x86.sse41.pmuldq";
1193 } else {
1194 intrinsic = "llvm.x86.sse2.pmulu.dq";
1195 }
1196 /*
1197 * XXX If we only have AVX but not AVX2 this is a pain.
1198 * lp_build_intrinsic_binary_anylength() can't handle it
1199 * (due to src and dst type not being identical).
1200 */
1201 if (bld->type.length == 8) {
1202 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1203 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1204 LLVMValueRef muleven2[2], mulodd2[2];
1205 struct lp_type type_wide_half = type_wide;
1206 LLVMTypeRef wtype_half;
1207 type_wide_half.length = 2;
1208 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1209 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1210 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1211 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1212 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1213 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1214 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1215 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1216 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1217 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1218 wtype_half, aevenlo, bevenlo);
1219 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1220 wtype_half, aoddlo, boddlo);
1221 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1222 wtype_half, aevenhi, bevenhi);
1223 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1224 wtype_half, aoddhi, boddhi);
1225 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1226 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1227
1228 }
1229 else {
1230 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1231 wider_type, aeven, beven);
1232 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1233 wider_type, aodd, bodd);
1234 }
1235 }
1236 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1237 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1238
1239 for (i = 0; i < bld->type.length; i += 2) {
1240 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1241 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1242 }
1243 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1244 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1245
1246 for (i = 0; i < bld->type.length; i += 2) {
1247 shuf[i] = lp_build_const_int32(gallivm, i);
1248 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1249 }
1250 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1251 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1252 }
1253 else {
1254 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1255 }
1256 }
1257
1258
1259 /*
1260 * Widening mul, valid for 32x32 bit -> 64bit only.
1261 * Result is low 32bits, high bits returned in res_hi.
1262 *
1263 * Emits generic code.
1264 */
1265 LLVMValueRef
1266 lp_build_mul_32_lohi(struct lp_build_context *bld,
1267 LLVMValueRef a,
1268 LLVMValueRef b,
1269 LLVMValueRef *res_hi)
1270 {
1271 struct gallivm_state *gallivm = bld->gallivm;
1272 LLVMBuilderRef builder = gallivm->builder;
1273 LLVMValueRef tmp, shift, res_lo;
1274 struct lp_type type_tmp;
1275 LLVMTypeRef wide_type, narrow_type;
1276
1277 type_tmp = bld->type;
1278 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1279 type_tmp.width *= 2;
1280 wide_type = lp_build_vec_type(gallivm, type_tmp);
1281 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1282
1283 if (bld->type.sign) {
1284 a = LLVMBuildSExt(builder, a, wide_type, "");
1285 b = LLVMBuildSExt(builder, b, wide_type, "");
1286 } else {
1287 a = LLVMBuildZExt(builder, a, wide_type, "");
1288 b = LLVMBuildZExt(builder, b, wide_type, "");
1289 }
1290 tmp = LLVMBuildMul(builder, a, b, "");
1291
1292 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1293
1294 /* Since we truncate anyway, LShr and AShr are equivalent. */
1295 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1296 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1297
1298 return res_lo;
1299 }
1300
1301
1302 /* a * b + c */
1303 LLVMValueRef
1304 lp_build_mad(struct lp_build_context *bld,
1305 LLVMValueRef a,
1306 LLVMValueRef b,
1307 LLVMValueRef c)
1308 {
1309 const struct lp_type type = bld->type;
1310 if (type.floating) {
1311 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1312 } else {
1313 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1314 }
1315 }
1316
1317
1318 /**
1319 * Small vector x scale multiplication optimization.
1320 */
1321 LLVMValueRef
1322 lp_build_mul_imm(struct lp_build_context *bld,
1323 LLVMValueRef a,
1324 int b)
1325 {
1326 LLVMBuilderRef builder = bld->gallivm->builder;
1327 LLVMValueRef factor;
1328
1329 assert(lp_check_value(bld->type, a));
1330
1331 if(b == 0)
1332 return bld->zero;
1333
1334 if(b == 1)
1335 return a;
1336
1337 if(b == -1)
1338 return lp_build_negate(bld, a);
1339
1340 if(b == 2 && bld->type.floating)
1341 return lp_build_add(bld, a, a);
1342
1343 if(util_is_power_of_two_or_zero(b)) {
1344 unsigned shift = ffs(b) - 1;
1345
1346 if(bld->type.floating) {
1347 #if 0
1348 /*
1349 * Power of two multiplication by directly manipulating the exponent.
1350 *
1351 * XXX: This might not be always faster, it will introduce a small error
1352 * for multiplication by zero, and it will produce wrong results
1353 * for Inf and NaN.
1354 */
1355 unsigned mantissa = lp_mantissa(bld->type);
1356 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1357 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1358 a = LLVMBuildAdd(builder, a, factor, "");
1359 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1360 return a;
1361 #endif
1362 }
1363 else {
1364 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1365 return LLVMBuildShl(builder, a, factor, "");
1366 }
1367 }
1368
1369 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1370 return lp_build_mul(bld, a, factor);
1371 }
1372
1373
1374 /**
1375 * Generate a / b
1376 */
1377 LLVMValueRef
1378 lp_build_div(struct lp_build_context *bld,
1379 LLVMValueRef a,
1380 LLVMValueRef b)
1381 {
1382 LLVMBuilderRef builder = bld->gallivm->builder;
1383 const struct lp_type type = bld->type;
1384
1385 assert(lp_check_value(type, a));
1386 assert(lp_check_value(type, b));
1387
1388 if(a == bld->zero)
1389 return bld->zero;
1390 if(a == bld->one && type.floating)
1391 return lp_build_rcp(bld, b);
1392 if(b == bld->zero)
1393 return bld->undef;
1394 if(b == bld->one)
1395 return a;
1396 if(a == bld->undef || b == bld->undef)
1397 return bld->undef;
1398
1399 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1400 if (type.floating)
1401 return LLVMConstFDiv(a, b);
1402 else if (type.sign)
1403 return LLVMConstSDiv(a, b);
1404 else
1405 return LLVMConstUDiv(a, b);
1406 }
1407
1408 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1409 if(FALSE &&
1410 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1411 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1412 type.floating)
1413 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1414
1415 if (type.floating)
1416 return LLVMBuildFDiv(builder, a, b, "");
1417 else if (type.sign)
1418 return LLVMBuildSDiv(builder, a, b, "");
1419 else
1420 return LLVMBuildUDiv(builder, a, b, "");
1421 }
1422
1423
1424 /**
1425 * Linear interpolation helper.
1426 *
1427  * @param flags  LP_BLD_LERP_WIDE_NORMALIZED indicates we are interpolating
1428  *        normalized values encoded in integers twice their natural width.
1429 *
1430 * @sa http://www.stereopsis.com/doubleblend.html
1431 */
1432 static inline LLVMValueRef
1433 lp_build_lerp_simple(struct lp_build_context *bld,
1434 LLVMValueRef x,
1435 LLVMValueRef v0,
1436 LLVMValueRef v1,
1437 unsigned flags)
1438 {
1439 unsigned half_width = bld->type.width/2;
1440 LLVMBuilderRef builder = bld->gallivm->builder;
1441 LLVMValueRef delta;
1442 LLVMValueRef res;
1443
1444 assert(lp_check_value(bld->type, x));
1445 assert(lp_check_value(bld->type, v0));
1446 assert(lp_check_value(bld->type, v1));
1447
1448 delta = lp_build_sub(bld, v1, v0);
1449
1450 if (bld->type.floating) {
1451 assert(flags == 0);
1452 return lp_build_mad(bld, x, delta, v0);
1453 }
1454
1455 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1456 if (!bld->type.sign) {
1457 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1458 /*
1459 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1460 * most-significant-bit to the lowest-significant-bit, so that
1461 * later we can just divide by 2**n instead of 2**n - 1.
1462 */
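            /*
             * Worked example (illustrative): with half_width = 8, x = 255
             * becomes 255 + (255 >> 7) = 256, so the shift by 8 further down
             * divides by 256 exactly rather than approximating division by 255.
             */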
1463
1464 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1465 }
1466
1467 /* (x * delta) >> n */
1468 res = lp_build_mul(bld, x, delta);
1469 res = lp_build_shr_imm(bld, res, half_width);
1470 } else {
1471 /*
1472 * The rescaling trick above doesn't work for signed numbers, so
1473           * use the 2**n - 1 division approximation in lp_build_mul_norm
1474 * instead.
1475 */
1476 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1477 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1478 }
1479 } else {
1480 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1481 res = lp_build_mul(bld, x, delta);
1482 }
1483
1484 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1485 /*
1486 * At this point both res and v0 only use the lower half of the bits,
1487 * the rest is zero. Instead of add / mask, do add with half wide type.
1488 */
1489 struct lp_type narrow_type;
1490 struct lp_build_context narrow_bld;
1491
1492 memset(&narrow_type, 0, sizeof narrow_type);
1493 narrow_type.sign = bld->type.sign;
1494 narrow_type.width = bld->type.width/2;
1495 narrow_type.length = bld->type.length*2;
1496
1497 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1498 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1499 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1500 res = lp_build_add(&narrow_bld, v0, res);
1501 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1502 } else {
1503 res = lp_build_add(bld, v0, res);
1504
1505 if (bld->type.fixed) {
1506 /*
1507           * We need to mask out the high order bits when lerping 8-bit
1508           * normalized colors stored in 16 bits.
1509           */
1510          /* XXX: This step is necessary for lerping 8-bit colors stored in
1511           * 16 bits, but it will be wrong for true fixed point use cases.
1512 * Basically we need a more powerful lp_type, capable of further
1513 * distinguishing the values interpretation from the value storage.
1514 */
1515 LLVMValueRef low_bits;
1516 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1517 res = LLVMBuildAnd(builder, res, low_bits, "");
1518 }
1519 }
1520
1521 return res;
1522 }
1523
1524
1525 /**
1526 * Linear interpolation.
1527 */
1528 LLVMValueRef
1529 lp_build_lerp(struct lp_build_context *bld,
1530 LLVMValueRef x,
1531 LLVMValueRef v0,
1532 LLVMValueRef v1,
1533 unsigned flags)
1534 {
1535 const struct lp_type type = bld->type;
1536 LLVMValueRef res;
1537
1538 assert(lp_check_value(type, x));
1539 assert(lp_check_value(type, v0));
1540 assert(lp_check_value(type, v1));
1541
1542 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1543
1544 if (type.norm) {
1545 struct lp_type wide_type;
1546 struct lp_build_context wide_bld;
1547 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1548
1549 assert(type.length >= 2);
1550
1551 /*
1552 * Create a wider integer type, enough to hold the
1553 * intermediate result of the multiplication.
1554 */
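      /*
       * Illustrative example (added note): an 8-bit normalized vector of
       * length 16 is unpacked below into low/high 16-bit vectors of length 8,
       * lerped in the wider type, and packed back into the original type.
       */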
1555 memset(&wide_type, 0, sizeof wide_type);
1556 wide_type.sign = type.sign;
1557 wide_type.width = type.width*2;
1558 wide_type.length = type.length/2;
1559
1560 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1561
1562 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1563 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1564 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1565
1566 /*
1567 * Lerp both halves.
1568 */
1569
1570 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1571
1572 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1573 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1574
1575 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1576 } else {
1577 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1578 }
1579
1580 return res;
1581 }
1582
1583
1584 /**
1585 * Bilinear interpolation.
1586 *
1587  * Value indices are in v_{yx}.
1588 */
1589 LLVMValueRef
1590 lp_build_lerp_2d(struct lp_build_context *bld,
1591 LLVMValueRef x,
1592 LLVMValueRef y,
1593 LLVMValueRef v00,
1594 LLVMValueRef v01,
1595 LLVMValueRef v10,
1596 LLVMValueRef v11,
1597 unsigned flags)
1598 {
1599 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1600 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1601 return lp_build_lerp(bld, y, v0, v1, flags);
1602 }
1603
1604
1605 LLVMValueRef
1606 lp_build_lerp_3d(struct lp_build_context *bld,
1607 LLVMValueRef x,
1608 LLVMValueRef y,
1609 LLVMValueRef z,
1610 LLVMValueRef v000,
1611 LLVMValueRef v001,
1612 LLVMValueRef v010,
1613 LLVMValueRef v011,
1614 LLVMValueRef v100,
1615 LLVMValueRef v101,
1616 LLVMValueRef v110,
1617 LLVMValueRef v111,
1618 unsigned flags)
1619 {
1620 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1621 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1622 return lp_build_lerp(bld, z, v0, v1, flags);
1623 }
1624
1625
1626 /**
1627 * Generate min(a, b)
1628  * Do checks for special cases but not for NaNs.
1629 */
1630 LLVMValueRef
1631 lp_build_min(struct lp_build_context *bld,
1632 LLVMValueRef a,
1633 LLVMValueRef b)
1634 {
1635 assert(lp_check_value(bld->type, a));
1636 assert(lp_check_value(bld->type, b));
1637
1638 if(a == bld->undef || b == bld->undef)
1639 return bld->undef;
1640
1641 if(a == b)
1642 return a;
1643
1644 if (bld->type.norm) {
1645 if (!bld->type.sign) {
1646 if (a == bld->zero || b == bld->zero) {
1647 return bld->zero;
1648 }
1649 }
1650 if(a == bld->one)
1651 return b;
1652 if(b == bld->one)
1653 return a;
1654 }
1655
1656 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1657 }
1658
1659
1660 /**
1661 * Generate min(a, b)
1662  * NaNs are handled according to the behavior specified by the
1663 * nan_behavior argument.
1664 */
1665 LLVMValueRef
1666 lp_build_min_ext(struct lp_build_context *bld,
1667 LLVMValueRef a,
1668 LLVMValueRef b,
1669 enum gallivm_nan_behavior nan_behavior)
1670 {
1671 assert(lp_check_value(bld->type, a));
1672 assert(lp_check_value(bld->type, b));
1673
1674 if(a == bld->undef || b == bld->undef)
1675 return bld->undef;
1676
1677 if(a == b)
1678 return a;
1679
1680 if (bld->type.norm) {
1681 if (!bld->type.sign) {
1682 if (a == bld->zero || b == bld->zero) {
1683 return bld->zero;
1684 }
1685 }
1686 if(a == bld->one)
1687 return b;
1688 if(b == bld->one)
1689 return a;
1690 }
1691
1692 return lp_build_min_simple(bld, a, b, nan_behavior);
1693 }
1694
1695 /**
1696 * Generate max(a, b)
1697 * Do checks for special cases, but NaN behavior is undefined.
1698 */
1699 LLVMValueRef
1700 lp_build_max(struct lp_build_context *bld,
1701 LLVMValueRef a,
1702 LLVMValueRef b)
1703 {
1704 assert(lp_check_value(bld->type, a));
1705 assert(lp_check_value(bld->type, b));
1706
1707 if(a == bld->undef || b == bld->undef)
1708 return bld->undef;
1709
1710 if(a == b)
1711 return a;
1712
1713 if(bld->type.norm) {
1714 if(a == bld->one || b == bld->one)
1715 return bld->one;
1716 if (!bld->type.sign) {
1717 if (a == bld->zero) {
1718 return b;
1719 }
1720 if (b == bld->zero) {
1721 return a;
1722 }
1723 }
1724 }
1725
1726 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1727 }
1728
1729
1730 /**
1731 * Generate max(a, b)
1732 * Checks for special cases.
1733  * NaNs are handled according to the behavior specified by the
1734 * nan_behavior argument.
1735 */
1736 LLVMValueRef
1737 lp_build_max_ext(struct lp_build_context *bld,
1738 LLVMValueRef a,
1739 LLVMValueRef b,
1740 enum gallivm_nan_behavior nan_behavior)
1741 {
1742 assert(lp_check_value(bld->type, a));
1743 assert(lp_check_value(bld->type, b));
1744
1745 if(a == bld->undef || b == bld->undef)
1746 return bld->undef;
1747
1748 if(a == b)
1749 return a;
1750
1751 if(bld->type.norm) {
1752 if(a == bld->one || b == bld->one)
1753 return bld->one;
1754 if (!bld->type.sign) {
1755 if (a == bld->zero) {
1756 return b;
1757 }
1758 if (b == bld->zero) {
1759 return a;
1760 }
1761 }
1762 }
1763
1764 return lp_build_max_simple(bld, a, b, nan_behavior);
1765 }
1766
1767 /**
1768 * Generate clamp(a, min, max)
1769 * NaN behavior (for any of a, min, max) is undefined.
1770 * Do checks for special cases.
1771 */
1772 LLVMValueRef
1773 lp_build_clamp(struct lp_build_context *bld,
1774 LLVMValueRef a,
1775 LLVMValueRef min,
1776 LLVMValueRef max)
1777 {
1778 assert(lp_check_value(bld->type, a));
1779 assert(lp_check_value(bld->type, min));
1780 assert(lp_check_value(bld->type, max));
1781
1782 a = lp_build_min(bld, a, max);
1783 a = lp_build_max(bld, a, min);
1784 return a;
1785 }
1786
1787
1788 /**
1789 * Generate clamp(a, 0, 1)
1790 * A NaN will get converted to zero.
1791 */
1792 LLVMValueRef
1793 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1794 LLVMValueRef a)
1795 {
1796 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1797 a = lp_build_min(bld, a, bld->one);
1798 return a;
1799 }
1800
1801
1802 /**
1803 * Generate abs(a)
1804 */
1805 LLVMValueRef
1806 lp_build_abs(struct lp_build_context *bld,
1807 LLVMValueRef a)
1808 {
1809 LLVMBuilderRef builder = bld->gallivm->builder;
1810 const struct lp_type type = bld->type;
1811 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1812
1813 assert(lp_check_value(type, a));
1814
1815 if(!type.sign)
1816 return a;
1817
1818 if(type.floating) {
1819 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1820 /* Workaround llvm.org/PR27332 */
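         /*
          * Illustrative values (added note): for 32-bit floats the mask built
          * from absMask below is 0x7fffffff, so the AND clears only the sign
          * bit and leaves the magnitude (and any NaN payload) untouched.
          */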
1821 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1822 unsigned long long absMask = ~(1ULL << (type.width - 1));
1823 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1824 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1825 a = LLVMBuildAnd(builder, a, mask, "");
1826 a = LLVMBuildBitCast(builder, a, vec_type, "");
1827 return a;
1828 } else {
1829 char intrinsic[32];
1830 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1831 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1832 }
1833 }
1834
1835 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1836 switch(type.width) {
1837 case 8:
1838 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1839 case 16:
1840 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1841 case 32:
1842 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1843 }
1844 }
1845 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1846 switch(type.width) {
1847 case 8:
1848 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1849 case 16:
1850 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1851 case 32:
1852 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1853 }
1854 }
1855
1856 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1857 a, LLVMBuildNeg(builder, a, ""));
1858 }
1859
1860
1861 LLVMValueRef
1862 lp_build_negate(struct lp_build_context *bld,
1863 LLVMValueRef a)
1864 {
1865 LLVMBuilderRef builder = bld->gallivm->builder;
1866
1867 assert(lp_check_value(bld->type, a));
1868
1869 if (bld->type.floating)
1870 a = LLVMBuildFNeg(builder, a, "");
1871 else
1872 a = LLVMBuildNeg(builder, a, "");
1873
1874 return a;
1875 }
1876
1877
1878 /** Return -1, 0 or +1 depending on the sign of a */
1879 LLVMValueRef
1880 lp_build_sgn(struct lp_build_context *bld,
1881 LLVMValueRef a)
1882 {
1883 LLVMBuilderRef builder = bld->gallivm->builder;
1884 const struct lp_type type = bld->type;
1885 LLVMValueRef cond;
1886 LLVMValueRef res;
1887
1888 assert(lp_check_value(type, a));
1889
1890 /* Handle non-zero case */
1891 if(!type.sign) {
1892 /* if not zero then sign must be positive */
1893 res = bld->one;
1894 }
1895 else if(type.floating) {
1896 LLVMTypeRef vec_type;
1897 LLVMTypeRef int_type;
1898 LLVMValueRef mask;
1899 LLVMValueRef sign;
1900 LLVMValueRef one;
1901 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1902
1903 int_type = lp_build_int_vec_type(bld->gallivm, type);
1904 vec_type = lp_build_vec_type(bld->gallivm, type);
1905 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1906
1907 /* Take the sign bit and OR it into the 1.0 constant */
1908 sign = LLVMBuildBitCast(builder, a, int_type, "");
1909 sign = LLVMBuildAnd(builder, sign, mask, "");
1910 one = LLVMConstBitCast(bld->one, int_type);
1911 res = LLVMBuildOr(builder, sign, one, "");
1912 res = LLVMBuildBitCast(builder, res, vec_type, "");
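      /*
       * Example (illustrative): for a = -3.5f the extracted sign bits are
       * 0x80000000; OR'ed into bits(1.0f) = 0x3f800000 this gives 0xbf800000,
       * which bitcast back to float is -1.0f.  Positive inputs keep +1.0f.
       */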
1913 }
1914 else
1915 {
1916 /* signed int/norm/fixed point */
1917 /* could use psign with sse3 and appropriate vectors here */
1918 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1919 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1920 res = lp_build_select(bld, cond, bld->one, minus_one);
1921 }
1922
1923 /* Handle zero */
1924 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1925 res = lp_build_select(bld, cond, bld->zero, res);
1926
1927 return res;
1928 }
1929
1930
1931 /**
1932 * Set the sign of float vector 'a' according to 'sign'.
1933 * If sign==0, return abs(a).
1934 * If sign==1, return -abs(a);
1935 * Other values for sign produce undefined results.
1936 */
1937 LLVMValueRef
1938 lp_build_set_sign(struct lp_build_context *bld,
1939 LLVMValueRef a, LLVMValueRef sign)
1940 {
1941 LLVMBuilderRef builder = bld->gallivm->builder;
1942 const struct lp_type type = bld->type;
1943 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1944 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1945 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1946 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1947 ~((unsigned long long) 1 << (type.width - 1)));
1948 LLVMValueRef val, res;
1949
1950 assert(type.floating);
1951 assert(lp_check_value(type, a));
1952
1953 /* val = reinterpret_cast<int>(a) */
1954 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1955 /* val = val & mask */
1956 val = LLVMBuildAnd(builder, val, mask, "");
1957 /* sign = sign << shift */
1958 sign = LLVMBuildShl(builder, sign, shift, "");
1959 /* res = val | sign */
1960 res = LLVMBuildOr(builder, val, sign, "");
1961 /* res = reinterpret_cast<float>(res) */
1962 res = LLVMBuildBitCast(builder, res, vec_type, "");
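   /*
    * Example (illustrative): a = -2.0f, sign = 1: val = bits(a) & mask =
    * 0x40000000 (i.e. 2.0f), sign << 31 = 0x80000000, so res = 0xc0000000,
    * which is -2.0f.  With sign = 0 the result would be +2.0f (= abs(a)).
    */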
1963
1964 return res;
1965 }
1966
1967
1968 /**
1969 * Convert vector of (or scalar) int to vector of (or scalar) float.
1970 */
1971 LLVMValueRef
1972 lp_build_int_to_float(struct lp_build_context *bld,
1973 LLVMValueRef a)
1974 {
1975 LLVMBuilderRef builder = bld->gallivm->builder;
1976 const struct lp_type type = bld->type;
1977 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1978
1979 assert(type.floating);
1980
1981 return LLVMBuildSIToFP(builder, a, vec_type, "");
1982 }
1983
1984 static boolean
1985 arch_rounding_available(const struct lp_type type)
1986 {
1987 if ((util_cpu_caps.has_sse4_1 &&
1988 (type.length == 1 || type.width*type.length == 128)) ||
1989 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1990 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1991 return TRUE;
1992 else if ((util_cpu_caps.has_altivec &&
1993 (type.width == 32 && type.length == 4)))
1994 return TRUE;
1995 else if (util_cpu_caps.has_neon)
1996 return TRUE;
1997
1998 return FALSE;
1999 }
2000
2001 enum lp_build_round_mode
2002 {
2003 LP_BUILD_ROUND_NEAREST = 0,
2004 LP_BUILD_ROUND_FLOOR = 1,
2005 LP_BUILD_ROUND_CEIL = 2,
2006 LP_BUILD_ROUND_TRUNCATE = 3
2007 };
2008
2009 static inline LLVMValueRef
2010 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2011 LLVMValueRef a)
2012 {
2013 LLVMBuilderRef builder = bld->gallivm->builder;
2014 const struct lp_type type = bld->type;
2015 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2016 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2017 const char *intrinsic;
2018 LLVMValueRef res;
2019
2020 assert(type.floating);
2021 /* using the double precision conversions is a bit more complicated */
2022 assert(type.width == 32);
2023
2024 assert(lp_check_value(type, a));
2025 assert(util_cpu_caps.has_sse2);
2026
2027 /* This is relying on MXCSR rounding mode, which should always be nearest. */
2028 if (type.length == 1) {
2029 LLVMTypeRef vec_type;
2030 LLVMValueRef undef;
2031 LLVMValueRef arg;
2032 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2033
2034 vec_type = LLVMVectorType(bld->elem_type, 4);
2035
2036 intrinsic = "llvm.x86.sse.cvtss2si";
2037
2038 undef = LLVMGetUndef(vec_type);
2039
2040 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2041
2042 res = lp_build_intrinsic_unary(builder, intrinsic,
2043 ret_type, arg);
2044 }
2045 else {
2046 if (type.width* type.length == 128) {
2047 intrinsic = "llvm.x86.sse2.cvtps2dq";
2048 }
2049 else {
2050 assert(type.width*type.length == 256);
2051 assert(util_cpu_caps.has_avx);
2052
2053 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2054 }
2055 res = lp_build_intrinsic_unary(builder, intrinsic,
2056 ret_type, a);
2057 }
2058
2059 return res;
2060 }
2061
2062
2063 /* Round (the vector) to an integral value using the AltiVec
2064  * vrfin/vrfim/vrfip/vrfiz round instructions. */
2065 static inline LLVMValueRef
2066 lp_build_round_altivec(struct lp_build_context *bld,
2067 LLVMValueRef a,
2068 enum lp_build_round_mode mode)
2069 {
2070 LLVMBuilderRef builder = bld->gallivm->builder;
2071 const struct lp_type type = bld->type;
2072 const char *intrinsic = NULL;
2073
2074 assert(type.floating);
2075
2076 assert(lp_check_value(type, a));
2077 assert(util_cpu_caps.has_altivec);
2078
2079 (void)type;
2080
2081 switch (mode) {
2082 case LP_BUILD_ROUND_NEAREST:
2083 intrinsic = "llvm.ppc.altivec.vrfin";
2084 break;
2085 case LP_BUILD_ROUND_FLOOR:
2086 intrinsic = "llvm.ppc.altivec.vrfim";
2087 break;
2088 case LP_BUILD_ROUND_CEIL:
2089 intrinsic = "llvm.ppc.altivec.vrfip";
2090 break;
2091 case LP_BUILD_ROUND_TRUNCATE:
2092 intrinsic = "llvm.ppc.altivec.vrfiz";
2093 break;
2094 }
2095
2096 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2097 }
2098
2099 static inline LLVMValueRef
2100 lp_build_round_arch(struct lp_build_context *bld,
2101 LLVMValueRef a,
2102 enum lp_build_round_mode mode)
2103 {
2104 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2105 LLVMBuilderRef builder = bld->gallivm->builder;
2106 const struct lp_type type = bld->type;
2107 const char *intrinsic_root;
2108 char intrinsic[32];
2109
2110 assert(type.floating);
2111 assert(lp_check_value(type, a));
2112 (void)type;
2113
2114 switch (mode) {
2115 case LP_BUILD_ROUND_NEAREST:
2116 intrinsic_root = "llvm.nearbyint";
2117 break;
2118 case LP_BUILD_ROUND_FLOOR:
2119 intrinsic_root = "llvm.floor";
2120 break;
2121 case LP_BUILD_ROUND_CEIL:
2122 intrinsic_root = "llvm.ceil";
2123 break;
2124 case LP_BUILD_ROUND_TRUNCATE:
2125 intrinsic_root = "llvm.trunc";
2126 break;
2127 }
2128
2129 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2130 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2131 }
2132 else /* (util_cpu_caps.has_altivec) */
2133 return lp_build_round_altivec(bld, a, mode);
2134 }
2135
2136 /**
2137 * Return the integer part of a float (vector) value (== round toward zero).
2138 * The returned value is a float (vector).
2139 * Ex: trunc(-1.5) = -1.0
2140 */
2141 LLVMValueRef
2142 lp_build_trunc(struct lp_build_context *bld,
2143 LLVMValueRef a)
2144 {
2145 LLVMBuilderRef builder = bld->gallivm->builder;
2146 const struct lp_type type = bld->type;
2147
2148 assert(type.floating);
2149 assert(lp_check_value(type, a));
2150
2151 if (arch_rounding_available(type)) {
2152 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2153 }
2154 else {
2155 const struct lp_type type = bld->type;
2156 struct lp_type inttype;
2157 struct lp_build_context intbld;
2158 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2159 LLVMValueRef trunc, res, anosign, mask;
2160 LLVMTypeRef int_vec_type = bld->int_vec_type;
2161 LLVMTypeRef vec_type = bld->vec_type;
2162
2163 assert(type.width == 32); /* might want to handle doubles at some point */
2164
2165 inttype = type;
2166 inttype.floating = 0;
2167 lp_build_context_init(&intbld, bld->gallivm, inttype);
2168
2169 /* round by truncation */
2170 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2171 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2172
2173 /* mask out sign bit */
2174 anosign = lp_build_abs(bld, a);
2175 /*
2176 * mask out all values if anosign > 2^24
2177 * This should work both for large ints (all rounding is no-op for them
2178 * because such floats are always exact) as well as special cases like
2179 * NaNs, Infs (taking advantage of the fact they use max exponent).
2180 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2181 */
2182 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2183 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2184 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2185 return lp_build_select(bld, mask, a, res);
2186 }
2187 }
2188
2189
2190 /**
2191 * Return float (vector) rounded to nearest integer (vector). The returned
2192 * value is a float (vector).
2193 * Ex: round(0.9) = 1.0
2194 * Ex: round(-1.5) = -2.0
2195 */
2196 LLVMValueRef
2197 lp_build_round(struct lp_build_context *bld,
2198 LLVMValueRef a)
2199 {
2200 LLVMBuilderRef builder = bld->gallivm->builder;
2201 const struct lp_type type = bld->type;
2202
2203 assert(type.floating);
2204 assert(lp_check_value(type, a));
2205
2206 if (arch_rounding_available(type)) {
2207 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2208 }
2209 else {
2210 const struct lp_type type = bld->type;
2211 struct lp_type inttype;
2212 struct lp_build_context intbld;
2213 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2214 LLVMValueRef res, anosign, mask;
2215 LLVMTypeRef int_vec_type = bld->int_vec_type;
2216 LLVMTypeRef vec_type = bld->vec_type;
2217
2218 assert(type.width == 32); /* might want to handle doubles at some point */
2219
2220 inttype = type;
2221 inttype.floating = 0;
2222 lp_build_context_init(&intbld, bld->gallivm, inttype);
2223
2224 res = lp_build_iround(bld, a);
2225 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2226
2227 /* mask out sign bit */
2228 anosign = lp_build_abs(bld, a);
2229 /*
2230 * mask out all values if anosign > 2^24
2231 * This should work both for large ints (all rounding is no-op for them
2232 * because such floats are always exact) as well as special cases like
2233 * NaNs, Infs (taking advantage of the fact they use max exponent).
2234 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2235 */
2236 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2237 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2238 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2239 return lp_build_select(bld, mask, a, res);
2240 }
2241 }
2242
2243
2244 /**
2245 * Return floor of float (vector), result is a float (vector)
2246 * Ex: floor(1.1) = 1.0
2247 * Ex: floor(-1.1) = -2.0
2248 */
2249 LLVMValueRef
2250 lp_build_floor(struct lp_build_context *bld,
2251 LLVMValueRef a)
2252 {
2253 LLVMBuilderRef builder = bld->gallivm->builder;
2254 const struct lp_type type = bld->type;
2255
2256 assert(type.floating);
2257 assert(lp_check_value(type, a));
2258
2259 if (arch_rounding_available(type)) {
2260 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2261 }
2262 else {
2263 const struct lp_type type = bld->type;
2264 struct lp_type inttype;
2265 struct lp_build_context intbld;
2266 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2267 LLVMValueRef trunc, res, anosign, mask;
2268 LLVMTypeRef int_vec_type = bld->int_vec_type;
2269 LLVMTypeRef vec_type = bld->vec_type;
2270
2271 if (type.width != 32) {
2272 char intrinsic[32];
2273 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2274 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2275 }
2276
2277 assert(type.width == 32); /* might want to handle doubles at some point */
2278
2279 inttype = type;
2280 inttype.floating = 0;
2281 lp_build_context_init(&intbld, bld->gallivm, inttype);
2282
2283 /* round by truncation */
2284 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2285 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2286
2287 if (type.sign) {
2288 LLVMValueRef tmp;
2289
2290 /*
2291 * fix values if rounding is wrong (for non-special cases)
2292 * - this is the case if trunc > a
2293 */
2294 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2295 /* tmp = trunc > a ? 1.0 : 0.0 */
2296 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2297 tmp = lp_build_and(&intbld, mask, tmp);
2298 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2299 res = lp_build_sub(bld, res, tmp);
2300 }
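      /*
       * Example (illustrative): a = -1.3 truncates to -1.0; since -1.0 > a
       * the mask is all ones and 1.0 is subtracted, giving -2.0.  For
       * a = 1.7 the truncated value 1.0 is not greater than a, so it is
       * left unchanged.
       */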
2301
2302 /* mask out sign bit */
2303 anosign = lp_build_abs(bld, a);
2304 /*
2305 * mask out all values if anosign > 2^24
2306 * This should work both for large ints (all rounding is no-op for them
2307 * because such floats are always exact) as well as special cases like
2308 * NaNs, Infs (taking advantage of the fact they use max exponent).
2309 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2310 */
2311 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2312 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2313 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2314 return lp_build_select(bld, mask, a, res);
2315 }
2316 }
2317
2318
2319 /**
2320 * Return ceiling of float (vector), returning float (vector).
2321 * Ex: ceil( 1.1) = 2.0
2322 * Ex: ceil(-1.1) = -1.0
2323 */
2324 LLVMValueRef
2325 lp_build_ceil(struct lp_build_context *bld,
2326 LLVMValueRef a)
2327 {
2328 LLVMBuilderRef builder = bld->gallivm->builder;
2329 const struct lp_type type = bld->type;
2330
2331 assert(type.floating);
2332 assert(lp_check_value(type, a));
2333
2334 if (arch_rounding_available(type)) {
2335 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2336 }
2337 else {
2338 const struct lp_type type = bld->type;
2339 struct lp_type inttype;
2340 struct lp_build_context intbld;
2341 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2342 LLVMValueRef trunc, res, anosign, mask, tmp;
2343 LLVMTypeRef int_vec_type = bld->int_vec_type;
2344 LLVMTypeRef vec_type = bld->vec_type;
2345
2346 if (type.width != 32) {
2347 char intrinsic[32];
2348 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2349 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2350 }
2351
2352 assert(type.width == 32); /* might want to handle doubles at some point */
2353
2354 inttype = type;
2355 inttype.floating = 0;
2356 lp_build_context_init(&intbld, bld->gallivm, inttype);
2357
2358 /* round by truncation */
2359 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2360 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2361
2362 /*
2363 * fix values if rounding is wrong (for non-special cases)
2364 * - this is the case if trunc < a
2365 */
2366 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2367 /* tmp = trunc < a ? 1.0 : 0.0 */
2368 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2369 tmp = lp_build_and(&intbld, mask, tmp);
2370 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2371 res = lp_build_add(bld, trunc, tmp);
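      /*
       * Example (illustrative): a = 1.3 truncates to 1.0; since 1.0 < a the
       * mask is all ones and 1.0 is added, giving 2.0.  For a = -1.3 the
       * truncated value -1.0 is not less than a, so it is left unchanged.
       */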
2372
2373 /* mask out sign bit */
2374 anosign = lp_build_abs(bld, a);
2375 /*
2376 * mask out all values if anosign > 2^24
2377 * This should work both for large ints (all rounding is no-op for them
2378 * because such floats are always exact) as well as special cases like
2379 * NaNs, Infs (taking advantage of the fact they use max exponent).
2380 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2381 */
2382 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2383 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2384 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2385 return lp_build_select(bld, mask, a, res);
2386 }
2387 }
2388
2389
2390 /**
2391 * Return fractional part of 'a' computed as a - floor(a)
2392 * Typically used in texture coord arithmetic.
2393 */
2394 LLVMValueRef
2395 lp_build_fract(struct lp_build_context *bld,
2396 LLVMValueRef a)
2397 {
2398 assert(bld->type.floating);
2399 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2400 }
2401
2402
2403 /**
2404 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2405 * against 0.99999(9). (Will also return that value for NaNs.)
2406 */
2407 static inline LLVMValueRef
2408 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2409 {
2410 LLVMValueRef max;
2411
2412 /* this is the largest number smaller than 1.0 representable as float */
2413 max = lp_build_const_vec(bld->gallivm, bld->type,
2414 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
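   /*
    * Note (illustrative): for 32-bit floats (23-bit stored mantissa) this
    * constant is 1.0 - 2^-24 = 0x1.fffffep-1 ~= 0.99999994, the largest
    * float strictly below 1.0.
    */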
2415 return lp_build_min_ext(bld, fract, max,
2416 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2417 }
2418
2419
2420 /**
2421 * Same as lp_build_fract, but guarantees that the result is always smaller
2422 * than one. Will also return the smaller-than-one value for infs, NaNs.
2423 */
2424 LLVMValueRef
2425 lp_build_fract_safe(struct lp_build_context *bld,
2426 LLVMValueRef a)
2427 {
2428 return clamp_fract(bld, lp_build_fract(bld, a));
2429 }
2430
2431
2432 /**
2433 * Return the integer part of a float (vector) value (== round toward zero).
2434 * The returned value is an integer (vector).
2435 * Ex: itrunc(-1.5) = -1
2436 */
2437 LLVMValueRef
2438 lp_build_itrunc(struct lp_build_context *bld,
2439 LLVMValueRef a)
2440 {
2441 LLVMBuilderRef builder = bld->gallivm->builder;
2442 const struct lp_type type = bld->type;
2443 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2444
2445 assert(type.floating);
2446 assert(lp_check_value(type, a));
2447
2448 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2449 }
2450
2451
2452 /**
2453 * Return float (vector) rounded to nearest integer (vector). The returned
2454 * value is an integer (vector).
2455 * Ex: iround(0.9) = 1
2456 * Ex: iround(-1.5) = -2
2457 */
2458 LLVMValueRef
2459 lp_build_iround(struct lp_build_context *bld,
2460 LLVMValueRef a)
2461 {
2462 LLVMBuilderRef builder = bld->gallivm->builder;
2463 const struct lp_type type = bld->type;
2464 LLVMTypeRef int_vec_type = bld->int_vec_type;
2465 LLVMValueRef res;
2466
2467 assert(type.floating);
2468
2469 assert(lp_check_value(type, a));
2470
2471 if ((util_cpu_caps.has_sse2 &&
2472 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2473 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2474 return lp_build_iround_nearest_sse2(bld, a);
2475 }
2476 if (arch_rounding_available(type)) {
2477 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2478 }
2479 else {
2480 LLVMValueRef half;
2481
2482 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2483
2484 if (type.sign) {
2485 LLVMTypeRef vec_type = bld->vec_type;
2486 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2487 (unsigned long long)1 << (type.width - 1));
2488 LLVMValueRef sign;
2489
2490 /* get sign bit */
2491 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2492 sign = LLVMBuildAnd(builder, sign, mask, "");
2493
2494 /* sign * 0.5 */
2495 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2496 half = LLVMBuildOr(builder, sign, half, "");
2497 half = LLVMBuildBitCast(builder, half, vec_type, "");
2498 }
2499
2500 res = LLVMBuildFAdd(builder, a, half, "");
2501 }
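      /*
       * Example (illustrative): a = 1.3 gives 1.3 + ~0.5 ~= 1.8, which
       * truncates to 1; a = -2.7 gives -2.7 - ~0.5 ~= -3.2, which truncates
       * to -3.  The constant is nextafterf(0.5, 0.0), i.e. just below 0.5,
       * presumably so that values just under 0.5 are not rounded up.
       */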
2502
2503 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2504
2505 return res;
2506 }
2507
2508
2509 /**
2510 * Return floor of float (vector), result is an int (vector)
2511 * Ex: ifloor(1.1) = 1
2512 * Ex: ifloor(-1.1) = -2
2513 */
2514 LLVMValueRef
2515 lp_build_ifloor(struct lp_build_context *bld,
2516 LLVMValueRef a)
2517 {
2518 LLVMBuilderRef builder = bld->gallivm->builder;
2519 const struct lp_type type = bld->type;
2520 LLVMTypeRef int_vec_type = bld->int_vec_type;
2521 LLVMValueRef res;
2522
2523 assert(type.floating);
2524 assert(lp_check_value(type, a));
2525
2526 res = a;
2527 if (type.sign) {
2528 if (arch_rounding_available(type)) {
2529 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2530 }
2531 else {
2532 struct lp_type inttype;
2533 struct lp_build_context intbld;
2534 LLVMValueRef trunc, itrunc, mask;
2535
2536 assert(type.floating);
2537 assert(lp_check_value(type, a));
2538
2539 inttype = type;
2540 inttype.floating = 0;
2541 lp_build_context_init(&intbld, bld->gallivm, inttype);
2542
2543 /* round by truncation */
2544 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2545 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2546
2547 /*
2548 * fix values if rounding is wrong (for non-special cases)
2549 * - this is the case if trunc > a
2550 * The results of doing this with NaNs, very large values etc.
2551 * are undefined, but that is the case for such inputs anyway.
2552 */
2553 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2554 /* cheapie minus one with mask since the mask is minus one / zero */
2555 return lp_build_add(&intbld, itrunc, mask);
2556 }
2557 }
2558
2559 /* convert to int; truncation toward zero is exact since res is non-negative or already integral */
2560 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2561
2562 return res;
2563 }
2564
2565
2566 /**
2567 * Return ceiling of float (vector), returning int (vector).
2568 * Ex: iceil( 1.1) = 2
2569 * Ex: iceil(-1.1) = -1
2570 */
2571 LLVMValueRef
2572 lp_build_iceil(struct lp_build_context *bld,
2573 LLVMValueRef a)
2574 {
2575 LLVMBuilderRef builder = bld->gallivm->builder;
2576 const struct lp_type type = bld->type;
2577 LLVMTypeRef int_vec_type = bld->int_vec_type;
2578 LLVMValueRef res;
2579
2580 assert(type.floating);
2581 assert(lp_check_value(type, a));
2582
2583 if (arch_rounding_available(type)) {
2584 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2585 }
2586 else {
2587 struct lp_type inttype;
2588 struct lp_build_context intbld;
2589 LLVMValueRef trunc, itrunc, mask;
2590
2591 assert(type.floating);
2592 assert(lp_check_value(type, a));
2593
2594 inttype = type;
2595 inttype.floating = 0;
2596 lp_build_context_init(&intbld, bld->gallivm, inttype);
2597
2598 /* round by truncation */
2599 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2600 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2601
2602 /*
2603 * fix values if rounding is wrong (for non-special cases)
2604 * - this is the case if trunc < a
2605 * The results of doing this with NaNs, very large values etc.
2606 * are undefined, but that is the case for such inputs anyway.
2607 */
2608 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2609 /* cheapie plus one with mask since the mask is minus one / zero */
2610 return lp_build_sub(&intbld, itrunc, mask);
2611 }
2612
2613 /* convert to int; res is already an integral value so truncation toward zero is exact */
2614 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2615
2616 return res;
2617 }
2618
2619
2620 /**
2621 * Combined ifloor() & fract().
2622 *
2623 * Preferred to calling the functions separately, as it will ensure that the
2624 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2625 */
2626 void
2627 lp_build_ifloor_fract(struct lp_build_context *bld,
2628 LLVMValueRef a,
2629 LLVMValueRef *out_ipart,
2630 LLVMValueRef *out_fpart)
2631 {
2632 LLVMBuilderRef builder = bld->gallivm->builder;
2633 const struct lp_type type = bld->type;
2634 LLVMValueRef ipart;
2635
2636 assert(type.floating);
2637 assert(lp_check_value(type, a));
2638
2639 if (arch_rounding_available(type)) {
2640 /*
2641 * floor() is easier.
2642 */
2643
2644 ipart = lp_build_floor(bld, a);
2645 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2646 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2647 }
2648 else {
2649 /*
2650 * ifloor() is easier.
2651 */
2652
2653 *out_ipart = lp_build_ifloor(bld, a);
2654 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2655 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2656 }
2657 }
2658
2659
2660 /**
2661 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2662 * always smaller than one.
2663 */
2664 void
2665 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2666 LLVMValueRef a,
2667 LLVMValueRef *out_ipart,
2668 LLVMValueRef *out_fpart)
2669 {
2670 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2671 *out_fpart = clamp_fract(bld, *out_fpart);
2672 }
2673
2674
2675 LLVMValueRef
2676 lp_build_sqrt(struct lp_build_context *bld,
2677 LLVMValueRef a)
2678 {
2679 LLVMBuilderRef builder = bld->gallivm->builder;
2680 const struct lp_type type = bld->type;
2681 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2682 char intrinsic[32];
2683
2684 assert(lp_check_value(type, a));
2685
2686 assert(type.floating);
2687 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2688
2689 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2690 }
2691
2692
2693 /**
2694 * Do one Newton-Raphson step to improve reciprocal precision:
2695 *
2696 * x_{i+1} = x_i * (2 - a * x_i)
2697 *
2698 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2699 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2700 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2701 * halo. It would be necessary to clamp the argument to prevent this.
2702 *
2703 * See also:
2704 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2705 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2706 */
2707 static inline LLVMValueRef
2708 lp_build_rcp_refine(struct lp_build_context *bld,
2709 LLVMValueRef a,
2710 LLVMValueRef rcp_a)
2711 {
2712 LLVMBuilderRef builder = bld->gallivm->builder;
2713 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2714 LLVMValueRef res;
2715
2716 res = LLVMBuildFMul(builder, a, rcp_a, "");
2717 res = LLVMBuildFSub(builder, two, res, "");
2718 res = LLVMBuildFMul(builder, rcp_a, res, "");
2719
2720 return res;
2721 }
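/*
 * Example (illustrative): for a = 4.0 and an initial estimate rcp_a = 0.3,
 * one step gives 0.3 * (2 - 4.0 * 0.3) = 0.24 and a second step gives
 * 0.24 * (2 - 4.0 * 0.24) = 0.2496, converging quadratically towards 0.25.
 */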
2722
2723
2724 LLVMValueRef
2725 lp_build_rcp(struct lp_build_context *bld,
2726 LLVMValueRef a)
2727 {
2728 LLVMBuilderRef builder = bld->gallivm->builder;
2729 const struct lp_type type = bld->type;
2730
2731 assert(lp_check_value(type, a));
2732
2733 if(a == bld->zero)
2734 return bld->undef;
2735 if(a == bld->one)
2736 return bld->one;
2737 if(a == bld->undef)
2738 return bld->undef;
2739
2740 assert(type.floating);
2741
2742 if(LLVMIsConstant(a))
2743 return LLVMConstFDiv(bld->one, a);
2744
2745 /*
2746 * We don't use RCPPS because:
2747 * - it only has 10 bits of precision
2748 * - it doesn't even get the reciprocal of 1.0 exactly
2749 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2750 * - for recent processors the benefit over DIVPS is marginal and case
2751 * dependent
2752 *
2753 * We could still use it on certain processors if benchmarks show that the
2754 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2755 * particular uses that require fewer workarounds.
2756 */
2757
2758 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2759 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2760 const unsigned num_iterations = 0;
2761 LLVMValueRef res;
2762 unsigned i;
2763 const char *intrinsic = NULL;
2764
2765 if (type.length == 4) {
2766 intrinsic = "llvm.x86.sse.rcp.ps";
2767 }
2768 else {
2769 intrinsic = "llvm.x86.avx.rcp.ps.256";
2770 }
2771
2772 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2773
2774 for (i = 0; i < num_iterations; ++i) {
2775 res = lp_build_rcp_refine(bld, a, res);
2776 }
2777
2778 return res;
2779 }
2780
2781 return LLVMBuildFDiv(builder, bld->one, a, "");
2782 }
2783
2784
2785 /**
2786 * Do one Newton-Raphson step to improve rsqrt precision:
2787 *
2788 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2789 *
2790 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2791 */
2792 static inline LLVMValueRef
2793 lp_build_rsqrt_refine(struct lp_build_context *bld,
2794 LLVMValueRef a,
2795 LLVMValueRef rsqrt_a)
2796 {
2797 LLVMBuilderRef builder = bld->gallivm->builder;
2798 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2799 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2800 LLVMValueRef res;
2801
2802 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2803 res = LLVMBuildFMul(builder, a, res, "");
2804 res = LLVMBuildFSub(builder, three, res, "");
2805 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2806 res = LLVMBuildFMul(builder, half, res, "");
2807
2808 return res;
2809 }
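/*
 * Example (illustrative): for a = 4.0 and an initial estimate x0 = 0.6,
 * one step gives 0.5 * 0.6 * (3.0 - 4.0 * 0.36) = 0.468 and a second step
 * gives roughly 0.497, converging towards the exact value 0.5.
 */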
2810
2811
2812 /**
2813 * Generate 1/sqrt(a).
2814 * Result is undefined for values < 0, infinity for +0.
2815 */
2816 LLVMValueRef
2817 lp_build_rsqrt(struct lp_build_context *bld,
2818 LLVMValueRef a)
2819 {
2820 const struct lp_type type = bld->type;
2821
2822 assert(lp_check_value(type, a));
2823
2824 assert(type.floating);
2825
2826 /*
2827 * This should be faster but all denormals will end up as infinity.
2828 */
2829 if (0 && lp_build_fast_rsqrt_available(type)) {
2830 const unsigned num_iterations = 1;
2831 LLVMValueRef res;
2832 unsigned i;
2833
2834 /* rsqrt(1.0) != 1.0 here */
2835 res = lp_build_fast_rsqrt(bld, a);
2836
2837 if (num_iterations) {
2838 /*
2839 * Newton-Raphson will result in NaN instead of infinity for zero,
2840 * and NaN instead of zero for infinity.
2841 * Also, need to ensure rsqrt(1.0) == 1.0.
2842 * All numbers smaller than FLT_MIN will result in +infinity
2843 * (rsqrtps treats all denormals as zero).
2844 */
2845 LLVMValueRef cmp;
2846 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2847 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2848
2849 for (i = 0; i < num_iterations; ++i) {
2850 res = lp_build_rsqrt_refine(bld, a, res);
2851 }
2852 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2853 res = lp_build_select(bld, cmp, inf, res);
2854 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2855 res = lp_build_select(bld, cmp, bld->zero, res);
2856 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2857 res = lp_build_select(bld, cmp, bld->one, res);
2858 }
2859
2860 return res;
2861 }
2862
2863 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2864 }
2865
2866 /**
2867 * Returns true if a fast (but inaccurate) rsqrt instruction is available.
2868 * (The caller may want to avoid calling rsqrt_fast if it's not available;
2869 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that's
2870 * unavailable it would result in sqrt/div/mul, so it's obviously much
2871 * better to just call sqrt, skipping both the div and the mul.)
2872 */
2873 boolean
2874 lp_build_fast_rsqrt_available(struct lp_type type)
2875 {
2876 assert(type.floating);
2877
2878 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2879 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2880 return true;
2881 }
2882 return false;
2883 }
2884
2885
2886 /**
2887 * Generate 1/sqrt(a).
2888 * Result is undefined for values < 0, infinity for +0.
2889 * Precision is limited, only ~10 bits guaranteed
2890 * (rsqrt(1.0) may not be exactly 1.0; denorms may be flushed to 0).
2891 */
2892 LLVMValueRef
2893 lp_build_fast_rsqrt(struct lp_build_context *bld,
2894 LLVMValueRef a)
2895 {
2896 LLVMBuilderRef builder = bld->gallivm->builder;
2897 const struct lp_type type = bld->type;
2898
2899 assert(lp_check_value(type, a));
2900
2901 if (lp_build_fast_rsqrt_available(type)) {
2902 const char *intrinsic = NULL;
2903
2904 if (type.length == 4) {
2905 intrinsic = "llvm.x86.sse.rsqrt.ps";
2906 }
2907 else {
2908 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2909 }
2910 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2911 }
2912 else {
2913 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2914 }
2915 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2916 }
2917
2918
2919 /**
2920 * Generate sin(a) or cos(a) using polynomial approximation.
2921 * TODO: it might be worth recognizing sin and cos using same source
2922 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
2923 * would be way cheaper than calculating (nearly) everything twice...
2924 * Not sure it's common enough to be worth bothering with, however; the
2925 * scs opcode could also benefit from calculating both, though.
2926 */
2927 static LLVMValueRef
2928 lp_build_sin_or_cos(struct lp_build_context *bld,
2929 LLVMValueRef a,
2930 boolean cos)
2931 {
2932 struct gallivm_state *gallivm = bld->gallivm;
2933 LLVMBuilderRef b = gallivm->builder;
2934 struct lp_type int_type = lp_int_type(bld->type);
2935
2936 /*
2937 * take the absolute value,
2938 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2939 */
2940
2941 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2942 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2943
2944 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2945 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2946
2947 /*
2948 * scale by 4/Pi
2949 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2950 */
2951
2952 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2953 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2954
2955 /*
2956 * store the integer part of y in mm0
2957 * emm2 = _mm_cvttps_epi32(y);
2958 */
2959
2960 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2961
2962 /*
2963 * j=(j+1) & (~1) (see the cephes sources)
2964 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2965 */
2966
2967 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2968 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2969 /*
2970 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2971 */
2972 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2973 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2974
2975 /*
2976 * y = _mm_cvtepi32_ps(emm2);
2977 */
2978 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2979
2980 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2981 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2982 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2983 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2984
2985 /*
2986 * Argument used for poly selection and sign bit determination
2987 * is different for sin vs. cos.
2988 */
2989 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2990 emm2_and;
2991
2992 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2993 LLVMBuildNot(b, emm2_2, ""), ""),
2994 const_29, "sign_bit") :
2995 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2996 LLVMBuildShl(b, emm2_add,
2997 const_29, ""), ""),
2998 sign_mask, "sign_bit");
2999
3000 /*
3001 * get the polynomial selection mask
3002 * there is one polynomial for 0 <= x <= Pi/4
3003 * and another one for Pi/4 < x <= Pi/2
3004 * Both branches will be computed.
3005 *
3006 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3007 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3008 */
3009
3010 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3011 LLVMValueRef poly_mask = lp_build_compare(gallivm,
3012 int_type, PIPE_FUNC_EQUAL,
3013 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3014
3015 /*
3016 * _PS_CONST(minus_cephes_DP1, -0.78515625);
3017 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3018 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3019 */
3020 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3021 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3022 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3023
3024 /*
3025 * The magic pass: "Extended precision modular arithmetic"
3026 * x = ((x - y * DP1) - y * DP2) - y * DP3;
3027 */
3028 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3029 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3030 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3031
3032 /*
3033 * Evaluate the first polynomial (0 <= x <= Pi/4)
3034 *
3035 * z = _mm_mul_ps(x,x);
3036 */
3037 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3038
3039 /*
3040 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3041 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3042 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3043 */
3044 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3045 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3046 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3047
3048 /*
3049 * y = *(v4sf*)_ps_coscof_p0;
3050 * y = _mm_mul_ps(y, z);
3051 */
3052 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3053 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3054 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3055 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3056
3057
3058 /*
3059 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3060 * y = _mm_sub_ps(y, tmp);
3061 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3062 */
3063 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3064 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3065 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3066 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3067 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3068
3069 /*
3070 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3071 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3072 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3073 */
3074 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3075 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3076 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3077
3078 /*
3079 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
3080 *
3081 * y2 = *(v4sf*)_ps_sincof_p0;
3082 * y2 = _mm_mul_ps(y2, z);
3083 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3084 * y2 = _mm_mul_ps(y2, z);
3085 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3086 * y2 = _mm_mul_ps(y2, z);
3087 * y2 = _mm_mul_ps(y2, x);
3088 * y2 = _mm_add_ps(y2, x);
3089 */
3090
3091 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3092 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3093 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3094 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3095
3096 /*
3097 * select the correct result from the two polynomials
3098 * xmm3 = poly_mask;
3099 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3100 * y = _mm_andnot_ps(xmm3, y);
3101 * y = _mm_or_ps(y,y2);
3102 */
3103 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3104 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3105 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3106 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3107 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3108 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3109
3110 /*
3111 * update the sign
3112 * y = _mm_xor_ps(y, sign_bit);
3113 */
3114 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3115 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3116
3117 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3118
3119 /* clamp output to be within [-1, 1] */
3120 y_result = lp_build_clamp(bld, y_result,
3121 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3122 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3123 /* If a is -inf, inf or NaN then return NaN */
3124 y_result = lp_build_select(bld, isfinite, y_result,
3125 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3126 return y_result;
3127 }
3128
3129
3130 /**
3131 * Generate sin(a)
3132 */
3133 LLVMValueRef
3134 lp_build_sin(struct lp_build_context *bld,
3135 LLVMValueRef a)
3136 {
3137 return lp_build_sin_or_cos(bld, a, FALSE);
3138 }
3139
3140
3141 /**
3142 * Generate cos(a)
3143 */
3144 LLVMValueRef
3145 lp_build_cos(struct lp_build_context *bld,
3146 LLVMValueRef a)
3147 {
3148 return lp_build_sin_or_cos(bld, a, TRUE);
3149 }
3150
3151
3152 /**
3153 * Generate pow(x, y)
3154 */
3155 LLVMValueRef
3156 lp_build_pow(struct lp_build_context *bld,
3157 LLVMValueRef x,
3158 LLVMValueRef y)
3159 {
3160 /* TODO: optimize the constant case */
3161 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3162 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3163 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3164 __FUNCTION__);
3165 }
3166
3167 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3168 }
3169
3170
3171 /**
3172 * Generate exp(x)
3173 */
3174 LLVMValueRef
3175 lp_build_exp(struct lp_build_context *bld,
3176 LLVMValueRef x)
3177 {
3178 /* log2(e) = 1/log(2) */
3179 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3180 1.4426950408889634);
3181
3182 assert(lp_check_value(bld->type, x));
3183
3184 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3185 }
3186
3187
3188 /**
3189 * Generate log(x)
3190 * Behavior is undefined with infs, 0s and nans
3191 */
3192 LLVMValueRef
3193 lp_build_log(struct lp_build_context *bld,
3194 LLVMValueRef x)
3195 {
3196 /* log(2) */
3197 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3198 0.69314718055994529);
3199
3200 assert(lp_check_value(bld->type, x));
3201
3202 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3203 }
3204
3205 /**
3206 * Generate log(x) that handles edge cases (infs, 0s and nans)
3207 */
3208 LLVMValueRef
3209 lp_build_log_safe(struct lp_build_context *bld,
3210 LLVMValueRef x)
3211 {
3212 /* log(2) */
3213 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3214 0.69314718055994529);
3215
3216 assert(lp_check_value(bld->type, x));
3217
3218 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3219 }
3220
3221
3222 /**
3223 * Generate polynomial.
3224 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3225 */
3226 LLVMValueRef
3227 lp_build_polynomial(struct lp_build_context *bld,
3228 LLVMValueRef x,
3229 const double *coeffs,
3230 unsigned num_coeffs)
3231 {
3232 const struct lp_type type = bld->type;
3233 LLVMValueRef even = NULL, odd = NULL;
3234 LLVMValueRef x2;
3235 unsigned i;
3236
3237 assert(lp_check_value(bld->type, x));
3238
3239 /* TODO: optimize the constant case */
3240 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3241 LLVMIsConstant(x)) {
3242 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3243 __FUNCTION__);
3244 }
3245
3246 /*
3247 * Calculate odd and even terms separately to decrease data dependency
3248 * Ex:
3249 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3250 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3251 */
3252 x2 = lp_build_mul(bld, x, x);
3253
3254 for (i = num_coeffs; i--; ) {
3255 LLVMValueRef coeff;
3256
3257 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3258
3259 if (i % 2 == 0) {
3260 if (even)
3261 even = lp_build_mad(bld, x2, even, coeff);
3262 else
3263 even = coeff;
3264 } else {
3265 if (odd)
3266 odd = lp_build_mad(bld, x2, odd, coeff);
3267 else
3268 odd = coeff;
3269 }
3270 }
3271
3272 if (odd)
3273 return lp_build_mad(bld, odd, x, even);
3274 else if (even)
3275 return even;
3276 else
3277 return bld->undef;
3278 }
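/*
 * Example (illustrative): with coeffs = {1, 2, 3, 4} and x = 2 this computes
 * even = 1 + x^2 * 3 = 13 and odd = 2 + x^2 * 4 = 18, then returns
 * even + x * odd = 13 + 2 * 18 = 49, which matches 1 + 2*2 + 3*4 + 4*8 = 49.
 */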
3279
3280
3281 /**
3282 * Minimax polynomial fit of 2**x, in range [0, 1[
3283 */
3284 const double lp_build_exp2_polynomial[] = {
3285 #if EXP_POLY_DEGREE == 5
3286 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3287 0.693153073200168932794,
3288 0.240153617044375388211,
3289 0.0558263180532956664775,
3290 0.00898934009049466391101,
3291 0.00187757667519147912699
3292 #elif EXP_POLY_DEGREE == 4
3293 1.00000259337069434683,
3294 0.693003834469974940458,
3295 0.24144275689150793076,
3296 0.0520114606103070150235,
3297 0.0135341679161270268764
3298 #elif EXP_POLY_DEGREE == 3
3299 0.999925218562710312959,
3300 0.695833540494823811697,
3301 0.226067155427249155588,
3302 0.0780245226406372992967
3303 #elif EXP_POLY_DEGREE == 2
3304 1.00172476321474503578,
3305 0.657636275736077639316,
3306 0.33718943461968720704
3307 #else
3308 #error
3309 #endif
3310 };
3311
3312
3313 LLVMValueRef
3314 lp_build_exp2(struct lp_build_context *bld,
3315 LLVMValueRef x)
3316 {
3317 LLVMBuilderRef builder = bld->gallivm->builder;
3318 const struct lp_type type = bld->type;
3319 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3320 LLVMValueRef ipart = NULL;
3321 LLVMValueRef fpart = NULL;
3322 LLVMValueRef expipart = NULL;
3323 LLVMValueRef expfpart = NULL;
3324 LLVMValueRef res = NULL;
3325
3326 assert(lp_check_value(bld->type, x));
3327
3328 /* TODO: optimize the constant case */
3329 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3330 LLVMIsConstant(x)) {
3331 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3332 __FUNCTION__);
3333 }
3334
3335 assert(type.floating && type.width == 32);
3336
3337 /* We want to preserve NaN and make sure that for exp2, if x > 128
3338 * the result is INF, and if it's smaller than -126.9 the result is 0. */
3339 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3340 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3341 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3342 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3343
3344 /* ipart = floor(x) */
3345 /* fpart = x - ipart */
3346 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3347
3348 /* expipart = (float) (1 << ipart) */
3349 expipart = LLVMBuildAdd(builder, ipart,
3350 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3351 expipart = LLVMBuildShl(builder, expipart,
3352 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3353 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
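   /*
    * Note (illustrative): this constructs 2^ipart directly in the IEEE-754
    * exponent field.  E.g. ipart = 3 gives (3 + 127) << 23 = 0x41000000,
    * which bitcast to float is 8.0.
    */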
3354
3355 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3356 ARRAY_SIZE(lp_build_exp2_polynomial));
3357
3358 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3359
3360 return res;
3361 }
3362
3363
3364
3365 /**
3366 * Extract the exponent of an IEEE-754 floating point value.
3367 *
3368 * Optionally apply an integer bias.
3369 *
3370 * Result is an integer value with
3371 *
3372 * ifloor(log2(x)) + bias
3373 */
3374 LLVMValueRef
3375 lp_build_extract_exponent(struct lp_build_context *bld,
3376 LLVMValueRef x,
3377 int bias)
3378 {
3379 LLVMBuilderRef builder = bld->gallivm->builder;
3380 const struct lp_type type = bld->type;
3381 unsigned mantissa = lp_mantissa(type);
3382 LLVMValueRef res;
3383
3384 assert(type.floating);
3385
3386 assert(lp_check_value(bld->type, x));
3387
3388 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3389
3390 res = LLVMBuildLShr(builder, x,
3391 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3392 res = LLVMBuildAnd(builder, res,
3393 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3394 res = LLVMBuildSub(builder, res,
3395 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3396
3397 return res;
3398 }
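/*
 * Example (illustrative): x = 8.0f has the bit pattern 0x41000000; shifting
 * right by 23 gives 130, masking with 255 leaves 130, and subtracting 127
 * (with bias = 0) yields 3 = floor(log2(8.0)).
 */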
3399
3400
3401 /**
3402 * Extract the mantissa of a floating point value.
3403 *
3404 * Result is a floating point value with
3405 *
3406 * x / 2**floor(log2(x))
3407 */
3408 LLVMValueRef
3409 lp_build_extract_mantissa(struct lp_build_context *bld,
3410 LLVMValueRef x)
3411 {
3412 LLVMBuilderRef builder = bld->gallivm->builder;
3413 const struct lp_type type = bld->type;
3414 unsigned mantissa = lp_mantissa(type);
3415 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3416 (1ULL << mantissa) - 1);
3417 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3418 LLVMValueRef res;
3419
3420 assert(lp_check_value(bld->type, x));
3421
3422 assert(type.floating);
3423
3424 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3425
3426 /* res = x / 2**ipart */
3427 res = LLVMBuildAnd(builder, x, mantmask, "");
3428 res = LLVMBuildOr(builder, res, one, "");
3429 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3430
3431 return res;
3432 }
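/*
 * Example (illustrative): x = 6.0f has the bit pattern 0x40c00000; its
 * mantissa bits are 0x400000, and OR'ing in the bits of 1.0f (0x3f800000)
 * gives 0x3fc00000 = 1.5f, which is 6.0 / 2^floor(log2(6.0)).
 */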
3433
3434
3435
3436 /**
3437 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3438 * These coefficients can be generated with
3439 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3440 */
3441 const double lp_build_log2_polynomial[] = {
3442 #if LOG_POLY_DEGREE == 5
3443 2.88539008148777786488L,
3444 0.961796878841293367824L,
3445 0.577058946784739859012L,
3446 0.412914355135828735411L,
3447 0.308591899232910175289L,
3448 0.352376952300281371868L,
3449 #elif LOG_POLY_DEGREE == 4
3450 2.88539009343309178325L,
3451 0.961791550404184197881L,
3452 0.577440339438736392009L,
3453 0.403343858251329912514L,
3454 0.406718052498846252698L,
3455 #elif LOG_POLY_DEGREE == 3
3456 2.88538959748872753838L,
3457 0.961932915889597772928L,
3458 0.571118517972136195241L,
3459 0.493997535084709500285L,
3460 #else
3461 #error
3462 #endif
3463 };
3464
3465 /**
3466 * See http://www.devmaster.net/forums/showthread.php?p=43580
3467 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3468 * http://www.nezumi.demon.co.uk/consult/logx.htm
3469 *
3470 * If handle_edge_cases is true the function will perform computations
3471 * to match the required D3D10+ behavior for each of the edge cases.
3472 * That means that if input is:
3473 * - less than zero (to and including -inf) then NaN will be returned
3474 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3475 * - +infinity, then +infinity will be returned
3476 * - NaN, then NaN will be returned
3477 *
3478 * Those checks are fairly expensive so if you don't need them make sure
3479 * handle_edge_cases is false.
3480 */
3481 void
3482 lp_build_log2_approx(struct lp_build_context *bld,
3483 LLVMValueRef x,
3484 LLVMValueRef *p_exp,
3485 LLVMValueRef *p_floor_log2,
3486 LLVMValueRef *p_log2,
3487 boolean handle_edge_cases)
3488 {
3489 LLVMBuilderRef builder = bld->gallivm->builder;
3490 const struct lp_type type = bld->type;
3491 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3492 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3493
3494 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3495 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3496 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3497
3498 LLVMValueRef i = NULL;
3499 LLVMValueRef y = NULL;
3500 LLVMValueRef z = NULL;
3501 LLVMValueRef exp = NULL;
3502 LLVMValueRef mant = NULL;
3503 LLVMValueRef logexp = NULL;
3504 LLVMValueRef p_z = NULL;
3505 LLVMValueRef res = NULL;
3506
3507 assert(lp_check_value(bld->type, x));
3508
3509 if(p_exp || p_floor_log2 || p_log2) {
3510 /* TODO: optimize the constant case */
3511 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3512 LLVMIsConstant(x)) {
3513 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3514 __FUNCTION__);
3515 }
3516
3517 assert(type.floating && type.width == 32);
3518
3519 /*
3520 * We don't explicitly handle denormalized numbers. They will yield a
3521 * result in the neighbourhood of -127, which appears to be adequate
3522 * for our purposes.
3523 */
3524
3525 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3526
3527 /* exp = (float) exponent(x) */
3528 exp = LLVMBuildAnd(builder, i, expmask, "");
3529 }
3530
3531 if(p_floor_log2 || p_log2) {
3532 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3533 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3534 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3535 }
3536
3537 if (p_log2) {
3538 /* mant = 1 + (float) mantissa(x) */
3539 mant = LLVMBuildAnd(builder, i, mantmask, "");
3540 mant = LLVMBuildOr(builder, mant, one, "");
3541 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3542
3543 /* y = (mant - 1) / (mant + 1) */
3544 y = lp_build_div(bld,
3545 lp_build_sub(bld, mant, bld->one),
3546 lp_build_add(bld, mant, bld->one)
3547 );
3548
3549 /* z = y^2 */
3550 z = lp_build_mul(bld, y, y);
3551
3552 /* compute P(z) */
3553 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3554 ARRAY_SIZE(lp_build_log2_polynomial));
3555
3556 /* y * P(z) + logexp */
3557 res = lp_build_mad(bld, y, p_z, logexp);
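      /*
       * Note (illustrative): this follows from ln(m) = 2*atanh(y) =
       * 2*(y + y^3/3 + y^5/5 + ...) with y = (m - 1)/(m + 1), so
       * log2(m) ~= y * P(y^2) where P approximates
       * (2/ln(2)) * (1 + z/3 + z^2/5 + ...); note the leading coefficient
       * of lp_build_log2_polynomial above is ~2.88539 = 2/ln(2).
       */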
3558
3559 if (type.floating && handle_edge_cases) {
3560 LLVMValueRef negmask, infmask, zmask;
3561 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3562 lp_build_const_vec(bld->gallivm, type, 0.0f));
3563 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3564 lp_build_const_vec(bld->gallivm, type, 0.0f));
3565 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3566 lp_build_const_vec(bld->gallivm, type, INFINITY));
3567
3568 /* If x is equal to inf make sure we return inf */
3569 res = lp_build_select(bld, infmask,
3570 lp_build_const_vec(bld->gallivm, type, INFINITY),
3571 res);
3572 /* If x is equal to 0, return -inf */
3573 res = lp_build_select(bld, zmask,
3574 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3575 res);
3576 /* If x is nan or less than 0, return nan */
3577 res = lp_build_select(bld, negmask,
3578 lp_build_const_vec(bld->gallivm, type, NAN),
3579 res);
3580 }
3581 }
3582
3583 if (p_exp) {
3584 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3585 *p_exp = exp;
3586 }
3587
3588 if (p_floor_log2)
3589 *p_floor_log2 = logexp;
3590
3591 if (p_log2)
3592 *p_log2 = res;
3593 }
3594
3595
3596 /*
3597 * log2 implementation which doesn't have special code to
3598 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3599 * the results for those cases are undefined.
3600 */
3601 LLVMValueRef
3602 lp_build_log2(struct lp_build_context *bld,
3603 LLVMValueRef x)
3604 {
3605 LLVMValueRef res;
3606 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3607 return res;
3608 }
3609
3610 /*
3611 * Version of log2 which handles all edge cases.
3612 * Look at documentation of lp_build_log2_approx for
3613 * description of the behavior for each of the edge cases.
3614 */
3615 LLVMValueRef
3616 lp_build_log2_safe(struct lp_build_context *bld,
3617 LLVMValueRef x)
3618 {
3619 LLVMValueRef res;
3620 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3621 return res;
3622 }
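
/*
 * Usage sketch (hypothetical caller, for illustration only): code lowering
 * a shader LOG opcode that must honour log2(0) = -inf and log2(x < 0) = NaN
 * would use the safe variant, while helpers that already guarantee a
 * positive finite argument can use the cheaper one:
 *
 *    LLVMValueRef strict  = lp_build_log2_safe(&bld, src);
 *    LLVMValueRef relaxed = lp_build_log2(&bld, src);
 */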
3623
3624
3625 /**
3626 * Faster (and less accurate) log2.
3627 *
3628 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3629 *
3630 * Piece-wise linear approximation, with exact results when x is a
3631 * power of two.
3632 *
3633 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3634 */
3635 LLVMValueRef
3636 lp_build_fast_log2(struct lp_build_context *bld,
3637 LLVMValueRef x)
3638 {
3639 LLVMBuilderRef builder = bld->gallivm->builder;
3640 LLVMValueRef ipart;
3641 LLVMValueRef fpart;
3642
3643 assert(lp_check_value(bld->type, x));
3644
3645 assert(bld->type.floating);
3646
3647 /* ipart = floor(log2(x)) - 1 */
3648 ipart = lp_build_extract_exponent(bld, x, -1);
3649 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3650
3651 /* fpart = x / 2**ipart */
3652 fpart = lp_build_extract_mantissa(bld, x);
3653
3654 /* ipart + fpart */
3655 return LLVMBuildFAdd(builder, ipart, fpart, "");
3656 }
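
/*
 * Worked example (illustrative only): for x = 3.0 the exponent is 1, so
 * ipart = 1 - 1 = 0 and fpart = 3.0 / 2^1 = 1.5, giving 1.5 as the
 * approximation of log2(3) ~= 1.585.  For x an exact power of two,
 * fpart = 1.0 and ipart = exponent(x) - 1, so the sum is exact.
 */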
3657
3658
3659 /**
3660 * Fast implementation of iround(log2(x)).
3661 *
3662 * Not an approximation -- it should give accurate results all the time.
3663 */
3664 LLVMValueRef
3665 lp_build_ilog2(struct lp_build_context *bld,
3666 LLVMValueRef x)
3667 {
3668 LLVMBuilderRef builder = bld->gallivm->builder;
3669 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3670 LLVMValueRef ipart;
3671
3672 assert(bld->type.floating);
3673
3674 assert(lp_check_value(bld->type, x));
3675
3676    /* x * 2^0.5, i.e., add 0.5 to log2(x) */
3677 x = LLVMBuildFMul(builder, x, sqrt2, "");
3678
3679 /* ipart = floor(log2(x) + 0.5) */
3680 ipart = lp_build_extract_exponent(bld, x, 0);
3681
3682 return ipart;
3683 }
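
/*
 * Why the sqrt(2) scaling rounds (illustrative note): the extracted exponent
 * is floor(log2(x)), and log2(x * sqrt(2)) = log2(x) + 0.5, so the result is
 * floor(log2(x) + 0.5), i.e. log2(x) rounded to the nearest integer.  For
 * example x = 5.0: log2(5) ~= 2.32, and 5 * sqrt(2) ~= 7.07 has exponent 2.
 */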
3684
3685 LLVMValueRef
3686 lp_build_mod(struct lp_build_context *bld,
3687 LLVMValueRef x,
3688 LLVMValueRef y)
3689 {
3690 LLVMBuilderRef builder = bld->gallivm->builder;
3691 LLVMValueRef res;
3692 const struct lp_type type = bld->type;
3693
3694 assert(lp_check_value(type, x));
3695 assert(lp_check_value(type, y));
3696
3697 if (type.floating)
3698 res = LLVMBuildFRem(builder, x, y, "");
3699 else if (type.sign)
3700 res = LLVMBuildSRem(builder, x, y, "");
3701 else
3702 res = LLVMBuildURem(builder, x, y, "");
3703 return res;
3704 }
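
/*
 * Note (not from the original source): LLVM's Rem instructions follow C
 * remainder semantics, i.e. the result takes the sign of the dividend, so
 * lp_build_mod() over signed integers yields -1 for (-7) mod 3 rather than
 * the non-negative 2.  Callers needing a wrapped/positive modulo must fix
 * the result up themselves.
 */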
3705
3706
3707 /*
3708  * For a floating point input, creates and returns a mask
3709  * which is all 1's for channels which are NaN and
3710  * all 0's for channels which are not.
3711 */
3712 LLVMValueRef
3713 lp_build_isnan(struct lp_build_context *bld,
3714 LLVMValueRef x)
3715 {
3716 LLVMValueRef mask;
3717 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3718
3719 assert(bld->type.floating);
3720 assert(lp_check_value(bld->type, x));
3721
3722 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3723 "isnotnan");
3724 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3725 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3726 return mask;
3727 }
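
/*
 * The trick above relies on IEEE semantics: an ordered self-compare
 * (LLVMRealOEQ of x with x) is true for every value except NaN, so negating
 * it leaves 1's exactly in the NaN channels.  Usage sketch (hypothetical
 * caller, assuming a float build context bld):
 *
 *    LLVMValueRef nan_mask = lp_build_isnan(&bld, x);
 *    x = lp_build_select(&bld, nan_mask, bld.zero, x);   flush NaNs to zero
 */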
3728
3729 /* Returns all 1's for channels holding finite floating
3730  * point numbers and all 0's for channels holding
3731  * -inf, +inf or NaN. */
3732 LLVMValueRef
3733 lp_build_isfinite(struct lp_build_context *bld,
3734 LLVMValueRef x)
3735 {
3736 LLVMBuilderRef builder = bld->gallivm->builder;
3737 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3738 struct lp_type int_type = lp_int_type(bld->type);
3739 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3740 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3741 0x7f800000);
3742
3743 if (!bld->type.floating) {
3744 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3745 }
3746 assert(bld->type.floating);
3747 assert(lp_check_value(bld->type, x));
3748 assert(bld->type.width == 32);
3749
3750 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3751 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3752 intx, infornan32);
3753 }
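
/*
 * Bit-level rationale (illustrative): in IEEE single precision a value is
 * +/-inf or NaN exactly when all eight exponent bits are set, i.e. when
 * (bits & 0x7f800000) == 0x7f800000.  The NOTEQUAL compare above therefore
 * yields all 1's precisely for the finite channels, including zeros and
 * denormals.
 */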
3754
3755 /*
3756  * Returns a per-channel mask which is all 1's when the value is NaN
3757  * or +/-inf and all 0's otherwise. The input has to be a floating point vector.
3758 */
3759 LLVMValueRef
3760 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3761 const struct lp_type type,
3762 LLVMValueRef x)
3763 {
3764 LLVMBuilderRef builder = gallivm->builder;
3765 struct lp_type int_type = lp_int_type(type);
3766 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3767 0x7f800000);
3768 LLVMValueRef ret;
3769
3770 assert(type.floating);
3771
3772 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3773 ret = LLVMBuildAnd(builder, ret, const0, "");
3774 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3775 ret, const0);
3776
3777 return ret;
3778 }
3779
3780
3781 LLVMValueRef
3782 lp_build_fpstate_get(struct gallivm_state *gallivm)
3783 {
3784 if (util_cpu_caps.has_sse) {
3785 LLVMBuilderRef builder = gallivm->builder;
3786 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3787 gallivm,
3788 LLVMInt32TypeInContext(gallivm->context),
3789 "mxcsr_ptr");
3790 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3791 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3792 lp_build_intrinsic(builder,
3793 "llvm.x86.sse.stmxcsr",
3794 LLVMVoidTypeInContext(gallivm->context),
3795 &mxcsr_ptr8, 1, 0);
3796 return mxcsr_ptr;
3797 }
3798 return 0;
3799 }
3800
3801 void
3802 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3803 boolean zero)
3804 {
3805 if (util_cpu_caps.has_sse) {
3806       /* turn on FTZ (32768), plus DAZ (64) if available: 32768 | 64 = 32832 */
3807 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3808
3809 LLVMBuilderRef builder = gallivm->builder;
3810 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3811 LLVMValueRef mxcsr =
3812 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3813
3814 if (util_cpu_caps.has_daz) {
3815          /* Enable denormals-are-zero (DAZ) mode */
3816 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3817 }
3818 if (zero) {
3819 mxcsr = LLVMBuildOr(builder, mxcsr,
3820 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3821 } else {
3822 mxcsr = LLVMBuildAnd(builder, mxcsr,
3823 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3824 }
3825
3826 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3827 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3828 }
3829 }
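
/*
 * MXCSR bit reminder (not from the original comments): FTZ is bit 15
 * (_MM_FLUSH_ZERO_MASK, 0x8000) and DAZ is bit 6 (_MM_DENORMALS_ZERO_MASK,
 * 0x0040).  Both are control bits that stay set until explicitly cleared,
 * hence the AND with the complement in the !zero branch above.
 */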
3830
3831 void
3832 lp_build_fpstate_set(struct gallivm_state *gallivm,
3833 LLVMValueRef mxcsr_ptr)
3834 {
3835 if (util_cpu_caps.has_sse) {
3836 LLVMBuilderRef builder = gallivm->builder;
3837 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3838 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3839 lp_build_intrinsic(builder,
3840 "llvm.x86.sse.ldmxcsr",
3841 LLVMVoidTypeInContext(gallivm->context),
3842 &mxcsr_ptr, 1, 0);
3843 }
3844 }
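
/*
 * Typical save/modify/restore pattern (a sketch, assuming an SSE-capable
 * host; elsewhere these helpers are no-ops and lp_build_fpstate_get()
 * returns 0):
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that benefits from FTZ/DAZ ...
 *    lp_build_fpstate_set(gallivm, saved);
 */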