mesa.git: src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when the source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for the special case values a or b being 0 or 1 are done.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle NaNs for floating point numbers. If one of the
213 * inputs is a NaN the other should be returned (required by both D3D10+
214 * and OpenCL).
215 * The sse intrinsics return the second operand in case of a NaN by
216 * default, so we need special code to handle those cases.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
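/*
 * Illustrative only (hence #if 0): a minimal scalar sketch, assuming isnan()
 * from <math.h> and hypothetical helper names, of how the
 * GALLIVM_NAN_RETURN_OTHER fix-up above patches the SSE min semantics.
 */
#if 0
#include <math.h>

/* SSE MINPS-like semantics: the comparison is false when either input
 * is a NaN, so the second operand is returned in that case. */
static inline float
sse_like_min(float a, float b)
{
   return a < b ? a : b;
}

/* GALLIVM_NAN_RETURN_OTHER: if one input is a NaN, return the other.
 * If a is a NaN, sse_like_min() already yields b; if b is a NaN it would
 * yield the NaN, so that case is overridden with a, mirroring the
 * lp_build_select() above. */
static inline float
min_nan_return_other(float a, float b)
{
   float m = sse_like_min(a, b);
   return isnan(b) ? a : m;
}
#endif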
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303 * No checks for the special case values a or b being 0 or 1 are done.
304 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if (a == bld->zero)
545 return b;
546 if (b == bld->zero)
547 return a;
548 if (a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if (type.norm) {
552 const char *intrinsic = NULL;
553
554 if (!type.sign && (a == bld->one || b == bld->one))
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if (util_cpu_caps.has_sse2) {
560 if (type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
562 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
563 if (type.width == 16)
564 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
565 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
566 } else if (util_cpu_caps.has_altivec) {
567 if (type.width == 8)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
569 if (type.width == 16)
570 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
571 }
572 }
573 if (type.width * type.length == 256) {
574 if (util_cpu_caps.has_avx2) {
575 if (type.width == 8)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
577 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
578 if (type.width == 16)
579 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
580 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
581 }
582 }
583 }
584
585 if (intrinsic)
586 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
587 }
588
589 if(type.norm && !type.floating && !type.fixed) {
590 if (type.sign) {
591 uint64_t sign = (uint64_t)1 << (type.width - 1);
592 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
593 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
594 /* a_clamp_max is the maximum a for positive b,
595 a_clamp_min is the minimum a for negative b. */
596 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
598 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
599 }
600 }
601
602 if(LLVMIsConstant(a) && LLVMIsConstant(b))
603 if (type.floating)
604 res = LLVMConstFAdd(a, b);
605 else
606 res = LLVMConstAdd(a, b);
607 else
608 if (type.floating)
609 res = LLVMBuildFAdd(builder, a, b, "");
610 else
611 res = LLVMBuildAdd(builder, a, b, "");
612
613 /* clamp to ceiling of 1.0 */
614 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
615 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
616
617 if (type.norm && !type.floating && !type.fixed) {
618 if (!type.sign) {
619 /*
620 * newer llvm versions no longer support the intrinsics, but recognize
621 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
622 * code, it is important we match the pattern llvm uses (and pray llvm
623 * doesn't change it - and hope they decide on the same pattern for
624 * all backends supporting it...).
625 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
626 * interfere with llvm's ability to recognize the pattern but seems
627 * a bit brittle.
628 */
629 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
630 res = lp_build_select(bld, overflowed,
631 LLVMConstAllOnes(bld->int_vec_type), res);
632 }
633 }
634
635 /* XXX clamp to floor of -1 or 0??? */
636
637 return res;
638 }
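/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the unsigned saturated-add pattern emitted above, which newer llvm
 * versions recognize and turn into a native saturating add.
 */
#if 0
#include <stdint.h>

static inline uint8_t
u8_add_sat(uint8_t a, uint8_t b)
{
   uint8_t res = (uint8_t)(a + b);
   /* If the wrapped sum is smaller than an operand the add overflowed,
    * so clamp to all-ones (cf. the cmp/select above). */
   return (a > res) ? 0xff : res;
}
#endif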
639
640
641 /** Return the scalar sum of the elements of a.
642 * This operation should be avoided whenever possible.
643 */
644 LLVMValueRef
645 lp_build_horizontal_add(struct lp_build_context *bld,
646 LLVMValueRef a)
647 {
648 LLVMBuilderRef builder = bld->gallivm->builder;
649 const struct lp_type type = bld->type;
650 LLVMValueRef index, res;
651 unsigned i, length;
652 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
653 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
654 LLVMValueRef vecres, elem2;
655
656 assert(lp_check_value(type, a));
657
658 if (type.length == 1) {
659 return a;
660 }
661
662 assert(!bld->type.norm);
663
664 /*
665 * For byte vectors we could do much better with psadbw.
666 * We use repeated shuffle/adds here. Note that with multiple vectors
667 * this can be done more efficiently, as outlined in the Intel
668 * optimization manual.
669 * Note: could cause data rearrangement if used with smaller element
670 * sizes.
671 */
672
673 vecres = a;
674 length = type.length / 2;
675 while (length > 1) {
676 LLVMValueRef vec1, vec2;
677 for (i = 0; i < length; i++) {
678 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
679 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
680 }
681 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
682 LLVMConstVector(shuffles1, length), "");
683 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
684 LLVMConstVector(shuffles2, length), "");
685 if (type.floating) {
686 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
687 }
688 else {
689 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
690 }
691 length = length >> 1;
692 }
693
694 /* always have vector of size 2 here */
695 assert(length == 1);
696
697 index = lp_build_const_int32(bld->gallivm, 0);
698 res = LLVMBuildExtractElement(builder, vecres, index, "");
699 index = lp_build_const_int32(bld->gallivm, 1);
700 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
701
702 if (type.floating)
703 res = LLVMBuildFAdd(builder, res, elem2, "");
704 else
705 res = LLVMBuildAdd(builder, res, elem2, "");
706
707 return res;
708 }
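/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the log2 shuffle/add reduction performed above, for a power-of-two
 * length >= 2.
 */
#if 0
static float
horizontal_add_ref(const float *v, unsigned length)
{
   float tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;
   for (i = 0; i < length; i++)
      tmp[i] = v[i];
   /* Repeatedly fold the upper half onto the lower half. */
   while (length > 2) {
      length /= 2;
      for (i = 0; i < length; i++)
         tmp[i] += tmp[i + length];
   }
   return tmp[0] + tmp[1];
}
#endif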
709
710 /**
711 * Return the horizontal sums of 4 float vectors as a float4 vector.
712 * This uses the technique outlined in the Intel Optimization Manual.
713 */
714 static LLVMValueRef
715 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
716 LLVMValueRef src[4])
717 {
718 struct gallivm_state *gallivm = bld->gallivm;
719 LLVMBuilderRef builder = gallivm->builder;
720 LLVMValueRef shuffles[4];
721 LLVMValueRef tmp[4];
722 LLVMValueRef sumtmp[2], shuftmp[2];
723
724 /* lower half of regs */
725 shuffles[0] = lp_build_const_int32(gallivm, 0);
726 shuffles[1] = lp_build_const_int32(gallivm, 1);
727 shuffles[2] = lp_build_const_int32(gallivm, 4);
728 shuffles[3] = lp_build_const_int32(gallivm, 5);
729 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
730 LLVMConstVector(shuffles, 4), "");
731 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
732 LLVMConstVector(shuffles, 4), "");
733
734 /* upper half of regs */
735 shuffles[0] = lp_build_const_int32(gallivm, 2);
736 shuffles[1] = lp_build_const_int32(gallivm, 3);
737 shuffles[2] = lp_build_const_int32(gallivm, 6);
738 shuffles[3] = lp_build_const_int32(gallivm, 7);
739 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
740 LLVMConstVector(shuffles, 4), "");
741 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
742 LLVMConstVector(shuffles, 4), "");
743
744 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
745 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
746
747 shuffles[0] = lp_build_const_int32(gallivm, 0);
748 shuffles[1] = lp_build_const_int32(gallivm, 2);
749 shuffles[2] = lp_build_const_int32(gallivm, 4);
750 shuffles[3] = lp_build_const_int32(gallivm, 6);
751 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
752 LLVMConstVector(shuffles, 4), "");
753
754 shuffles[0] = lp_build_const_int32(gallivm, 1);
755 shuffles[1] = lp_build_const_int32(gallivm, 3);
756 shuffles[2] = lp_build_const_int32(gallivm, 5);
757 shuffles[3] = lp_build_const_int32(gallivm, 7);
758 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
759 LLVMConstVector(shuffles, 4), "");
760
761 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
762 }
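/*
 * Illustrative only (hence #if 0): a scalar reference, with a hypothetical
 * helper name, for what the shuffle network above computes: lane i of the
 * result is the horizontal sum of the four elements of src[i].
 */
#if 0
static void
horizontal_add4x4f_ref(const float src[4][4], float dst[4])
{
   unsigned i;
   for (i = 0; i < 4; i++)
      dst[i] = src[i][0] + src[i][1] + src[i][2] + src[i][3];
}
#endif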
763
764
765 /*
766 * partially horizontally add 2-4 float vectors with length nx4,
767 * i.e. only four adjacent values in each vector will be added,
768 * assuming values are really grouped in 4 which also determines
769 * output order.
770 *
771 * Return a vector of the same length as the initial vectors,
772 * with the excess elements (if any) being undefined.
773 * The element order is independent of number of input vectors.
774 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
775 * the output order thus will be
776 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
777 */
778 LLVMValueRef
779 lp_build_hadd_partial4(struct lp_build_context *bld,
780 LLVMValueRef vectors[],
781 unsigned num_vecs)
782 {
783 struct gallivm_state *gallivm = bld->gallivm;
784 LLVMBuilderRef builder = gallivm->builder;
785 LLVMValueRef ret_vec;
786 LLVMValueRef tmp[4];
787 const char *intrinsic = NULL;
788
789 assert(num_vecs >= 2 && num_vecs <= 4);
790 assert(bld->type.floating);
791
792 /* only use this with at least 2 vectors, as it is sort of expensive
793 * (depending on cpu) and we always need two horizontal adds anyway,
794 * so a shuffle/add approach might be better.
795 */
796
797 tmp[0] = vectors[0];
798 tmp[1] = vectors[1];
799
800 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
801 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
802
803 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
804 bld->type.length == 4) {
805 intrinsic = "llvm.x86.sse3.hadd.ps";
806 }
807 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
808 bld->type.length == 8) {
809 intrinsic = "llvm.x86.avx.hadd.ps.256";
810 }
811 if (intrinsic) {
812 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
813 lp_build_vec_type(gallivm, bld->type),
814 tmp[0], tmp[1]);
815 if (num_vecs > 2) {
816 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
817 lp_build_vec_type(gallivm, bld->type),
818 tmp[2], tmp[3]);
819 }
820 else {
821 tmp[1] = tmp[0];
822 }
823 return lp_build_intrinsic_binary(builder, intrinsic,
824 lp_build_vec_type(gallivm, bld->type),
825 tmp[0], tmp[1]);
826 }
827
828 if (bld->type.length == 4) {
829 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
830 }
831 else {
832 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
833 unsigned j;
834 unsigned num_iter = bld->type.length / 4;
835 struct lp_type parttype = bld->type;
836 parttype.length = 4;
837 for (j = 0; j < num_iter; j++) {
838 LLVMValueRef partsrc[4];
839 unsigned i;
840 for (i = 0; i < 4; i++) {
841 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
842 }
843 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
844 }
845 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
846 }
847 return ret_vec;
848 }
849
850 /**
851 * Generate a - b
852 */
853 LLVMValueRef
854 lp_build_sub(struct lp_build_context *bld,
855 LLVMValueRef a,
856 LLVMValueRef b)
857 {
858 LLVMBuilderRef builder = bld->gallivm->builder;
859 const struct lp_type type = bld->type;
860 LLVMValueRef res;
861
862 assert(lp_check_value(type, a));
863 assert(lp_check_value(type, b));
864
865 if (b == bld->zero)
866 return a;
867 if (a == bld->undef || b == bld->undef)
868 return bld->undef;
869 if (a == b)
870 return bld->zero;
871
872 if (type.norm) {
873 const char *intrinsic = NULL;
874
875 if (!type.sign && b == bld->one)
876 return bld->zero;
877
878 if (!type.floating && !type.fixed) {
879 if (type.width * type.length == 128) {
880 if (util_cpu_caps.has_sse2) {
881 if (type.width == 8)
882 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
883 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
884 if (type.width == 16)
885 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
886 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
887 } else if (util_cpu_caps.has_altivec) {
888 if (type.width == 8)
889 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
890 if (type.width == 16)
891 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
892 }
893 }
894 if (type.width * type.length == 256) {
895 if (util_cpu_caps.has_avx2) {
896 if (type.width == 8)
897 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
898 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
899 if (type.width == 16)
900 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
901 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
902 }
903 }
904 }
905
906 if (intrinsic)
907 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
908 }
909
910 if(type.norm && !type.floating && !type.fixed) {
911 if (type.sign) {
912 uint64_t sign = (uint64_t)1 << (type.width - 1);
913 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
914 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
915 /* a_clamp_max is the maximum a for negative b,
916 a_clamp_min is the minimum a for positive b. */
917 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
918 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
919 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
920 } else {
921 /*
922 * This must match the llvm pattern for saturated unsigned subtraction.
923 * (lp_build_max_simple actually does the job with its current
924 * definition but do it explicitly here.)
925 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
926 * interfere with llvm's ability to recognize the pattern but seems
927 * a bit brittle.
928 */
929 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
930 a = lp_build_select(bld, no_ov, a, b);
931 }
932 }
933
934 if(LLVMIsConstant(a) && LLVMIsConstant(b))
935 if (type.floating)
936 res = LLVMConstFSub(a, b);
937 else
938 res = LLVMConstSub(a, b);
939 else
940 if (type.floating)
941 res = LLVMBuildFSub(builder, a, b, "");
942 else
943 res = LLVMBuildSub(builder, a, b, "");
944
945 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
946 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
947
948 return res;
949 }
950
951
952
953 /**
954 * Normalized multiplication.
955 *
956 * There are several approaches (using 8-bit normalized multiplication as
957 * an example):
958 *
959 * - alpha plus one
960 *
961 * makes the following approximation to the division (Sree)
962 *
963 * a*b/255 ~= (a*(b + 1)) >> 8
964 *
965 * which is the fastest method that satisfies the following OpenGL criteria:
966 *
967 * 0*0 = 0 and 255*255 = 255
968 *
969 * - geometric series
970 *
971 * takes the geometric series approximation to the division
972 *
973 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
974 *
975 * in this case just the first two terms to fit in 16bit arithmetic
976 *
977 * t/255 ~= (t + (t >> 8)) >> 8
978 *
979 * note that just by itself it doesn't satisfy the OpenGL criteria, as
980 * 255*255 = 254, so the special case b = 255 must be accounted for, or
981 * rounding must be used.
982 *
983 * - geometric series plus rounding
984 *
985 * when using a geometric series division instead of truncating the result,
986 * use rounding in the approximation (Jim Blinn)
987 *
988 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
989 *
990 * achieving exact results (a scalar sketch follows lp_build_mul_norm below).
991 *
992 *
993 *
994 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
995 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
996 * @sa Michael Herf, The "double blend trick", May 2000,
997 * http://www.stereopsis.com/doubleblend.html
998 */
999 LLVMValueRef
1000 lp_build_mul_norm(struct gallivm_state *gallivm,
1001 struct lp_type wide_type,
1002 LLVMValueRef a, LLVMValueRef b)
1003 {
1004 LLVMBuilderRef builder = gallivm->builder;
1005 struct lp_build_context bld;
1006 unsigned n;
1007 LLVMValueRef half;
1008 LLVMValueRef ab;
1009
1010 assert(!wide_type.floating);
1011 assert(lp_check_value(wide_type, a));
1012 assert(lp_check_value(wide_type, b));
1013
1014 lp_build_context_init(&bld, gallivm, wide_type);
1015
1016 n = wide_type.width / 2;
1017 if (wide_type.sign) {
1018 --n;
1019 }
1020
1021 /*
1022 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1023 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1024 */
1025
1026 /*
1027 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1028 */
1029
1030 ab = LLVMBuildMul(builder, a, b, "");
1031 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1032
1033 /*
1034 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1035 */
1036
1037 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1038 if (wide_type.sign) {
1039 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1040 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1041 half = lp_build_select(&bld, sign, minus_half, half);
1042 }
1043 ab = LLVMBuildAdd(builder, ab, half, "");
1044
1045 /* Final division */
1046 ab = lp_build_shr_imm(&bld, ab, n);
1047
1048 return ab;
1049 }
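/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the formula above for 8-bit unsigned normalized values (n = 8,
 * half = 0x80), i.e. Blinn's rounding trick; in particular 0*0 = 0 and
 * 255*255 = 255 hold as required.
 */
#if 0
#include <stdint.h>

static inline uint8_t
mul_norm_u8(uint8_t a, uint8_t b)
{
   uint32_t ab = (uint32_t)a * b;   /* widened product        */
   ab = ab + (ab >> 8);             /* geometric series terms */
   ab = ab + 0x80;                  /* rounding term          */
   return (uint8_t)(ab >> 8);       /* final division         */
}
#endif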
1050
1051 /**
1052 * Generate a * b
1053 */
1054 LLVMValueRef
1055 lp_build_mul(struct lp_build_context *bld,
1056 LLVMValueRef a,
1057 LLVMValueRef b)
1058 {
1059 LLVMBuilderRef builder = bld->gallivm->builder;
1060 const struct lp_type type = bld->type;
1061 LLVMValueRef shift;
1062 LLVMValueRef res;
1063
1064 assert(lp_check_value(type, a));
1065 assert(lp_check_value(type, b));
1066
1067 if(a == bld->zero)
1068 return bld->zero;
1069 if(a == bld->one)
1070 return b;
1071 if(b == bld->zero)
1072 return bld->zero;
1073 if(b == bld->one)
1074 return a;
1075 if(a == bld->undef || b == bld->undef)
1076 return bld->undef;
1077
1078 if (!type.floating && !type.fixed && type.norm) {
1079 struct lp_type wide_type = lp_wider_type(type);
1080 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1081
1082 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1083 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1084
1085 /* PMULLW, PSRLW, PADDW */
1086 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1087 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1088
1089 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1090
1091 return ab;
1092 }
1093
1094 if(type.fixed)
1095 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1096 else
1097 shift = NULL;
1098
1099 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1100 if (type.floating)
1101 res = LLVMConstFMul(a, b);
1102 else
1103 res = LLVMConstMul(a, b);
1104 if(shift) {
1105 if(type.sign)
1106 res = LLVMConstAShr(res, shift);
1107 else
1108 res = LLVMConstLShr(res, shift);
1109 }
1110 }
1111 else {
1112 if (type.floating)
1113 res = LLVMBuildFMul(builder, a, b, "");
1114 else
1115 res = LLVMBuildMul(builder, a, b, "");
1116 if(shift) {
1117 if(type.sign)
1118 res = LLVMBuildAShr(builder, res, shift, "");
1119 else
1120 res = LLVMBuildLShr(builder, res, shift, "");
1121 }
1122 }
1123
1124 return res;
1125 }
1126
1127 /*
1128 * Widening mul, valid for 32x32 bit -> 64bit only.
1129 * Result is low 32bits, high bits returned in res_hi.
1130 *
1131 * Emits code that is meant to be compiled for the host CPU.
1132 */
1133 LLVMValueRef
1134 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1135 LLVMValueRef a,
1136 LLVMValueRef b,
1137 LLVMValueRef *res_hi)
1138 {
1139 struct gallivm_state *gallivm = bld->gallivm;
1140 LLVMBuilderRef builder = gallivm->builder;
1141
1142 assert(bld->type.width == 32);
1143 assert(bld->type.floating == 0);
1144 assert(bld->type.fixed == 0);
1145 assert(bld->type.norm == 0);
1146
1147 /*
1148 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1149 * for x86 simd is atrocious (even if the high bits weren't required),
1150 * trying to handle real 64bit inputs (which of course can't happen due
1151 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1152 * apparently llvm does not recognize this widening mul). This includes 6
1153 * (instead of 2) pmuludq instructions plus extra adds and shifts.
1154 * The same story applies to signed mul, albeit fixing this requires sse41.
1155 * https://llvm.org/bugs/show_bug.cgi?id=30845
1156 * So, whip up our own code, albeit only for length 4 and 8 (which
1157 * should be good enough)...
1158 */
1159 if ((bld->type.length == 4 || bld->type.length == 8) &&
1160 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1161 util_cpu_caps.has_sse4_1)) {
1162 const char *intrinsic = NULL;
1163 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1164 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1165 struct lp_type type_wide = lp_wider_type(bld->type);
1166 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1167 unsigned i;
1168 for (i = 0; i < bld->type.length; i += 2) {
1169 shuf[i] = lp_build_const_int32(gallivm, i+1);
1170 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1171 }
1172 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1173 aeven = a;
1174 beven = b;
1175 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1176 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1177
1178 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1179 if (bld->type.sign) {
1180 intrinsic = "llvm.x86.avx2.pmul.dq";
1181 } else {
1182 intrinsic = "llvm.x86.avx2.pmulu.dq";
1183 }
1184 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1185 wider_type, aeven, beven);
1186 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1187 wider_type, aodd, bodd);
1188 }
1189 else {
1190 /* for consistent naming look elsewhere... */
1191 if (bld->type.sign) {
1192 intrinsic = "llvm.x86.sse41.pmuldq";
1193 } else {
1194 intrinsic = "llvm.x86.sse2.pmulu.dq";
1195 }
1196 /*
1197 * XXX If we only have AVX but not AVX2 this is a pain.
1198 * lp_build_intrinsic_binary_anylength() can't handle it
1199 * (due to src and dst type not being identical).
1200 */
1201 if (bld->type.length == 8) {
1202 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1203 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1204 LLVMValueRef muleven2[2], mulodd2[2];
1205 struct lp_type type_wide_half = type_wide;
1206 LLVMTypeRef wtype_half;
1207 type_wide_half.length = 2;
1208 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1209 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1210 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1211 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1212 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1213 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1214 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1215 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1216 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1217 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1218 wtype_half, aevenlo, bevenlo);
1219 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1220 wtype_half, aoddlo, boddlo);
1221 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1222 wtype_half, aevenhi, bevenhi);
1223 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1224 wtype_half, aoddhi, boddhi);
1225 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1226 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1227
1228 }
1229 else {
1230 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1231 wider_type, aeven, beven);
1232 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1233 wider_type, aodd, bodd);
1234 }
1235 }
1236 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1237 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1238
1239 for (i = 0; i < bld->type.length; i += 2) {
1240 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1241 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1242 }
1243 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1244 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1245
1246 for (i = 0; i < bld->type.length; i += 2) {
1247 shuf[i] = lp_build_const_int32(gallivm, i);
1248 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1249 }
1250 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1251 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1252 }
1253 else {
1254 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1255 }
1256 }
1257
1258
1259 /*
1260 * Widening mul, valid for 32x32 bit -> 64bit only.
1261 * Result is low 32bits, high bits returned in res_hi.
1262 *
1263 * Emits generic code.
1264 */
1265 LLVMValueRef
1266 lp_build_mul_32_lohi(struct lp_build_context *bld,
1267 LLVMValueRef a,
1268 LLVMValueRef b,
1269 LLVMValueRef *res_hi)
1270 {
1271 struct gallivm_state *gallivm = bld->gallivm;
1272 LLVMBuilderRef builder = gallivm->builder;
1273 LLVMValueRef tmp, shift, res_lo;
1274 struct lp_type type_tmp;
1275 LLVMTypeRef wide_type, narrow_type;
1276
1277 type_tmp = bld->type;
1278 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1279 type_tmp.width *= 2;
1280 wide_type = lp_build_vec_type(gallivm, type_tmp);
1281 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1282
1283 if (bld->type.sign) {
1284 a = LLVMBuildSExt(builder, a, wide_type, "");
1285 b = LLVMBuildSExt(builder, b, wide_type, "");
1286 } else {
1287 a = LLVMBuildZExt(builder, a, wide_type, "");
1288 b = LLVMBuildZExt(builder, b, wide_type, "");
1289 }
1290 tmp = LLVMBuildMul(builder, a, b, "");
1291
1292 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1293
1294 /* Since we truncate anyway, LShr and AShr are equivalent. */
1295 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1296 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1297
1298 return res_lo;
1299 }
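/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the generic zext/mul/shift/trunc sequence above, for the unsigned
 * case.
 */
#if 0
#include <stdint.h>

static inline uint32_t
mul_32_lohi_ref(uint32_t a, uint32_t b, uint32_t *res_hi)
{
   uint64_t wide = (uint64_t)a * b;   /* full 64-bit product */
   *res_hi = (uint32_t)(wide >> 32);  /* high 32 bits        */
   return (uint32_t)wide;             /* low 32 bits         */
}
#endif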
1300
1301
1302 /* a * b + c */
1303 LLVMValueRef
1304 lp_build_mad(struct lp_build_context *bld,
1305 LLVMValueRef a,
1306 LLVMValueRef b,
1307 LLVMValueRef c)
1308 {
1309 const struct lp_type type = bld->type;
1310 if (type.floating) {
1311 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1312 } else {
1313 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1314 }
1315 }
1316
1317
1318 /**
1319 * Small vector x scale multiplication optimization.
1320 */
1321 LLVMValueRef
1322 lp_build_mul_imm(struct lp_build_context *bld,
1323 LLVMValueRef a,
1324 int b)
1325 {
1326 LLVMBuilderRef builder = bld->gallivm->builder;
1327 LLVMValueRef factor;
1328
1329 assert(lp_check_value(bld->type, a));
1330
1331 if(b == 0)
1332 return bld->zero;
1333
1334 if(b == 1)
1335 return a;
1336
1337 if(b == -1)
1338 return lp_build_negate(bld, a);
1339
1340 if(b == 2 && bld->type.floating)
1341 return lp_build_add(bld, a, a);
1342
1343 if(util_is_power_of_two_or_zero(b)) {
1344 unsigned shift = ffs(b) - 1;
1345
1346 if(bld->type.floating) {
1347 #if 0
1348 /*
1349 * Power of two multiplication by directly manipulating the exponent.
1350 *
1351 * XXX: This might not always be faster; it will introduce a small error
1352 * for multiplication by zero, and it will produce wrong results
1353 * for Inf and NaN.
1354 */
1355 unsigned mantissa = lp_mantissa(bld->type);
1356 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1357 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1358 a = LLVMBuildAdd(builder, a, factor, "");
1359 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1360 return a;
1361 #endif
1362 }
1363 else {
1364 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1365 return LLVMBuildShl(builder, a, factor, "");
1366 }
1367 }
1368
1369 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1370 return lp_build_mul(bld, a, factor);
1371 }
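/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the disabled exponent trick above: for a finite, non-zero float
 * (and no exponent overflow/underflow), multiplying by 2**k amounts to adding
 * k to the biased exponent, i.e. adding k << 23 to the bit pattern. As noted
 * above, this is wrong for 0, Inf and NaN.
 */
#if 0
#include <stdint.h>
#include <string.h>

static inline float
mul_pow2_sketch(float x, int k)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   bits += (uint32_t)k << 23;        /* bump the biased exponent by k */
   memcpy(&x, &bits, sizeof bits);
   return x;
}
#endif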
1372
1373
1374 /**
1375 * Generate a / b
1376 */
1377 LLVMValueRef
1378 lp_build_div(struct lp_build_context *bld,
1379 LLVMValueRef a,
1380 LLVMValueRef b)
1381 {
1382 LLVMBuilderRef builder = bld->gallivm->builder;
1383 const struct lp_type type = bld->type;
1384
1385 assert(lp_check_value(type, a));
1386 assert(lp_check_value(type, b));
1387
1388 if(a == bld->zero)
1389 return bld->zero;
1390 if(a == bld->one && type.floating)
1391 return lp_build_rcp(bld, b);
1392 if(b == bld->zero)
1393 return bld->undef;
1394 if(b == bld->one)
1395 return a;
1396 if(a == bld->undef || b == bld->undef)
1397 return bld->undef;
1398
1399 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1400 if (type.floating)
1401 return LLVMConstFDiv(a, b);
1402 else if (type.sign)
1403 return LLVMConstSDiv(a, b);
1404 else
1405 return LLVMConstUDiv(a, b);
1406 }
1407
1408 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1409 if(FALSE &&
1410 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1411 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1412 type.floating)
1413 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1414
1415 if (type.floating)
1416 return LLVMBuildFDiv(builder, a, b, "");
1417 else if (type.sign)
1418 return LLVMBuildSDiv(builder, a, b, "");
1419 else
1420 return LLVMBuildUDiv(builder, a, b, "");
1421 }
1422
1423
1424 /**
1425 * Linear interpolation helper.
1426 *
1427 * @param flags bitmask of LP_BLD_LERP_WIDE_NORMALIZED (normalized values
1428 * encoded in integers twice as wide) and LP_BLD_LERP_PRESCALED_WEIGHTS.
1429 *
1430 * @sa http://www.stereopsis.com/doubleblend.html
1431 */
1432 static inline LLVMValueRef
1433 lp_build_lerp_simple(struct lp_build_context *bld,
1434 LLVMValueRef x,
1435 LLVMValueRef v0,
1436 LLVMValueRef v1,
1437 unsigned flags)
1438 {
1439 unsigned half_width = bld->type.width/2;
1440 LLVMBuilderRef builder = bld->gallivm->builder;
1441 LLVMValueRef delta;
1442 LLVMValueRef res;
1443
1444 assert(lp_check_value(bld->type, x));
1445 assert(lp_check_value(bld->type, v0));
1446 assert(lp_check_value(bld->type, v1));
1447
1448 delta = lp_build_sub(bld, v1, v0);
1449
1450 if (bld->type.floating) {
1451 assert(flags == 0);
1452 return lp_build_mad(bld, x, delta, v0);
1453 }
1454
1455 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1456 if (!bld->type.sign) {
1457 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1458 /*
1459 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1460 * most-significant-bit to the lowest-significant-bit, so that
1461 * later we can just divide by 2**n instead of 2**n - 1.
1462 */
1463
1464 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1465 }
1466
1467 /* (x * delta) >> n */
1468 res = lp_build_mul(bld, x, delta);
1469 res = lp_build_shr_imm(bld, res, half_width);
1470 } else {
1471 /*
1472 * The rescaling trick above doesn't work for signed numbers, so
1473 * use the 2**n - 1 division approximation in lp_build_mul_norm
1474 * instead.
1475 */
1476 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1477 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1478 }
1479 } else {
1480 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1481 res = lp_build_mul(bld, x, delta);
1482 }
1483
1484 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1485 /*
1486 * At this point both res and v0 only use the lower half of the bits,
1487 * the rest is zero. Instead of add / mask, do add with half wide type.
1488 */
1489 struct lp_type narrow_type;
1490 struct lp_build_context narrow_bld;
1491
1492 memset(&narrow_type, 0, sizeof narrow_type);
1493 narrow_type.sign = bld->type.sign;
1494 narrow_type.width = bld->type.width/2;
1495 narrow_type.length = bld->type.length*2;
1496
1497 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1498 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1499 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1500 res = lp_build_add(&narrow_bld, v0, res);
1501 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1502 } else {
1503 res = lp_build_add(bld, v0, res);
1504
1505 if (bld->type.fixed) {
1506 /*
1507 * We need to mask out the high order bits when lerping 8-bit
1508 * normalized colors stored in 16 bits.
1509 */
1510 /* XXX: This step is necessary for lerping 8-bit colors stored in
1511 * 16 bits, but it will be wrong for true fixed point use cases.
1512 * Basically we need a more powerful lp_type, capable of further
1513 * distinguishing the values interpretation from the value storage.
1514 */
1515 LLVMValueRef low_bits;
1516 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1517 res = LLVMBuildAnd(builder, res, low_bits, "");
1518 }
1519 }
1520
1521 return res;
1522 }
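/*
 * Illustrative only (hence #if 0): a scalar sketch, with a hypothetical helper
 * name, of the unsigned LP_BLD_LERP_WIDE_NORMALIZED path above, with 8-bit
 * values and weights held in 16-bit lanes (half_width = 8). The real code
 * works on packed vectors with modular arithmetic; this just shows the math.
 * Assumes an arithmetic right shift for negative values.
 */
#if 0
#include <stdint.h>

static inline uint8_t
lerp_u8_sketch(uint8_t x8, uint8_t v0, uint8_t v1)
{
   int32_t x = x8;
   int32_t delta = (int32_t)v1 - (int32_t)v0;
   x = x + (x >> 7);                            /* [0, 255] -> [0, 256] */
   return (uint8_t)(v0 + ((x * delta) >> 8));   /* v0 + x * (v1 - v0)   */
}
#endif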
1523
1524
1525 /**
1526 * Linear interpolation.
1527 */
1528 LLVMValueRef
1529 lp_build_lerp(struct lp_build_context *bld,
1530 LLVMValueRef x,
1531 LLVMValueRef v0,
1532 LLVMValueRef v1,
1533 unsigned flags)
1534 {
1535 const struct lp_type type = bld->type;
1536 LLVMValueRef res;
1537
1538 assert(lp_check_value(type, x));
1539 assert(lp_check_value(type, v0));
1540 assert(lp_check_value(type, v1));
1541
1542 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1543
1544 if (type.norm) {
1545 struct lp_type wide_type;
1546 struct lp_build_context wide_bld;
1547 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1548
1549 assert(type.length >= 2);
1550
1551 /*
1552 * Create a wider integer type, enough to hold the
1553 * intermediate result of the multiplication.
1554 */
1555 memset(&wide_type, 0, sizeof wide_type);
1556 wide_type.sign = type.sign;
1557 wide_type.width = type.width*2;
1558 wide_type.length = type.length/2;
1559
1560 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1561
1562 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1563 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1564 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1565
1566 /*
1567 * Lerp both halves.
1568 */
1569
1570 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1571
1572 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1573 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1574
1575 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1576 } else {
1577 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1578 }
1579
1580 return res;
1581 }
1582
1583
1584 /**
1585 * Bilinear interpolation.
1586 *
1587 * Value indices are in v_{yx} order.
1588 */
1589 LLVMValueRef
1590 lp_build_lerp_2d(struct lp_build_context *bld,
1591 LLVMValueRef x,
1592 LLVMValueRef y,
1593 LLVMValueRef v00,
1594 LLVMValueRef v01,
1595 LLVMValueRef v10,
1596 LLVMValueRef v11,
1597 unsigned flags)
1598 {
1599 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1600 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1601 return lp_build_lerp(bld, y, v0, v1, flags);
1602 }
1603
1604
1605 LLVMValueRef
1606 lp_build_lerp_3d(struct lp_build_context *bld,
1607 LLVMValueRef x,
1608 LLVMValueRef y,
1609 LLVMValueRef z,
1610 LLVMValueRef v000,
1611 LLVMValueRef v001,
1612 LLVMValueRef v010,
1613 LLVMValueRef v011,
1614 LLVMValueRef v100,
1615 LLVMValueRef v101,
1616 LLVMValueRef v110,
1617 LLVMValueRef v111,
1618 unsigned flags)
1619 {
1620 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1621 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1622 return lp_build_lerp(bld, z, v0, v1, flags);
1623 }
1624
1625
1626 /**
1627 * Generate min(a, b)
1628 * Do checks for special cases, but not for NaNs.
1629 */
1630 LLVMValueRef
1631 lp_build_min(struct lp_build_context *bld,
1632 LLVMValueRef a,
1633 LLVMValueRef b)
1634 {
1635 assert(lp_check_value(bld->type, a));
1636 assert(lp_check_value(bld->type, b));
1637
1638 if(a == bld->undef || b == bld->undef)
1639 return bld->undef;
1640
1641 if(a == b)
1642 return a;
1643
1644 if (bld->type.norm) {
1645 if (!bld->type.sign) {
1646 if (a == bld->zero || b == bld->zero) {
1647 return bld->zero;
1648 }
1649 }
1650 if(a == bld->one)
1651 return b;
1652 if(b == bld->one)
1653 return a;
1654 }
1655
1656 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1657 }
1658
1659
1660 /**
1661 * Generate min(a, b)
1662 * NaNs are handled according to the behavior specified by the
1663 * nan_behavior argument.
1664 */
1665 LLVMValueRef
1666 lp_build_min_ext(struct lp_build_context *bld,
1667 LLVMValueRef a,
1668 LLVMValueRef b,
1669 enum gallivm_nan_behavior nan_behavior)
1670 {
1671 assert(lp_check_value(bld->type, a));
1672 assert(lp_check_value(bld->type, b));
1673
1674 if(a == bld->undef || b == bld->undef)
1675 return bld->undef;
1676
1677 if(a == b)
1678 return a;
1679
1680 if (bld->type.norm) {
1681 if (!bld->type.sign) {
1682 if (a == bld->zero || b == bld->zero) {
1683 return bld->zero;
1684 }
1685 }
1686 if(a == bld->one)
1687 return b;
1688 if(b == bld->one)
1689 return a;
1690 }
1691
1692 return lp_build_min_simple(bld, a, b, nan_behavior);
1693 }
1694
1695 /**
1696 * Generate max(a, b)
1697 * Do checks for special cases, but NaN behavior is undefined.
1698 */
1699 LLVMValueRef
1700 lp_build_max(struct lp_build_context *bld,
1701 LLVMValueRef a,
1702 LLVMValueRef b)
1703 {
1704 assert(lp_check_value(bld->type, a));
1705 assert(lp_check_value(bld->type, b));
1706
1707 if(a == bld->undef || b == bld->undef)
1708 return bld->undef;
1709
1710 if(a == b)
1711 return a;
1712
1713 if(bld->type.norm) {
1714 if(a == bld->one || b == bld->one)
1715 return bld->one;
1716 if (!bld->type.sign) {
1717 if (a == bld->zero) {
1718 return b;
1719 }
1720 if (b == bld->zero) {
1721 return a;
1722 }
1723 }
1724 }
1725
1726 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1727 }
1728
1729
1730 /**
1731 * Generate max(a, b)
1732 * Checks for special cases.
1733 * NaNs are handled according to the behavior specified by the
1734 * nan_behavior argument.
1735 */
1736 LLVMValueRef
1737 lp_build_max_ext(struct lp_build_context *bld,
1738 LLVMValueRef a,
1739 LLVMValueRef b,
1740 enum gallivm_nan_behavior nan_behavior)
1741 {
1742 assert(lp_check_value(bld->type, a));
1743 assert(lp_check_value(bld->type, b));
1744
1745 if(a == bld->undef || b == bld->undef)
1746 return bld->undef;
1747
1748 if(a == b)
1749 return a;
1750
1751 if(bld->type.norm) {
1752 if(a == bld->one || b == bld->one)
1753 return bld->one;
1754 if (!bld->type.sign) {
1755 if (a == bld->zero) {
1756 return b;
1757 }
1758 if (b == bld->zero) {
1759 return a;
1760 }
1761 }
1762 }
1763
1764 return lp_build_max_simple(bld, a, b, nan_behavior);
1765 }
1766
1767 /**
1768 * Generate clamp(a, min, max)
1769 * NaN behavior (for any of a, min, max) is undefined.
1770 * Do checks for special cases.
1771 */
1772 LLVMValueRef
1773 lp_build_clamp(struct lp_build_context *bld,
1774 LLVMValueRef a,
1775 LLVMValueRef min,
1776 LLVMValueRef max)
1777 {
1778 assert(lp_check_value(bld->type, a));
1779 assert(lp_check_value(bld->type, min));
1780 assert(lp_check_value(bld->type, max));
1781
1782 a = lp_build_min(bld, a, max);
1783 a = lp_build_max(bld, a, min);
1784 return a;
1785 }
1786
1787
1788 /**
1789 * Generate clamp(a, 0, 1)
1790 * A NaN will get converted to zero.
1791 */
1792 LLVMValueRef
1793 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1794 LLVMValueRef a)
1795 {
1796 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1797 a = lp_build_min(bld, a, bld->one);
1798 return a;
1799 }
1800
1801
1802 /**
1803 * Generate abs(a)
1804 */
1805 LLVMValueRef
1806 lp_build_abs(struct lp_build_context *bld,
1807 LLVMValueRef a)
1808 {
1809 LLVMBuilderRef builder = bld->gallivm->builder;
1810 const struct lp_type type = bld->type;
1811 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1812
1813 assert(lp_check_value(type, a));
1814
1815 if(!type.sign)
1816 return a;
1817
1818 if(type.floating) {
1819 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1820 /* Workaround llvm.org/PR27332 */
1821 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1822 unsigned long long absMask = ~(1ULL << (type.width - 1));
1823 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1824 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1825 a = LLVMBuildAnd(builder, a, mask, "");
1826 a = LLVMBuildBitCast(builder, a, vec_type, "");
1827 return a;
1828 } else {
1829 char intrinsic[32];
1830 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1831 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1832 }
1833 }
1834
1835 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1836 switch(type.width) {
1837 case 8:
1838 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1839 case 16:
1840 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1841 case 32:
1842 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1843 }
1844 }
1845 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1846 switch(type.width) {
1847 case 8:
1848 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1849 case 16:
1850 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1851 case 32:
1852 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1853 }
1854 }
1855
1856 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1857 a, LLVMBuildNeg(builder, a, ""));
1858 }
1859
1860
1861 LLVMValueRef
1862 lp_build_negate(struct lp_build_context *bld,
1863 LLVMValueRef a)
1864 {
1865 LLVMBuilderRef builder = bld->gallivm->builder;
1866
1867 assert(lp_check_value(bld->type, a));
1868
1869 if (bld->type.floating)
1870 a = LLVMBuildFNeg(builder, a, "");
1871 else
1872 a = LLVMBuildNeg(builder, a, "");
1873
1874 return a;
1875 }
1876
1877
1878 /** Return -1, 0 or +1 depending on the sign of a */
1879 LLVMValueRef
1880 lp_build_sgn(struct lp_build_context *bld,
1881 LLVMValueRef a)
1882 {
1883 LLVMBuilderRef builder = bld->gallivm->builder;
1884 const struct lp_type type = bld->type;
1885 LLVMValueRef cond;
1886 LLVMValueRef res;
1887
1888 assert(lp_check_value(type, a));
1889
1890 /* Handle non-zero case */
1891 if(!type.sign) {
1892 /* if not zero then sign must be positive */
1893 res = bld->one;
1894 }
1895 else if(type.floating) {
1896 LLVMTypeRef vec_type;
1897 LLVMTypeRef int_type;
1898 LLVMValueRef mask;
1899 LLVMValueRef sign;
1900 LLVMValueRef one;
1901 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1902
1903 int_type = lp_build_int_vec_type(bld->gallivm, type);
1904 vec_type = lp_build_vec_type(bld->gallivm, type);
1905 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1906
1907 /* OR the sign bit of 'a' into the bit pattern of the constant 1.0 to get +/-1.0 */
1908 sign = LLVMBuildBitCast(builder, a, int_type, "");
1909 sign = LLVMBuildAnd(builder, sign, mask, "");
1910 one = LLVMConstBitCast(bld->one, int_type);
1911 res = LLVMBuildOr(builder, sign, one, "");
1912 res = LLVMBuildBitCast(builder, res, vec_type, "");
1913 }
1914 else
1915 {
1916 /* signed int/norm/fixed point */
1917 /* could use psign with sse3 and appropriate vectors here */
1918 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1919 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1920 res = lp_build_select(bld, cond, bld->one, minus_one);
1921 }
1922
1923 /* Handle zero */
1924 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1925 res = lp_build_select(bld, cond, bld->zero, res);
1926
1927 return res;
1928 }
1929
1930
1931 /**
1932 * Set the sign of float vector 'a' according to 'sign'.
1933 * If sign==0, return abs(a).
1934 * If sign==1, return -abs(a);
1935 * Other values for sign produce undefined results.
1936 */
1937 LLVMValueRef
1938 lp_build_set_sign(struct lp_build_context *bld,
1939 LLVMValueRef a, LLVMValueRef sign)
1940 {
1941 LLVMBuilderRef builder = bld->gallivm->builder;
1942 const struct lp_type type = bld->type;
1943 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1944 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1945 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1946 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1947 ~((unsigned long long) 1 << (type.width - 1)));
1948 LLVMValueRef val, res;
1949
1950 assert(type.floating);
1951 assert(lp_check_value(type, a));
1952
1953 /* val = reinterpret_cast<int>(a) */
1954 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1955 /* val = val & mask */
1956 val = LLVMBuildAnd(builder, val, mask, "");
1957 /* sign = sign << shift */
1958 sign = LLVMBuildShl(builder, sign, shift, "");
1959 /* res = val | sign */
1960 res = LLVMBuildOr(builder, val, sign, "");
1961 /* res = reinterpret_cast<float>(res) */
1962 res = LLVMBuildBitCast(builder, res, vec_type, "");
1963
1964 return res;
1965 }
1966
1967
1968 /**
1969 * Convert vector of (or scalar) int to vector of (or scalar) float.
1970 */
1971 LLVMValueRef
1972 lp_build_int_to_float(struct lp_build_context *bld,
1973 LLVMValueRef a)
1974 {
1975 LLVMBuilderRef builder = bld->gallivm->builder;
1976 const struct lp_type type = bld->type;
1977 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1978
1979 assert(type.floating);
1980
1981 return LLVMBuildSIToFP(builder, a, vec_type, "");
1982 }
1983
1984 static boolean
1985 arch_rounding_available(const struct lp_type type)
1986 {
1987 if ((util_cpu_caps.has_sse4_1 &&
1988 (type.length == 1 || type.width*type.length == 128)) ||
1989 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
1990 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
1991 return TRUE;
1992 else if ((util_cpu_caps.has_altivec &&
1993 (type.width == 32 && type.length == 4)))
1994 return TRUE;
1995
1996 return FALSE;
1997 }
1998
1999 enum lp_build_round_mode
2000 {
2001 LP_BUILD_ROUND_NEAREST = 0,
2002 LP_BUILD_ROUND_FLOOR = 1,
2003 LP_BUILD_ROUND_CEIL = 2,
2004 LP_BUILD_ROUND_TRUNCATE = 3
2005 };
2006
2007 static inline LLVMValueRef
2008 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2009 LLVMValueRef a)
2010 {
2011 LLVMBuilderRef builder = bld->gallivm->builder;
2012 const struct lp_type type = bld->type;
2013 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2014 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2015 const char *intrinsic;
2016 LLVMValueRef res;
2017
2018 assert(type.floating);
2019 /* using the double precision conversions is a bit more complicated */
2020 assert(type.width == 32);
2021
2022 assert(lp_check_value(type, a));
2023 assert(util_cpu_caps.has_sse2);
2024
2025 /* This is relying on MXCSR rounding mode, which should always be nearest. */
2026 if (type.length == 1) {
2027 LLVMTypeRef vec_type;
2028 LLVMValueRef undef;
2029 LLVMValueRef arg;
2030 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2031
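/*
* The llvm.x86.sse.cvtss2si intrinsic operates on a full <4 x float>
* operand, so insert the scalar into element 0 of an undef vector first.
*/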
2032 vec_type = LLVMVectorType(bld->elem_type, 4);
2033
2034 intrinsic = "llvm.x86.sse.cvtss2si";
2035
2036 undef = LLVMGetUndef(vec_type);
2037
2038 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2039
2040 res = lp_build_intrinsic_unary(builder, intrinsic,
2041 ret_type, arg);
2042 }
2043 else {
2044 if (type.width* type.length == 128) {
2045 intrinsic = "llvm.x86.sse2.cvtps2dq";
2046 }
2047 else {
2048 assert(type.width*type.length == 256);
2049 assert(util_cpu_caps.has_avx);
2050
2051 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2052 }
2053 res = lp_build_intrinsic_unary(builder, intrinsic,
2054 ret_type, a);
2055 }
2056
2057 return res;
2058 }
2059
2060
2061 /* Round a float vector using the AltiVec vrfin/vrfim/vrfip/vrfiz intrinsics. */
2063 static inline LLVMValueRef
2064 lp_build_round_altivec(struct lp_build_context *bld,
2065 LLVMValueRef a,
2066 enum lp_build_round_mode mode)
2067 {
2068 LLVMBuilderRef builder = bld->gallivm->builder;
2069 const struct lp_type type = bld->type;
2070 const char *intrinsic = NULL;
2071
2072 assert(type.floating);
2073
2074 assert(lp_check_value(type, a));
2075 assert(util_cpu_caps.has_altivec);
2076
2077 (void)type;
2078
2079 switch (mode) {
2080 case LP_BUILD_ROUND_NEAREST:
2081 intrinsic = "llvm.ppc.altivec.vrfin";
2082 break;
2083 case LP_BUILD_ROUND_FLOOR:
2084 intrinsic = "llvm.ppc.altivec.vrfim";
2085 break;
2086 case LP_BUILD_ROUND_CEIL:
2087 intrinsic = "llvm.ppc.altivec.vrfip";
2088 break;
2089 case LP_BUILD_ROUND_TRUNCATE:
2090 intrinsic = "llvm.ppc.altivec.vrfiz";
2091 break;
2092 }
2093
2094 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2095 }
2096
2097 static inline LLVMValueRef
2098 lp_build_round_arch(struct lp_build_context *bld,
2099 LLVMValueRef a,
2100 enum lp_build_round_mode mode)
2101 {
2102 if (util_cpu_caps.has_sse4_1) {
2103 LLVMBuilderRef builder = bld->gallivm->builder;
2104 const struct lp_type type = bld->type;
2105 const char *intrinsic_root;
2106 char intrinsic[32];
2107
2108 assert(type.floating);
2109 assert(lp_check_value(type, a));
2110 (void)type;
2111
2112 switch (mode) {
2113 case LP_BUILD_ROUND_NEAREST:
2114 intrinsic_root = "llvm.nearbyint";
2115 break;
2116 case LP_BUILD_ROUND_FLOOR:
2117 intrinsic_root = "llvm.floor";
2118 break;
2119 case LP_BUILD_ROUND_CEIL:
2120 intrinsic_root = "llvm.ceil";
2121 break;
2122 case LP_BUILD_ROUND_TRUNCATE:
2123 intrinsic_root = "llvm.trunc";
2124 break;
2125 }
2126
2127 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2128 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2129 }
2130 else /* (util_cpu_caps.has_altivec) */
2131 return lp_build_round_altivec(bld, a, mode);
2132 }
2133
2134 /**
2135 * Return the integer part of a float (vector) value (== round toward zero).
2136 * The returned value is a float (vector).
2137 * Ex: trunc(-1.5) = -1.0
2138 */
2139 LLVMValueRef
2140 lp_build_trunc(struct lp_build_context *bld,
2141 LLVMValueRef a)
2142 {
2143 LLVMBuilderRef builder = bld->gallivm->builder;
2144 const struct lp_type type = bld->type;
2145
2146 assert(type.floating);
2147 assert(lp_check_value(type, a));
2148
2149 if (arch_rounding_available(type)) {
2150 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2151 }
2152 else {
2153 const struct lp_type type = bld->type;
2154 struct lp_type inttype;
2155 struct lp_build_context intbld;
2156 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2157 LLVMValueRef trunc, res, anosign, mask;
2158 LLVMTypeRef int_vec_type = bld->int_vec_type;
2159 LLVMTypeRef vec_type = bld->vec_type;
2160
2161 assert(type.width == 32); /* might want to handle doubles at some point */
2162
2163 inttype = type;
2164 inttype.floating = 0;
2165 lp_build_context_init(&intbld, bld->gallivm, inttype);
2166
2167 /* round by truncation */
2168 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2169 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2170
2171 /* mask out sign bit */
2172 anosign = lp_build_abs(bld, a);
2173 /*
2174 * mask out all values if anosign > 2^24
2175 * This should work both for large ints (all rounding is no-op for them
2176 * because such floats are always exact) as well as special cases like
2177 * NaNs, Infs (taking advantage of the fact they use max exponent).
2178 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2179 */
2180 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2181 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2182 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2183 return lp_build_select(bld, mask, a, res);
2184 }
2185 }
2186
2187
2188 /**
2189 * Return float (vector) rounded to nearest integer (vector). The returned
2190 * value is a float (vector).
2191 * Ex: round(0.9) = 1.0
2192 * Ex: round(-1.5) = -2.0
2193 */
2194 LLVMValueRef
2195 lp_build_round(struct lp_build_context *bld,
2196 LLVMValueRef a)
2197 {
2198 LLVMBuilderRef builder = bld->gallivm->builder;
2199 const struct lp_type type = bld->type;
2200
2201 assert(type.floating);
2202 assert(lp_check_value(type, a));
2203
2204 if (arch_rounding_available(type)) {
2205 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2206 }
2207 else {
2208 const struct lp_type type = bld->type;
2209 struct lp_type inttype;
2210 struct lp_build_context intbld;
2211 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2212 LLVMValueRef res, anosign, mask;
2213 LLVMTypeRef int_vec_type = bld->int_vec_type;
2214 LLVMTypeRef vec_type = bld->vec_type;
2215
2216 assert(type.width == 32); /* might want to handle doubles at some point */
2217
2218 inttype = type;
2219 inttype.floating = 0;
2220 lp_build_context_init(&intbld, bld->gallivm, inttype);
2221
2222 res = lp_build_iround(bld, a);
2223 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2224
2225 /* mask out sign bit */
2226 anosign = lp_build_abs(bld, a);
2227 /*
2228 * mask out all values if anosign > 2^24
2229 * This should work both for large ints (all rounding is no-op for them
2230 * because such floats are always exact) as well as special cases like
2231 * NaNs, Infs (taking advantage of the fact they use max exponent).
2232 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2233 */
2234 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2235 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2236 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2237 return lp_build_select(bld, mask, a, res);
2238 }
2239 }
2240
2241
2242 /**
2243 * Return floor of float (vector), result is a float (vector)
2244 * Ex: floor(1.1) = 1.0
2245 * Ex: floor(-1.1) = -2.0
2246 */
2247 LLVMValueRef
2248 lp_build_floor(struct lp_build_context *bld,
2249 LLVMValueRef a)
2250 {
2251 LLVMBuilderRef builder = bld->gallivm->builder;
2252 const struct lp_type type = bld->type;
2253
2254 assert(type.floating);
2255 assert(lp_check_value(type, a));
2256
2257 if (arch_rounding_available(type)) {
2258 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2259 }
2260 else {
2261 const struct lp_type type = bld->type;
2262 struct lp_type inttype;
2263 struct lp_build_context intbld;
2264 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2265 LLVMValueRef trunc, res, anosign, mask;
2266 LLVMTypeRef int_vec_type = bld->int_vec_type;
2267 LLVMTypeRef vec_type = bld->vec_type;
2268
2269 if (type.width != 32) {
2270 char intrinsic[32];
2271 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2272 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2273 }
2274
2275 assert(type.width == 32); /* might want to handle doubles at some point */
2276
2277 inttype = type;
2278 inttype.floating = 0;
2279 lp_build_context_init(&intbld, bld->gallivm, inttype);
2280
2281 /* round by truncation */
2282 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2283 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2284
2285 if (type.sign) {
2286 LLVMValueRef tmp;
2287
2288 /*
2289 * fix values if rounding is wrong (for non-special cases)
2290 * - this is the case if trunc > a
2291 */
2292 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2293 /* tmp = trunc > a ? 1.0 : 0.0 */
2294 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2295 tmp = lp_build_and(&intbld, mask, tmp);
2296 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2297 res = lp_build_sub(bld, res, tmp);
2298 }
2299
2300 /* mask out sign bit */
2301 anosign = lp_build_abs(bld, a);
2302 /*
2303 * mask out all values if anosign > 2^24
2304 * This should work both for large ints (all rounding is no-op for them
2305 * because such floats are always exact) as well as special cases like
2306 * NaNs, Infs (taking advantage of the fact they use max exponent).
2307 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2308 */
2309 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2310 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2311 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2312 return lp_build_select(bld, mask, a, res);
2313 }
2314 }
2315
2316
2317 /**
2318 * Return ceiling of float (vector), returning float (vector).
2319 * Ex: ceil( 1.1) = 2.0
2320 * Ex: ceil(-1.1) = -1.0
2321 */
2322 LLVMValueRef
2323 lp_build_ceil(struct lp_build_context *bld,
2324 LLVMValueRef a)
2325 {
2326 LLVMBuilderRef builder = bld->gallivm->builder;
2327 const struct lp_type type = bld->type;
2328
2329 assert(type.floating);
2330 assert(lp_check_value(type, a));
2331
2332 if (arch_rounding_available(type)) {
2333 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2334 }
2335 else {
2336 const struct lp_type type = bld->type;
2337 struct lp_type inttype;
2338 struct lp_build_context intbld;
2339 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2340 LLVMValueRef trunc, res, anosign, mask, tmp;
2341 LLVMTypeRef int_vec_type = bld->int_vec_type;
2342 LLVMTypeRef vec_type = bld->vec_type;
2343
2344 if (type.width != 32) {
2345 char intrinsic[32];
2346 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2347 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2348 }
2349
2350 assert(type.width == 32); /* might want to handle doubles at some point */
2351
2352 inttype = type;
2353 inttype.floating = 0;
2354 lp_build_context_init(&intbld, bld->gallivm, inttype);
2355
2356 /* round by truncation */
2357 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2358 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2359
2360 /*
2361 * fix values if rounding is wrong (for non-special cases)
2362 * - this is the case if trunc < a
2363 */
2364 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2365 /* tmp = trunc < a ? 1.0 : 0.0 */
2366 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2367 tmp = lp_build_and(&intbld, mask, tmp);
2368 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2369 res = lp_build_add(bld, trunc, tmp);
2370
2371 /* mask out sign bit */
2372 anosign = lp_build_abs(bld, a);
2373 /*
2374 * mask out all values if anosign > 2^24
2375 * This should work both for large ints (all rounding is no-op for them
2376 * because such floats are always exact) as well as special cases like
2377 * NaNs, Infs (taking advantage of the fact they use max exponent).
2378 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2379 */
2380 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2381 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2382 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2383 return lp_build_select(bld, mask, a, res);
2384 }
2385 }
2386
2387
2388 /**
2389 * Return fractional part of 'a' computed as a - floor(a)
2390 * Typically used in texture coord arithmetic.
2391 */
2392 LLVMValueRef
2393 lp_build_fract(struct lp_build_context *bld,
2394 LLVMValueRef a)
2395 {
2396 assert(bld->type.floating);
2397 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2398 }
2399
2400
2401 /**
2402 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2403 * against 0.99999(9). (Will also return that value for NaNs.)
2404 */
2405 static inline LLVMValueRef
2406 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2407 {
2408 LLVMValueRef max;
2409
2410 /* this is the largest number smaller than 1.0 representable as float */
2411 max = lp_build_const_vec(bld->gallivm, bld->type,
2412 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2413 return lp_build_min_ext(bld, fract, max,
2414 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2415 }
2416
2417
2418 /**
2419 * Same as lp_build_fract, but guarantees that the result is always smaller
2420 * than one. Will also return the smaller-than-one value for infs, NaNs.
2421 */
2422 LLVMValueRef
2423 lp_build_fract_safe(struct lp_build_context *bld,
2424 LLVMValueRef a)
2425 {
2426 return clamp_fract(bld, lp_build_fract(bld, a));
2427 }
2428
2429
2430 /**
2431 * Return the integer part of a float (vector) value (== round toward zero).
2432 * The returned value is an integer (vector).
2433 * Ex: itrunc(-1.5) = -1
2434 */
2435 LLVMValueRef
2436 lp_build_itrunc(struct lp_build_context *bld,
2437 LLVMValueRef a)
2438 {
2439 LLVMBuilderRef builder = bld->gallivm->builder;
2440 const struct lp_type type = bld->type;
2441 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2442
2443 assert(type.floating);
2444 assert(lp_check_value(type, a));
2445
2446 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2447 }
2448
2449
2450 /**
2451 * Return float (vector) rounded to nearest integer (vector). The returned
2452 * value is an integer (vector).
2453 * Ex: iround(0.9) = 1
2454 * Ex: iround(-1.5) = -2
2455 */
2456 LLVMValueRef
2457 lp_build_iround(struct lp_build_context *bld,
2458 LLVMValueRef a)
2459 {
2460 LLVMBuilderRef builder = bld->gallivm->builder;
2461 const struct lp_type type = bld->type;
2462 LLVMTypeRef int_vec_type = bld->int_vec_type;
2463 LLVMValueRef res;
2464
2465 assert(type.floating);
2466
2467 assert(lp_check_value(type, a));
2468
2469 if ((util_cpu_caps.has_sse2 &&
2470 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2471 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2472 return lp_build_iround_nearest_sse2(bld, a);
2473 }
2474 if (arch_rounding_available(type)) {
2475 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2476 }
2477 else {
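/*
* Fallback: add +/-0.5 (carrying the sign of 'a') and rely on the
* truncating fp-to-int conversion below to round to nearest.
*/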
2478 LLVMValueRef half;
2479
2480 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2481
2482 if (type.sign) {
2483 LLVMTypeRef vec_type = bld->vec_type;
2484 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2485 (unsigned long long)1 << (type.width - 1));
2486 LLVMValueRef sign;
2487
2488 /* get sign bit */
2489 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2490 sign = LLVMBuildAnd(builder, sign, mask, "");
2491
2492 /* sign * 0.5 */
2493 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2494 half = LLVMBuildOr(builder, sign, half, "");
2495 half = LLVMBuildBitCast(builder, half, vec_type, "");
2496 }
2497
2498 res = LLVMBuildFAdd(builder, a, half, "");
2499 }
2500
2501 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2502
2503 return res;
2504 }
2505
2506
2507 /**
2508 * Return floor of float (vector), result is an int (vector)
2509 * Ex: ifloor(1.1) = 1
2510 * Ex: ifloor(-1.1) = -2
2511 */
2512 LLVMValueRef
2513 lp_build_ifloor(struct lp_build_context *bld,
2514 LLVMValueRef a)
2515 {
2516 LLVMBuilderRef builder = bld->gallivm->builder;
2517 const struct lp_type type = bld->type;
2518 LLVMTypeRef int_vec_type = bld->int_vec_type;
2519 LLVMValueRef res;
2520
2521 assert(type.floating);
2522 assert(lp_check_value(type, a));
2523
2524 res = a;
2525 if (type.sign) {
2526 if (arch_rounding_available(type)) {
2527 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2528 }
2529 else {
2530 struct lp_type inttype;
2531 struct lp_build_context intbld;
2532 LLVMValueRef trunc, itrunc, mask;
2533
2534 assert(type.floating);
2535 assert(lp_check_value(type, a));
2536
2537 inttype = type;
2538 inttype.floating = 0;
2539 lp_build_context_init(&intbld, bld->gallivm, inttype);
2540
2541 /* round by truncation */
2542 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2543 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2544
2545 /*
2546 * fix values if rounding is wrong (for non-special cases)
2547 * - this is the case if trunc > a
2548 * The results of doing this with NaNs, very large values etc.
2549 * are undefined but this seems to be the case anyway.
2550 */
2551 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2552 /* the mask is ~0 (i.e. -1) where trunc > a, so adding it subtracts one */
2553 return lp_build_add(&intbld, itrunc, mask);
2554 }
2555 }
2556
2557 /* convert to int, truncating toward zero */
2558 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2559
2560 return res;
2561 }
2562
2563
2564 /**
2565 * Return ceiling of float (vector), returning int (vector).
2566 * Ex: iceil( 1.1) = 2
2567 * Ex: iceil(-1.1) = -1
2568 */
2569 LLVMValueRef
2570 lp_build_iceil(struct lp_build_context *bld,
2571 LLVMValueRef a)
2572 {
2573 LLVMBuilderRef builder = bld->gallivm->builder;
2574 const struct lp_type type = bld->type;
2575 LLVMTypeRef int_vec_type = bld->int_vec_type;
2576 LLVMValueRef res;
2577
2578 assert(type.floating);
2579 assert(lp_check_value(type, a));
2580
2581 if (arch_rounding_available(type)) {
2582 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2583 }
2584 else {
2585 struct lp_type inttype;
2586 struct lp_build_context intbld;
2587 LLVMValueRef trunc, itrunc, mask;
2588
2589 assert(type.floating);
2590 assert(lp_check_value(type, a));
2591
2592 inttype = type;
2593 inttype.floating = 0;
2594 lp_build_context_init(&intbld, bld->gallivm, inttype);
2595
2596 /* round by truncation */
2597 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2598 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2599
2600 /*
2601 * fix values if rounding is wrong (for non-special cases)
2602 * - this is the case if trunc < a
2603 * The results of doing this with NaNs, very large values etc.
2604 * are undefined but this seems to be the case anyway.
2605 */
2606 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2607 /* the mask is ~0 (i.e. -1) where trunc < a, so subtracting it adds one */
2608 return lp_build_sub(&intbld, itrunc, mask);
2609 }
2610
2611 /* convert to int, truncating toward zero */
2612 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2613
2614 return res;
2615 }
2616
2617
2618 /**
2619 * Combined ifloor() & fract().
2620 *
2621 * Preferable to calling the functions separately, as it ensures that the
2622 * strategy (floor() vs. ifloor()) with the least redundant work is used.
2623 */
2624 void
2625 lp_build_ifloor_fract(struct lp_build_context *bld,
2626 LLVMValueRef a,
2627 LLVMValueRef *out_ipart,
2628 LLVMValueRef *out_fpart)
2629 {
2630 LLVMBuilderRef builder = bld->gallivm->builder;
2631 const struct lp_type type = bld->type;
2632 LLVMValueRef ipart;
2633
2634 assert(type.floating);
2635 assert(lp_check_value(type, a));
2636
2637 if (arch_rounding_available(type)) {
2638 /*
2639 * floor() is easier.
2640 */
2641
2642 ipart = lp_build_floor(bld, a);
2643 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2644 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2645 }
2646 else {
2647 /*
2648 * ifloor() is easier.
2649 */
2650
2651 *out_ipart = lp_build_ifloor(bld, a);
2652 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2653 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2654 }
2655 }
2656
2657
2658 /**
2659 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2660 * always smaller than one.
2661 */
2662 void
2663 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2664 LLVMValueRef a,
2665 LLVMValueRef *out_ipart,
2666 LLVMValueRef *out_fpart)
2667 {
2668 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2669 *out_fpart = clamp_fract(bld, *out_fpart);
2670 }
2671
2672
2673 LLVMValueRef
2674 lp_build_sqrt(struct lp_build_context *bld,
2675 LLVMValueRef a)
2676 {
2677 LLVMBuilderRef builder = bld->gallivm->builder;
2678 const struct lp_type type = bld->type;
2679 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2680 char intrinsic[32];
2681
2682 assert(lp_check_value(type, a));
2683
2684 assert(type.floating);
2685 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2686
2687 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2688 }
2689
2690
2691 /**
2692 * Do one Newton-Raphson step to improve reciprocal precision:
2693 *
2694 * x_{i+1} = x_i * (2 - a * x_i)
2695 *
2696 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2697 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2698 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2699 * halo. It would be necessary to clamp the argument to prevent this.
2700 *
2701 * See also:
2702 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2703 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2704 */
2705 static inline LLVMValueRef
2706 lp_build_rcp_refine(struct lp_build_context *bld,
2707 LLVMValueRef a,
2708 LLVMValueRef rcp_a)
2709 {
2710 LLVMBuilderRef builder = bld->gallivm->builder;
2711 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2712 LLVMValueRef res;
2713
2714 res = LLVMBuildFMul(builder, a, rcp_a, "");
2715 res = LLVMBuildFSub(builder, two, res, "");
2716 res = LLVMBuildFMul(builder, rcp_a, res, "");
2717
2718 return res;
2719 }
2720
2721
2722 LLVMValueRef
2723 lp_build_rcp(struct lp_build_context *bld,
2724 LLVMValueRef a)
2725 {
2726 LLVMBuilderRef builder = bld->gallivm->builder;
2727 const struct lp_type type = bld->type;
2728
2729 assert(lp_check_value(type, a));
2730
2731 if(a == bld->zero)
2732 return bld->undef;
2733 if(a == bld->one)
2734 return bld->one;
2735 if(a == bld->undef)
2736 return bld->undef;
2737
2738 assert(type.floating);
2739
2740 if(LLVMIsConstant(a))
2741 return LLVMConstFDiv(bld->one, a);
2742
2743 /*
2744 * We don't use RCPPS because:
2745 * - it only has 10 bits of precision
2746 * - it doesn't even get the reciprocal of 1.0 exactly
2747 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2748 * - for recent processors the benefit over DIVPS is marginal and case
2749 * dependent
2750 *
2751 * We could still use it on certain processors if benchmarks show that the
2752 * RCPPS plus the necessary workarounds are still preferable to DIVPS; or for
2753 * particular uses that require fewer workarounds.
2754 */
2755
2756 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2757 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2758 const unsigned num_iterations = 0;
2759 LLVMValueRef res;
2760 unsigned i;
2761 const char *intrinsic = NULL;
2762
2763 if (type.length == 4) {
2764 intrinsic = "llvm.x86.sse.rcp.ps";
2765 }
2766 else {
2767 intrinsic = "llvm.x86.avx.rcp.ps.256";
2768 }
2769
2770 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2771
2772 for (i = 0; i < num_iterations; ++i) {
2773 res = lp_build_rcp_refine(bld, a, res);
2774 }
2775
2776 return res;
2777 }
2778
2779 return LLVMBuildFDiv(builder, bld->one, a, "");
2780 }
2781
2782
2783 /**
2784 * Do one Newton-Raphson step to improve rsqrt precision:
2785 *
2786 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2787 *
2788 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2789 */
2790 static inline LLVMValueRef
2791 lp_build_rsqrt_refine(struct lp_build_context *bld,
2792 LLVMValueRef a,
2793 LLVMValueRef rsqrt_a)
2794 {
2795 LLVMBuilderRef builder = bld->gallivm->builder;
2796 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2797 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2798 LLVMValueRef res;
2799
2800 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2801 res = LLVMBuildFMul(builder, a, res, "");
2802 res = LLVMBuildFSub(builder, three, res, "");
2803 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2804 res = LLVMBuildFMul(builder, half, res, "");
2805
2806 return res;
2807 }
2808
2809
2810 /**
2811 * Generate 1/sqrt(a).
2812 * Result is undefined for values < 0, infinity for +0.
2813 */
2814 LLVMValueRef
2815 lp_build_rsqrt(struct lp_build_context *bld,
2816 LLVMValueRef a)
2817 {
2818 const struct lp_type type = bld->type;
2819
2820 assert(lp_check_value(type, a));
2821
2822 assert(type.floating);
2823
2824 /*
2825 * This should be faster but all denormals will end up as infinity.
2826 */
2827 if (0 && lp_build_fast_rsqrt_available(type)) {
2828 const unsigned num_iterations = 1;
2829 LLVMValueRef res;
2830 unsigned i;
2831
2832 /* rsqrt(1.0) != 1.0 here */
2833 res = lp_build_fast_rsqrt(bld, a);
2834
2835 if (num_iterations) {
2836 /*
2837 * Newton-Raphson will result in NaN instead of infinity for zero,
2838 * and NaN instead of zero for infinity.
2839 * Also, need to ensure rsqrt(1.0) == 1.0.
2840 * All numbers smaller than FLT_MIN will result in +infinity
2841 * (rsqrtps treats all denormals as zero).
2842 */
2843 LLVMValueRef cmp;
2844 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2845 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2846
2847 for (i = 0; i < num_iterations; ++i) {
2848 res = lp_build_rsqrt_refine(bld, a, res);
2849 }
2850 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2851 res = lp_build_select(bld, cmp, inf, res);
2852 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2853 res = lp_build_select(bld, cmp, bld->zero, res);
2854 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2855 res = lp_build_select(bld, cmp, bld->one, res);
2856 }
2857
2858 return res;
2859 }
2860
2861 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2862 }
2863
2864 /**
2865 * Report whether a fast (but inaccurate) rsqrt instruction is available.
2866 * Callers may want to avoid rsqrt_fast() when it is not: x^0.5 can be
2867 * computed as rsqrt_fast(x) * x, but without hardware support that
2868 * expands to sqrt/div/mul, in which case calling sqrt directly (skipping
2869 * both the div and the mul) is clearly better.
2870 */
2871 boolean
2872 lp_build_fast_rsqrt_available(struct lp_type type)
2873 {
2874 assert(type.floating);
2875
2876 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2877 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2878 return true;
2879 }
2880 return false;
2881 }
2882
2883
2884 /**
2885 * Generate 1/sqrt(a).
2886 * Result is undefined for values < 0, infinity for +0.
2887 * Precision is limited, only ~10 bits guaranteed
2888 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2889 */
2890 LLVMValueRef
2891 lp_build_fast_rsqrt(struct lp_build_context *bld,
2892 LLVMValueRef a)
2893 {
2894 LLVMBuilderRef builder = bld->gallivm->builder;
2895 const struct lp_type type = bld->type;
2896
2897 assert(lp_check_value(type, a));
2898
2899 if (lp_build_fast_rsqrt_available(type)) {
2900 const char *intrinsic = NULL;
2901
2902 if (type.length == 4) {
2903 intrinsic = "llvm.x86.sse.rsqrt.ps";
2904 }
2905 else {
2906 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2907 }
2908 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2909 }
2910 else {
2911 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2912 }
2913 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2914 }
2915
2916
2917 /**
2918 * Generate sin(a) or cos(a) using polynomial approximation.
2919 * TODO: it might be worth recognizing when sin and cos share the same
2920 * source (i.e. the d3d10 sincos opcode); computing both at the same time
2921 * would be far cheaper than calculating (nearly) everything twice.
2922 * It is unclear whether that is common enough to be worth the trouble,
2923 * but the scs opcode could also benefit from computing both.
2924 */
2925 static LLVMValueRef
2926 lp_build_sin_or_cos(struct lp_build_context *bld,
2927 LLVMValueRef a,
2928 boolean cos)
2929 {
2930 struct gallivm_state *gallivm = bld->gallivm;
2931 LLVMBuilderRef b = gallivm->builder;
2932 struct lp_type int_type = lp_int_type(bld->type);
2933
2934 /*
2935 * take the absolute value,
2936 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2937 */
2938
2939 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2940 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2941
2942 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2943 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2944
2945 /*
2946 * scale by 4/Pi
2947 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2948 */
2949
2950 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2951 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2952
2953 /*
2954 * store the integer part of y in mm0
2955 * emm2 = _mm_cvttps_epi32(y);
2956 */
2957
2958 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2959
2960 /*
2961 * j=(j+1) & (~1) (see the cephes sources)
2962 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2963 */
2964
2965 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2966 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2967 /*
2968 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2969 */
2970 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2971 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2972
2973 /*
2974 * y = _mm_cvtepi32_ps(emm2);
2975 */
2976 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2977
2978 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2979 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2980 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2981 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2982
2983 /*
2984 * Argument used for poly selection and sign bit determination
2985 * is different for sin vs. cos.
2986 */
2987 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2988 emm2_and;
2989
2990 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2991 LLVMBuildNot(b, emm2_2, ""), ""),
2992 const_29, "sign_bit") :
2993 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2994 LLVMBuildShl(b, emm2_add,
2995 const_29, ""), ""),
2996 sign_mask, "sign_bit");
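/*
* sign_bit now holds only the sign that will be XOR'ed into the result:
* for cos it comes from bit 2 of the inverted octant index, for sin it
* combines the sign of 'a' with bit 2 of the octant index (both shifted
* up into the float sign-bit position).
*/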
2997
2998 /*
2999 * get the polynomial selection mask
3000 * there is one polynomial for 0 <= x <= Pi/4
3001 * and another one for Pi/4 < x <= Pi/2
3002 * Both branches will be computed.
3003 *
3004 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3005 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3006 */
3007
3008 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3009 LLVMValueRef poly_mask = lp_build_compare(gallivm,
3010 int_type, PIPE_FUNC_EQUAL,
3011 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3012
3013 /*
3014 * _PS_CONST(minus_cephes_DP1, -0.78515625);
3015 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3016 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3017 */
3018 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3019 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3020 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3021
3022 /*
3023 * The magic pass: "Extended precision modular arithmetic"
3024 * x = ((x - y * DP1) - y * DP2) - y * DP3;
3025 */
3026 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3027 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3028 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3029
3030 /*
3031 * Evaluate the first polynomial (0 <= x <= Pi/4)
3032 *
3033 * z = _mm_mul_ps(x,x);
3034 */
3035 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3036
3037 /*
3038 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3039 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3040 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3041 */
3042 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3043 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3044 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3045
3046 /*
3047 * y = *(v4sf*)_ps_coscof_p0;
3048 * y = _mm_mul_ps(y, z);
3049 */
3050 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3051 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3052 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3053 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3054
3055
3056 /*
3057 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3058 * y = _mm_sub_ps(y, tmp);
3059 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3060 */
3061 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3062 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3063 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3064 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3065 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3066
3067 /*
3068 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3069 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3070 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3071 */
3072 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3073 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3074 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3075
3076 /*
3077 * Evaluate the second polynomial
3078 *
3079 * y2 = *(v4sf*)_ps_sincof_p0;
3080 * y2 = _mm_mul_ps(y2, z);
3081 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3082 * y2 = _mm_mul_ps(y2, z);
3083 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3084 * y2 = _mm_mul_ps(y2, z);
3085 * y2 = _mm_mul_ps(y2, x);
3086 * y2 = _mm_add_ps(y2, x);
3087 */
3088
3089 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3090 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3091 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3092 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3093
3094 /*
3095 * select the correct result from the two polynomials
3096 * xmm3 = poly_mask;
3097 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3098 * y = _mm_andnot_ps(xmm3, y);
3099 * y = _mm_or_ps(y,y2);
3100 */
3101 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3102 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3103 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3104 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3105 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3106 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3107
3108 /*
3109 * update the sign
3110 * y = _mm_xor_ps(y, sign_bit);
3111 */
3112 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3113 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3114
3115 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3116
3117 /* clamp output to be within [-1, 1] */
3118 y_result = lp_build_clamp(bld, y_result,
3119 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3120 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3121 /* If a is -inf, inf or NaN then return NaN */
3122 y_result = lp_build_select(bld, isfinite, y_result,
3123 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3124 return y_result;
3125 }
3126
3127
3128 /**
3129 * Generate sin(a)
3130 */
3131 LLVMValueRef
3132 lp_build_sin(struct lp_build_context *bld,
3133 LLVMValueRef a)
3134 {
3135 return lp_build_sin_or_cos(bld, a, FALSE);
3136 }
3137
3138
3139 /**
3140 * Generate cos(a)
3141 */
3142 LLVMValueRef
3143 lp_build_cos(struct lp_build_context *bld,
3144 LLVMValueRef a)
3145 {
3146 return lp_build_sin_or_cos(bld, a, TRUE);
3147 }
3148
3149
3150 /**
3151 * Generate pow(x, y)
3152 */
3153 LLVMValueRef
3154 lp_build_pow(struct lp_build_context *bld,
3155 LLVMValueRef x,
3156 LLVMValueRef y)
3157 {
3158 /* TODO: optimize the constant case */
3159 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3160 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3161 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3162 __FUNCTION__);
3163 }
3164
3165 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3166 }
3167
3168
3169 /**
3170 * Generate exp(x)
3171 */
3172 LLVMValueRef
3173 lp_build_exp(struct lp_build_context *bld,
3174 LLVMValueRef x)
3175 {
3176 /* log2(e) = 1/log(2) */
3177 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3178 1.4426950408889634);
3179
3180 assert(lp_check_value(bld->type, x));
3181
3182 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3183 }
3184
3185
3186 /**
3187 * Generate log(x)
3188 * Behavior is undefined with infs, 0s and nans
3189 */
3190 LLVMValueRef
3191 lp_build_log(struct lp_build_context *bld,
3192 LLVMValueRef x)
3193 {
3194 /* log(2) */
3195 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3196 0.69314718055994529);
3197
3198 assert(lp_check_value(bld->type, x));
3199
3200 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3201 }
3202
3203 /**
3204 * Generate log(x) that handles edge cases (infs, 0s and nans)
3205 */
3206 LLVMValueRef
3207 lp_build_log_safe(struct lp_build_context *bld,
3208 LLVMValueRef x)
3209 {
3210 /* log(2) */
3211 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3212 0.69314718055994529);
3213
3214 assert(lp_check_value(bld->type, x));
3215
3216 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3217 }
3218
3219
3220 /**
3221 * Generate polynomial.
3222 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3223 */
3224 LLVMValueRef
3225 lp_build_polynomial(struct lp_build_context *bld,
3226 LLVMValueRef x,
3227 const double *coeffs,
3228 unsigned num_coeffs)
3229 {
3230 const struct lp_type type = bld->type;
3231 LLVMValueRef even = NULL, odd = NULL;
3232 LLVMValueRef x2;
3233 unsigned i;
3234
3235 assert(lp_check_value(bld->type, x));
3236
3237 /* TODO: optimize the constant case */
3238 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3239 LLVMIsConstant(x)) {
3240 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3241 __FUNCTION__);
3242 }
3243
3244 /*
3245 * Calculate odd and even terms separately to decrease data dependency
3246 * Ex:
3247 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3248 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3249 */
3250 x2 = lp_build_mul(bld, x, x);
3251
3252 for (i = num_coeffs; i--; ) {
3253 LLVMValueRef coeff;
3254
3255 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3256
3257 if (i % 2 == 0) {
3258 if (even)
3259 even = lp_build_mad(bld, x2, even, coeff);
3260 else
3261 even = coeff;
3262 } else {
3263 if (odd)
3264 odd = lp_build_mad(bld, x2, odd, coeff);
3265 else
3266 odd = coeff;
3267 }
3268 }
3269
3270 if (odd)
3271 return lp_build_mad(bld, odd, x, even);
3272 else if (even)
3273 return even;
3274 else
3275 return bld->undef;
3276 }
3277
3278
3279 /**
3280 * Minimax polynomial fit of 2**x, in range [0, 1[
3281 */
3282 const double lp_build_exp2_polynomial[] = {
3283 #if EXP_POLY_DEGREE == 5
3284 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3285 0.693153073200168932794,
3286 0.240153617044375388211,
3287 0.0558263180532956664775,
3288 0.00898934009049466391101,
3289 0.00187757667519147912699
3290 #elif EXP_POLY_DEGREE == 4
3291 1.00000259337069434683,
3292 0.693003834469974940458,
3293 0.24144275689150793076,
3294 0.0520114606103070150235,
3295 0.0135341679161270268764
3296 #elif EXP_POLY_DEGREE == 3
3297 0.999925218562710312959,
3298 0.695833540494823811697,
3299 0.226067155427249155588,
3300 0.0780245226406372992967
3301 #elif EXP_POLY_DEGREE == 2
3302 1.00172476321474503578,
3303 0.657636275736077639316,
3304 0.33718943461968720704
3305 #else
3306 #error
3307 #endif
3308 };
3309
3310
3311 LLVMValueRef
3312 lp_build_exp2(struct lp_build_context *bld,
3313 LLVMValueRef x)
3314 {
3315 LLVMBuilderRef builder = bld->gallivm->builder;
3316 const struct lp_type type = bld->type;
3317 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3318 LLVMValueRef ipart = NULL;
3319 LLVMValueRef fpart = NULL;
3320 LLVMValueRef expipart = NULL;
3321 LLVMValueRef expfpart = NULL;
3322 LLVMValueRef res = NULL;
3323
3324 assert(lp_check_value(bld->type, x));
3325
3326 /* TODO: optimize the constant case */
3327 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3328 LLVMIsConstant(x)) {
3329 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3330 __FUNCTION__);
3331 }
3332
3333 assert(type.floating && type.width == 32);
3334
3335 /* We want to preserve NaN and make sure that for exp2, if x > 128
3336 * the result is INF and if it is smaller than -126.9 the result is 0. */
3337 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3338 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3339 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3340 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3341
3342 /* ipart = floor(x) */
3343 /* fpart = x - ipart */
3344 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3345
3346 /* expipart = (float) (1 << ipart) */
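/*
* Build 2^ipart directly in IEEE-754 single precision by biasing the
* integer exponent with 127 and shifting it into the exponent field
* (bits 30:23).
*/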
3347 expipart = LLVMBuildAdd(builder, ipart,
3348 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3349 expipart = LLVMBuildShl(builder, expipart,
3350 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3351 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3352
3353 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3354 ARRAY_SIZE(lp_build_exp2_polynomial));
3355
3356 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3357
3358 return res;
3359 }
3360
3361
3362
3363 /**
3364 * Extract the exponent of an IEEE-754 floating point value.
3365 *
3366 * Optionally apply an integer bias.
3367 *
3368 * Result is an integer value with
3369 *
3370 * ifloor(log2(x)) + bias
3371 */
3372 LLVMValueRef
3373 lp_build_extract_exponent(struct lp_build_context *bld,
3374 LLVMValueRef x,
3375 int bias)
3376 {
3377 LLVMBuilderRef builder = bld->gallivm->builder;
3378 const struct lp_type type = bld->type;
3379 unsigned mantissa = lp_mantissa(type);
3380 LLVMValueRef res;
3381
3382 assert(type.floating);
3383
3384 assert(lp_check_value(bld->type, x));
3385
3386 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3387
3388 res = LLVMBuildLShr(builder, x,
3389 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3390 res = LLVMBuildAnd(builder, res,
3391 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3392 res = LLVMBuildSub(builder, res,
3393 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3394
3395 return res;
3396 }
3397
3398
3399 /**
3400 * Extract the mantissa of a floating point value.
3401 *
3402 * Result is a floating point value with
3403 *
3404 * x / 2**floor(log2(x))
3405 */
3406 LLVMValueRef
3407 lp_build_extract_mantissa(struct lp_build_context *bld,
3408 LLVMValueRef x)
3409 {
3410 LLVMBuilderRef builder = bld->gallivm->builder;
3411 const struct lp_type type = bld->type;
3412 unsigned mantissa = lp_mantissa(type);
3413 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3414 (1ULL << mantissa) - 1);
3415 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3416 LLVMValueRef res;
3417
3418 assert(lp_check_value(bld->type, x));
3419
3420 assert(type.floating);
3421
3422 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3423
3424 /* res = x / 2**ipart */
3425 res = LLVMBuildAnd(builder, x, mantmask, "");
3426 res = LLVMBuildOr(builder, res, one, "");
3427 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3428
3429 return res;
3430 }
3431
3432
3433
3434 /**
3435 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3436 * These coefficients can be generated with
3437 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3438 */
3439 const double lp_build_log2_polynomial[] = {
3440 #if LOG_POLY_DEGREE == 5
3441 2.88539008148777786488L,
3442 0.961796878841293367824L,
3443 0.577058946784739859012L,
3444 0.412914355135828735411L,
3445 0.308591899232910175289L,
3446 0.352376952300281371868L,
3447 #elif LOG_POLY_DEGREE == 4
3448 2.88539009343309178325L,
3449 0.961791550404184197881L,
3450 0.577440339438736392009L,
3451 0.403343858251329912514L,
3452 0.406718052498846252698L,
3453 #elif LOG_POLY_DEGREE == 3
3454 2.88538959748872753838L,
3455 0.961932915889597772928L,
3456 0.571118517972136195241L,
3457 0.493997535084709500285L,
3458 #else
3459 #error
3460 #endif
3461 };
3462
3463 /**
3464 * See http://www.devmaster.net/forums/showthread.php?p=43580
3465 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3466 * http://www.nezumi.demon.co.uk/consult/logx.htm
3467 *
3468 * If handle_edge_cases is true the function will perform computations
3469 * to match the required D3D10+ behavior for each of the edge cases.
3470 * That means that if input is:
3471 * - less than zero (down to and including -inf), then NaN will be returned
3472 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3473 * - +infinity, then +infinity will be returned
3474 * - NaN, then NaN will be returned
3475 *
3476 * Those checks are fairly expensive so if you don't need them make sure
3477 * handle_edge_cases is false.
3478 */
3479 void
3480 lp_build_log2_approx(struct lp_build_context *bld,
3481 LLVMValueRef x,
3482 LLVMValueRef *p_exp,
3483 LLVMValueRef *p_floor_log2,
3484 LLVMValueRef *p_log2,
3485 boolean handle_edge_cases)
3486 {
3487 LLVMBuilderRef builder = bld->gallivm->builder;
3488 const struct lp_type type = bld->type;
3489 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3490 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3491
3492 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3493 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3494 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3495
3496 LLVMValueRef i = NULL;
3497 LLVMValueRef y = NULL;
3498 LLVMValueRef z = NULL;
3499 LLVMValueRef exp = NULL;
3500 LLVMValueRef mant = NULL;
3501 LLVMValueRef logexp = NULL;
3502 LLVMValueRef p_z = NULL;
3503 LLVMValueRef res = NULL;
3504
3505 assert(lp_check_value(bld->type, x));
3506
3507 if(p_exp || p_floor_log2 || p_log2) {
3508 /* TODO: optimize the constant case */
3509 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3510 LLVMIsConstant(x)) {
3511 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3512 __FUNCTION__);
3513 }
3514
3515 assert(type.floating && type.width == 32);
3516
3517 /*
3518 * We don't explicitly handle denormalized numbers. They will yield a
3519 * result in the neighbourhood of -127, which appears to be
3520 * adequate.
3521 */
3522
3523 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3524
3525 /* exp = (float) exponent(x) */
3526 exp = LLVMBuildAnd(builder, i, expmask, "");
3527 }
3528
3529 if(p_floor_log2 || p_log2) {
3530 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3531 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3532 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3533 }
3534
3535 if (p_log2) {
3536 /* mant = 1 + (float) mantissa(x) */
3537 mant = LLVMBuildAnd(builder, i, mantmask, "");
3538 mant = LLVMBuildOr(builder, mant, one, "");
3539 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3540
3541 /* y = (mant - 1) / (mant + 1) */
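/*
* log2 of the mantissa is evaluated via the identity
* log2(m) = 2/ln(2) * atanh((m - 1)/(m + 1)); the minimax polynomial in
* z = y^2 below approximates that series.
*/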
3542 y = lp_build_div(bld,
3543 lp_build_sub(bld, mant, bld->one),
3544 lp_build_add(bld, mant, bld->one)
3545 );
3546
3547 /* z = y^2 */
3548 z = lp_build_mul(bld, y, y);
3549
3550 /* compute P(z) */
3551 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3552 ARRAY_SIZE(lp_build_log2_polynomial));
3553
3554 /* y * P(z) + logexp */
3555 res = lp_build_mad(bld, y, p_z, logexp);
3556
3557 if (type.floating && handle_edge_cases) {
3558 LLVMValueRef negmask, infmask, zmask;
3559 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3560 lp_build_const_vec(bld->gallivm, type, 0.0f));
3561 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3562 lp_build_const_vec(bld->gallivm, type, 0.0f));
3563 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3564 lp_build_const_vec(bld->gallivm, type, INFINITY));
3565
3566 /* If x is equal to inf make sure we return inf */
3567 res = lp_build_select(bld, infmask,
3568 lp_build_const_vec(bld->gallivm, type, INFINITY),
3569 res);
3570 /* If x is equal to 0, return -inf */
3571 res = lp_build_select(bld, zmask,
3572 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3573 res);
3574 /* If x is nan or less than 0, return nan */
3575 res = lp_build_select(bld, negmask,
3576 lp_build_const_vec(bld->gallivm, type, NAN),
3577 res);
3578 }
3579 }
3580
3581 if (p_exp) {
3582 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3583 *p_exp = exp;
3584 }
3585
3586 if (p_floor_log2)
3587 *p_floor_log2 = logexp;
3588
3589 if (p_log2)
3590 *p_log2 = res;
3591 }
3592
3593
3594 /*
3595 * log2 implementation which doesn't have special code to
3596 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3597 * the results for those cases are undefined.
3598 */
3599 LLVMValueRef
3600 lp_build_log2(struct lp_build_context *bld,
3601 LLVMValueRef x)
3602 {
3603 LLVMValueRef res;
3604 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3605 return res;
3606 }
3607
3608 /*
3609 * Version of log2 which handles all edge cases.
3610 * Look at documentation of lp_build_log2_approx for
3611 * description of the behavior for each of the edge cases.
3612 */
3613 LLVMValueRef
3614 lp_build_log2_safe(struct lp_build_context *bld,
3615 LLVMValueRef x)
3616 {
3617 LLVMValueRef res;
3618 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3619 return res;
3620 }
3621
3622
3623 /**
3624 * Faster (and less accurate) log2.
3625 *
3626 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3627 *
3628 * Piece-wise linear approximation, with exact results when x is a
3629 * power of two.
3630 *
3631 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3632 */
3633 LLVMValueRef
3634 lp_build_fast_log2(struct lp_build_context *bld,
3635 LLVMValueRef x)
3636 {
3637 LLVMBuilderRef builder = bld->gallivm->builder;
3638 LLVMValueRef ipart;
3639 LLVMValueRef fpart;
3640
3641 assert(lp_check_value(bld->type, x));
3642
3643 assert(bld->type.floating);
3644
3645 /* ipart = floor(log2(x)) - 1 */
3646 ipart = lp_build_extract_exponent(bld, x, -1);
3647 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3648
3649    /* fpart = x / 2**floor(log2(x)), i.e. the mantissa in [1, 2) */
3650 fpart = lp_build_extract_mantissa(bld, x);
3651
3652 /* ipart + fpart */
3653 return LLVMBuildFAdd(builder, ipart, fpart, "");
3654 }
3655
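/*
 * Illustration only: a scalar sketch of the piece-wise linear approximation
 * above (assumes IEEE-754 single precision; not part of the gallivm API).
 * E.g. x = 3.0 gives 0 + 1.5 = 1.5 (exact value is ~1.585), while x = 4.0
 * gives 1 + 1.0 = 2.0 exactly.
 *
 *    static float fast_log2_scalar(float x)
 *    {
 *       union { float f; unsigned i; } u;
 *       u.f = x;
 *       int ipart = (int)((u.i >> 23) & 0xff) - 127 - 1;   // floor(log2(x)) - 1
 *       u.i = (u.i & 0x007fffff) | 0x3f800000;             // x / 2**floor(log2(x)) in [1, 2)
 *       return (float)ipart + u.f;
 *    }
 */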
3656
3657 /**
3658 * Fast implementation of iround(log2(x)).
3659 *
3660 * Not an approximation -- it should give accurate results all the time.
3661 */
3662 LLVMValueRef
3663 lp_build_ilog2(struct lp_build_context *bld,
3664 LLVMValueRef x)
3665 {
3666 LLVMBuilderRef builder = bld->gallivm->builder;
3667 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3668 LLVMValueRef ipart;
3669
3670 assert(bld->type.floating);
3671
3672 assert(lp_check_value(bld->type, x));
3673
3674    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3675 x = LLVMBuildFMul(builder, x, sqrt2, "");
3676
3677 /* ipart = floor(log2(x) + 0.5) */
3678 ipart = lp_build_extract_exponent(bld, x, 0);
3679
3680 return ipart;
3681 }
3682
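/*
 * Illustration only: scaling by sqrt(2) shifts log2(x) by +0.5, so taking
 * floor(log2()) of the scaled value rounds log2(x) to the nearest integer.
 * Scalar sketch (assumes IEEE-754 single precision, <math.h> for M_SQRT2):
 *
 *    static int ilog2_scalar(float x)
 *    {
 *       union { float f; unsigned i; } u;
 *       u.f = x * (float)M_SQRT2;                  // log2 becomes log2(x) + 0.5
 *       return (int)((u.i >> 23) & 0xff) - 127;    // floor(log2(x) + 0.5)
 *    }
 *
 * E.g. x = 5.0: log2(5) ~= 2.32, 5*sqrt(2) ~= 7.07, exponent = 2;
 *      x = 6.0: log2(6) ~= 2.58, 6*sqrt(2) ~= 8.49, exponent = 3.
 */
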
3683 LLVMValueRef
3684 lp_build_mod(struct lp_build_context *bld,
3685 LLVMValueRef x,
3686 LLVMValueRef y)
3687 {
3688 LLVMBuilderRef builder = bld->gallivm->builder;
3689 LLVMValueRef res;
3690 const struct lp_type type = bld->type;
3691
3692 assert(lp_check_value(type, x));
3693 assert(lp_check_value(type, y));
3694
3695 if (type.floating)
3696 res = LLVMBuildFRem(builder, x, y, "");
3697 else if (type.sign)
3698 res = LLVMBuildSRem(builder, x, y, "");
3699 else
3700 res = LLVMBuildURem(builder, x, y, "");
3701 return res;
3702 }
3703
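/*
 * Note (illustration only): LLVM's FRem/SRem follow C's truncating
 * remainder, so the result takes the sign of the dividend, e.g. in scalar C:
 *
 *    -7 % 3            == -1
 *     7 % -3           ==  1
 *    fmodf(-5.5f, 2.f) == -1.5f
 */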
3704
3705 /*
3706  * For floating point inputs, creates and returns a mask
3707  * which is all 1's for channels of x which are NaN,
3708  * and all 0's for channels which are not NaN.
3709 */
3710 LLVMValueRef
3711 lp_build_isnan(struct lp_build_context *bld,
3712 LLVMValueRef x)
3713 {
3714 LLVMValueRef mask;
3715 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3716
3717 assert(bld->type.floating);
3718 assert(lp_check_value(bld->type, x));
3719
3720 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3721 "isnotnan");
3722 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3723 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3724 return mask;
3725 }
3726
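/*
 * Illustration only: the trick above relies on NaN being the only value
 * that compares unequal to itself. Scalar sketch:
 *
 *    static unsigned isnan_mask_scalar(float x)
 *    {
 *       return (x == x) ? 0x00000000u : 0xffffffffu;
 *    }
 */
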
3727 /* Returns all 1's for floating point numbers that are
3728  * finite, and all 0's for -inf,
3729  * +inf and NaN. */
3730 LLVMValueRef
3731 lp_build_isfinite(struct lp_build_context *bld,
3732 LLVMValueRef x)
3733 {
3734 LLVMBuilderRef builder = bld->gallivm->builder;
3735 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3736 struct lp_type int_type = lp_int_type(bld->type);
3737 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3738 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3739 0x7f800000);
3740
3741 if (!bld->type.floating) {
3742 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3743 }
3744 assert(bld->type.floating);
3745 assert(lp_check_value(bld->type, x));
3746 assert(bld->type.width == 32);
3747
3748 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3749 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3750 intx, infornan32);
3751 }
3752
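/*
 * Illustration only: a single precision float is non-finite (inf or NaN)
 * exactly when its exponent field is all ones, hence the 0x7f800000 mask
 * above. lp_build_is_inf_or_nan below is the complementary test
 * (EQUAL instead of NOTEQUAL). Scalar sketch:
 *
 *    static unsigned isfinite_mask_scalar(float x)
 *    {
 *       union { float f; unsigned i; } u;
 *       u.f = x;
 *       return ((u.i & 0x7f800000u) != 0x7f800000u) ? 0xffffffffu : 0u;
 *    }
 */
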
3753 /*
3754  * Returns all 1's for channels that are NaN or +/-inf, and all 0's otherwise.
3755  * The input has to be a floating point vector.
3756 */
3757 LLVMValueRef
3758 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3759 const struct lp_type type,
3760 LLVMValueRef x)
3761 {
3762 LLVMBuilderRef builder = gallivm->builder;
3763 struct lp_type int_type = lp_int_type(type);
3764 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3765 0x7f800000);
3766 LLVMValueRef ret;
3767
3768 assert(type.floating);
3769
3770 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3771 ret = LLVMBuildAnd(builder, ret, const0, "");
3772 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3773 ret, const0);
3774
3775 return ret;
3776 }
3777
3778
3779 LLVMValueRef
3780 lp_build_fpstate_get(struct gallivm_state *gallivm)
3781 {
3782 if (util_cpu_caps.has_sse) {
3783 LLVMBuilderRef builder = gallivm->builder;
3784 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3785 gallivm,
3786 LLVMInt32TypeInContext(gallivm->context),
3787 "mxcsr_ptr");
3788 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3789 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3790 lp_build_intrinsic(builder,
3791 "llvm.x86.sse.stmxcsr",
3792 LLVMVoidTypeInContext(gallivm->context),
3793 &mxcsr_ptr8, 1, 0);
3794 return mxcsr_ptr;
3795 }
3796 return 0;
3797 }
3798
3799 void
3800 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3801 boolean zero)
3802 {
3803 if (util_cpu_caps.has_sse) {
3804 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3805 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3806
3807 LLVMBuilderRef builder = gallivm->builder;
3808 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3809 LLVMValueRef mxcsr =
3810 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3811
3812 if (util_cpu_caps.has_daz) {
3813          /* Enable denormals-are-zero (DAZ) mode */
3814 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3815 }
3816 if (zero) {
3817 mxcsr = LLVMBuildOr(builder, mxcsr,
3818 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3819 } else {
3820 mxcsr = LLVMBuildAnd(builder, mxcsr,
3821 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3822 }
3823
3824 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3825 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3826 }
3827 }
3828
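/*
 * Illustration only: at runtime the IR built above behaves roughly like this
 * scalar code using the SSE intrinsics from <xmmintrin.h> (the has_daz check
 * is ignored here):
 *
 *    static void set_denorms_zero_scalar(int zero)
 *    {
 *       unsigned mxcsr = _mm_getcsr();
 *       unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
 *       if (zero)
 *          mxcsr |= daz_ftz;      // flush denormal inputs/results to zero
 *       else
 *          mxcsr &= ~daz_ftz;
 *       _mm_setcsr(mxcsr);
 *    }
 */
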
3829 void
3830 lp_build_fpstate_set(struct gallivm_state *gallivm,
3831 LLVMValueRef mxcsr_ptr)
3832 {
3833 if (util_cpu_caps.has_sse) {
3834 LLVMBuilderRef builder = gallivm->builder;
3835 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3836 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3837 lp_build_intrinsic(builder,
3838 "llvm.x86.sse.ldmxcsr",
3839 LLVMVoidTypeInContext(gallivm->context),
3840 &mxcsr_ptr, 1, 0);
3841 }
3842 }
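
/*
 * Usage sketch (hypothetical caller, illustration only): save the FP state,
 * force denormals to zero around a stretch of generated code, then restore:
 *
 *    LLVMValueRef saved_mxcsr = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that must not see denormals ...
 *    lp_build_fpstate_set(gallivm, saved_mxcsr);
 */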