gallivm: Basic AVX2 support.
src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks are done for the special-case values a or b = 0 or 1.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle NaNs for floating point numbers. If one of the
213 * inputs is a NaN the other input should be returned (required by both
214 * D3D10+ and OpenCL).
215 * The SSE intrinsics return the second operand in case of a NaN by
216 * default, so we need special code to handle those cases.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
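
/*
 * Illustration only (hypothetical scalar sketch, not part of gallivm) of the
 * NaN fixup applied around the SSE min intrinsics above: minps/minss return
 * their second operand whenever either input is a NaN, so for
 * GALLIVM_NAN_RETURN_OTHER the result is patched up with an explicit isnan
 * test on b.
 */
static inline float
lp_min_return_other_sketch(float a, float b)
{
   float m = a < b ? a : b;   /* like minps: yields b when a or b is NaN */
   return b != b ? a : m;     /* if b is NaN, return the other operand */
}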
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
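
/*
 * Usage sketch (illustration only): a*b + c on matching float values/vectors,
 *
 *    LLVMValueRef r = lp_build_fmuladd(builder, a, b, c);
 *
 * maps to the llvm.fmuladd.* intrinsic (which the backend is free to fuse
 * into a single FMA) on LLVM >= 3.4, and to an explicit mul + add on older
 * LLVM versions, as noted above.
 */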
299
300
301 /**
302 * Generate max(a, b)
303 * No checks are done for the special-case values a or b = 0 or 1.
304 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if(a == bld->zero)
545 return b;
546 if(b == bld->zero)
547 return a;
548 if(a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if(bld->type.norm) {
552 const char *intrinsic = NULL;
553
554 if(a == bld->one || b == bld->one)
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if(util_cpu_caps.has_sse2) {
560 if(type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if(type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if(type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if(type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if(util_cpu_caps.has_avx2) {
573 if(type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if(type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
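
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar
 * equivalent of the unsigned saturated add emulation above, where a is first
 * clamped to min(a, ~b) -- i.e. min(a, 255 - b) for 8 bits -- so that a + b
 * can never wrap around.
 */
static inline unsigned
lp_addus_u8_sketch(unsigned a, unsigned b)
{
   /* assumes a and b are in [0, 255] */
   a = MIN2(a, 255 - b);   /* same as min(a, ~b) in 8-bit arithmetic */
   return a + b;
}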
619
620
621 /** Return the scalar sum of the elements of a.
622 * This operation should be avoided whenever possible.
623 */
624 LLVMValueRef
625 lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
644 /*
645 * For byte vectors we could do much better with psadbw.
646 * Using repeated shuffle/adds here. Note that with multiple vectors
647 * this can be done more efficiently, as outlined in the Intel
648 * optimization manual.
649 * Note: could cause data rearrangement if used with smaller element
650 * sizes.
651 */
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
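
/*
 * For illustration of the reduction loop above: with a length 8 vector
 * {a,b,c,d,e,f,g,h} the shuffles and adds proceed as
 *
 *    {a,b,c,d} + {e,f,g,h}    -> {a+e, b+f, c+g, d+h}
 *    {a+e, b+f} + {c+g, d+h}  -> {a+e+c+g, b+f+d+h}
 *
 * and the final two elements are extracted and added as scalars.
 */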
689
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
692 * This uses the technique as outlined in Intel Optimization Manual.
693 */
694 static LLVMValueRef
695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
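
/*
 * Data flow of the shuffles above, for src = {x0..x3}, {y0..y3}, {z0..z3},
 * {w0..w3}:
 *
 *    tmp[0] = x0 x1 y0 y1        tmp[1] = x2 x3 y2 y3
 *    tmp[2] = z0 z1 w0 w1        tmp[3] = z2 z3 w2 w3
 *    sumtmp[0] = x0+x2 x1+x3 y0+y2 y1+y3
 *    sumtmp[1] = z0+z2 z1+z3 w0+w2 w1+w3
 *    shuftmp[0] = x0+x2 y0+y2 z0+z2 w0+w2
 *    shuftmp[1] = x1+x3 y1+y3 z1+z3 w1+w3
 *
 * so the final add yields {sum(x), sum(y), sum(z), sum(w)}.
 */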
743
744
745 /*
746 * Partially horizontally add 2-4 float vectors with length n*4,
747 * i.e. only four adjacent values in each vector will be added,
748 * assuming the values are really grouped in fours, which also determines
749 * the output order.
750 *
751 * Return a vector of the same length as the initial vectors,
752 * with the excess elements (if any) being undefined.
753 * The element order is independent of number of input vectors.
754 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755 * the output order thus will be
756 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
757 */
758 LLVMValueRef
759 lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on cpu) and we always need two horizontal adds anyway,
774 * so a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
834 lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if(b == bld->zero)
846 return a;
847 if(a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if(a == b)
850 return bld->zero;
851
852 if(bld->type.norm) {
853 const char *intrinsic = NULL;
854
855 if(b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if(type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if(type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if(type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if(type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if(type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if(type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
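
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar
 * equivalent of the unsigned saturated subtract emulation above; clamping a
 * to max(a, b) first guarantees that a - b can never wrap below zero.
 */
static inline unsigned
lp_subus_u8_sketch(unsigned a, unsigned b)
{
   /* assumes a and b are in [0, 255] */
   a = MAX2(a, b);
   return a - b;
}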
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
930 * a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria:
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case taking just the first two terms so it fits in 16-bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
946 * note that just by itself it doesn't satisfy the OpenGL criteria, as
947 * 255*255 = 254, so the special case b = 255 must be accounted for or
948 * rounding must be used.
949 *
950 * - geometric series plus rounding
951 *
952 * when using the geometric series division, instead of truncating the
953 * result, use rounding in the approximation (Jim Blinn)
954 *
955 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
956 *
957 * which achieves exact results.
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
966 static LLVMValueRef
967 lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
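
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar 8-bit
 * unsigned version of the formula used above,
 *
 *    a*b / 255 ~= (a*b + (a*b >> 8) + 0x80) >> 8
 *
 * which is exact at the endpoints (0*0 = 0 and 255*255 = 255).
 */
static inline unsigned
lp_mul_norm_u8_sketch(unsigned a, unsigned b)
{
   unsigned ab = a * b;   /* full 16-bit product, assumes a, b in [0, 255] */
   ab += ab >> 8;         /* geometric series correction term */
   ab += 0x80;            /* round to nearest instead of truncating */
   return ab >> 8;        /* approximate division by 255 */
}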
1017
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
1022 lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094
1095 /* a * b + c */
1096 LLVMValueRef
1097 lp_build_mad(struct lp_build_context *bld,
1098 LLVMValueRef a,
1099 LLVMValueRef b,
1100 LLVMValueRef c)
1101 {
1102 const struct lp_type type = bld->type;
1103 if (type.floating) {
1104 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1105 } else {
1106 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1107 }
1108 }
1109
1110
1111 /**
1112 * Optimized multiplication of a vector by a small integer constant.
1113 */
1114 LLVMValueRef
1115 lp_build_mul_imm(struct lp_build_context *bld,
1116 LLVMValueRef a,
1117 int b)
1118 {
1119 LLVMBuilderRef builder = bld->gallivm->builder;
1120 LLVMValueRef factor;
1121
1122 assert(lp_check_value(bld->type, a));
1123
1124 if(b == 0)
1125 return bld->zero;
1126
1127 if(b == 1)
1128 return a;
1129
1130 if(b == -1)
1131 return lp_build_negate(bld, a);
1132
1133 if(b == 2 && bld->type.floating)
1134 return lp_build_add(bld, a, a);
1135
1136 if(util_is_power_of_two(b)) {
1137 unsigned shift = ffs(b) - 1;
1138
1139 if(bld->type.floating) {
1140 #if 0
1141 /*
1142 * Power of two multiplication by directly manipulating the exponent.
1143 *
1144 * XXX: This might not always be faster, and it will introduce a small error
1145 * for multiplication by zero, and it will produce wrong results
1146 * for Inf and NaN.
1147 */
1148 unsigned mantissa = lp_mantissa(bld->type);
1149 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1150 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1151 a = LLVMBuildAdd(builder, a, factor, "");
1152 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1153 return a;
1154 #endif
1155 }
1156 else {
1157 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1158 return LLVMBuildShl(builder, a, factor, "");
1159 }
1160 }
1161
1162 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1163 return lp_build_mul(bld, a, factor);
1164 }
1165
1166
1167 /**
1168 * Generate a / b
1169 */
1170 LLVMValueRef
1171 lp_build_div(struct lp_build_context *bld,
1172 LLVMValueRef a,
1173 LLVMValueRef b)
1174 {
1175 LLVMBuilderRef builder = bld->gallivm->builder;
1176 const struct lp_type type = bld->type;
1177
1178 assert(lp_check_value(type, a));
1179 assert(lp_check_value(type, b));
1180
1181 if(a == bld->zero)
1182 return bld->zero;
1183 if(a == bld->one && type.floating)
1184 return lp_build_rcp(bld, b);
1185 if(b == bld->zero)
1186 return bld->undef;
1187 if(b == bld->one)
1188 return a;
1189 if(a == bld->undef || b == bld->undef)
1190 return bld->undef;
1191
1192 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1193 if (type.floating)
1194 return LLVMConstFDiv(a, b);
1195 else if (type.sign)
1196 return LLVMConstSDiv(a, b);
1197 else
1198 return LLVMConstUDiv(a, b);
1199 }
1200
1201 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1202 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1203 type.floating)
1204 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1205
1206 if (type.floating)
1207 return LLVMBuildFDiv(builder, a, b, "");
1208 else if (type.sign)
1209 return LLVMBuildSDiv(builder, a, b, "");
1210 else
1211 return LLVMBuildUDiv(builder, a, b, "");
1212 }
1213
1214
1215 /**
1216 * Linear interpolation helper.
1217 *
1218 * @param flags   LP_BLD_LERP_WIDE_NORMALIZED if we are interpolating
1219 * normalized values encoded in integers twice as wide.
1220 *
1221 * @sa http://www.stereopsis.com/doubleblend.html
1222 */
1223 static inline LLVMValueRef
1224 lp_build_lerp_simple(struct lp_build_context *bld,
1225 LLVMValueRef x,
1226 LLVMValueRef v0,
1227 LLVMValueRef v1,
1228 unsigned flags)
1229 {
1230 unsigned half_width = bld->type.width/2;
1231 LLVMBuilderRef builder = bld->gallivm->builder;
1232 LLVMValueRef delta;
1233 LLVMValueRef res;
1234
1235 assert(lp_check_value(bld->type, x));
1236 assert(lp_check_value(bld->type, v0));
1237 assert(lp_check_value(bld->type, v1));
1238
1239 delta = lp_build_sub(bld, v1, v0);
1240
1241 if (bld->type.floating) {
1242 assert(flags == 0);
1243 return lp_build_mad(bld, x, delta, v0);
1244 }
1245
1246 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1247 if (!bld->type.sign) {
1248 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1249 /*
1250 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1251 * most significant bit to the least significant bit, so that
1252 * later we can just divide by 2**n instead of 2**n - 1.
1253 */
1254
1255 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1256 }
1257
1258 /* (x * delta) >> n */
1259 res = lp_build_mul(bld, x, delta);
1260 res = lp_build_shr_imm(bld, res, half_width);
1261 } else {
1262 /*
1263 * The rescaling trick above doesn't work for signed numbers, so
1264 * use the 2**n - 1 division approximation in lp_build_mul_norm
1265 * instead.
1266 */
1267 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1268 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1269 }
1270 } else {
1271 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1272 res = lp_build_mul(bld, x, delta);
1273 }
1274
1275 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1276 /*
1277 * At this point both res and v0 only use the lower half of the bits,
1278 * the rest is zero. Instead of add / mask, do add with half wide type.
1279 */
1280 struct lp_type narrow_type;
1281 struct lp_build_context narrow_bld;
1282
1283 memset(&narrow_type, 0, sizeof narrow_type);
1284 narrow_type.sign = bld->type.sign;
1285 narrow_type.width = bld->type.width/2;
1286 narrow_type.length = bld->type.length*2;
1287
1288 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1289 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1290 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1291 res = lp_build_add(&narrow_bld, v0, res);
1292 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1293 } else {
1294 res = lp_build_add(bld, v0, res);
1295
1296 if (bld->type.fixed) {
1297 /*
1298 * We need to mask out the high order bits when lerping 8-bit
1299 * normalized colors stored in 16 bits
1300 */
1301 /* XXX: This step is necessary for lerping 8-bit colors stored in
1302 * 16 bits, but it will be wrong for true fixed point use cases.
1303 * Basically we need a more powerful lp_type, capable of further
1304 * distinguishing the values interpretation from the value storage.
1305 */
1306 LLVMValueRef low_bits;
1307 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1308 res = LLVMBuildAnd(builder, res, low_bits, "");
1309 }
1310 }
1311
1312 return res;
1313 }
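
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar sketch
 * of the unsigned, non-prescaled LP_BLD_LERP_WIDE_NORMALIZED path above for
 * 8-bit values widened to 16 bits. Assumes the compiler implements >> on
 * negative values as an arithmetic shift, a subtlety the vector code does not
 * have to worry about.
 */
static inline unsigned
lp_lerp_norm_u8_sketch(unsigned x, unsigned v0, unsigned v1)
{
   /* assumes x, v0 and v1 are in [0, 255] */
   int delta = (int)v1 - (int)v0;
   int w = (int)(x + (x >> 7));        /* rescale x from [0,255] to [0,256] */
   return (unsigned)((int)v0 + ((w * delta) >> 8));
}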
1314
1315
1316 /**
1317 * Linear interpolation.
1318 */
1319 LLVMValueRef
1320 lp_build_lerp(struct lp_build_context *bld,
1321 LLVMValueRef x,
1322 LLVMValueRef v0,
1323 LLVMValueRef v1,
1324 unsigned flags)
1325 {
1326 const struct lp_type type = bld->type;
1327 LLVMValueRef res;
1328
1329 assert(lp_check_value(type, x));
1330 assert(lp_check_value(type, v0));
1331 assert(lp_check_value(type, v1));
1332
1333 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1334
1335 if (type.norm) {
1336 struct lp_type wide_type;
1337 struct lp_build_context wide_bld;
1338 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1339
1340 assert(type.length >= 2);
1341
1342 /*
1343 * Create a wider integer type, enough to hold the
1344 * intermediate result of the multiplication.
1345 */
1346 memset(&wide_type, 0, sizeof wide_type);
1347 wide_type.sign = type.sign;
1348 wide_type.width = type.width*2;
1349 wide_type.length = type.length/2;
1350
1351 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1352
1353 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1354 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1355 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1356
1357 /*
1358 * Lerp both halves.
1359 */
1360
1361 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1362
1363 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1364 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1365
1366 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1367 } else {
1368 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1369 }
1370
1371 return res;
1372 }
1373
1374
1375 /**
1376 * Bilinear interpolation.
1377 *
1378 * Value indices are in v_{yx}.
1379 */
1380 LLVMValueRef
1381 lp_build_lerp_2d(struct lp_build_context *bld,
1382 LLVMValueRef x,
1383 LLVMValueRef y,
1384 LLVMValueRef v00,
1385 LLVMValueRef v01,
1386 LLVMValueRef v10,
1387 LLVMValueRef v11,
1388 unsigned flags)
1389 {
1390 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1391 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1392 return lp_build_lerp(bld, y, v0, v1, flags);
1393 }
1394
1395
1396 LLVMValueRef
1397 lp_build_lerp_3d(struct lp_build_context *bld,
1398 LLVMValueRef x,
1399 LLVMValueRef y,
1400 LLVMValueRef z,
1401 LLVMValueRef v000,
1402 LLVMValueRef v001,
1403 LLVMValueRef v010,
1404 LLVMValueRef v011,
1405 LLVMValueRef v100,
1406 LLVMValueRef v101,
1407 LLVMValueRef v110,
1408 LLVMValueRef v111,
1409 unsigned flags)
1410 {
1411 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1412 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1413 return lp_build_lerp(bld, z, v0, v1, flags);
1414 }
1415
1416
1417 /**
1418 * Generate min(a, b)
1419 * Do checks for special cases; NaN behavior is undefined.
1420 */
1421 LLVMValueRef
1422 lp_build_min(struct lp_build_context *bld,
1423 LLVMValueRef a,
1424 LLVMValueRef b)
1425 {
1426 assert(lp_check_value(bld->type, a));
1427 assert(lp_check_value(bld->type, b));
1428
1429 if(a == bld->undef || b == bld->undef)
1430 return bld->undef;
1431
1432 if(a == b)
1433 return a;
1434
1435 if (bld->type.norm) {
1436 if (!bld->type.sign) {
1437 if (a == bld->zero || b == bld->zero) {
1438 return bld->zero;
1439 }
1440 }
1441 if(a == bld->one)
1442 return b;
1443 if(b == bld->one)
1444 return a;
1445 }
1446
1447 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1448 }
1449
1450
1451 /**
1452 * Generate min(a, b)
1453 * NaNs are handled according to the behavior specified by the
1454 * nan_behavior argument.
1455 */
1456 LLVMValueRef
1457 lp_build_min_ext(struct lp_build_context *bld,
1458 LLVMValueRef a,
1459 LLVMValueRef b,
1460 enum gallivm_nan_behavior nan_behavior)
1461 {
1462 assert(lp_check_value(bld->type, a));
1463 assert(lp_check_value(bld->type, b));
1464
1465 if(a == bld->undef || b == bld->undef)
1466 return bld->undef;
1467
1468 if(a == b)
1469 return a;
1470
1471 if (bld->type.norm) {
1472 if (!bld->type.sign) {
1473 if (a == bld->zero || b == bld->zero) {
1474 return bld->zero;
1475 }
1476 }
1477 if(a == bld->one)
1478 return b;
1479 if(b == bld->one)
1480 return a;
1481 }
1482
1483 return lp_build_min_simple(bld, a, b, nan_behavior);
1484 }
1485
1486 /**
1487 * Generate max(a, b)
1488 * Do checks for special cases, but NaN behavior is undefined.
1489 */
1490 LLVMValueRef
1491 lp_build_max(struct lp_build_context *bld,
1492 LLVMValueRef a,
1493 LLVMValueRef b)
1494 {
1495 assert(lp_check_value(bld->type, a));
1496 assert(lp_check_value(bld->type, b));
1497
1498 if(a == bld->undef || b == bld->undef)
1499 return bld->undef;
1500
1501 if(a == b)
1502 return a;
1503
1504 if(bld->type.norm) {
1505 if(a == bld->one || b == bld->one)
1506 return bld->one;
1507 if (!bld->type.sign) {
1508 if (a == bld->zero) {
1509 return b;
1510 }
1511 if (b == bld->zero) {
1512 return a;
1513 }
1514 }
1515 }
1516
1517 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1518 }
1519
1520
1521 /**
1522 * Generate max(a, b)
1523 * Checks for special cases.
1524 * NaNs are handled according to the behavior specified by the
1525 * nan_behavior argument.
1526 */
1527 LLVMValueRef
1528 lp_build_max_ext(struct lp_build_context *bld,
1529 LLVMValueRef a,
1530 LLVMValueRef b,
1531 enum gallivm_nan_behavior nan_behavior)
1532 {
1533 assert(lp_check_value(bld->type, a));
1534 assert(lp_check_value(bld->type, b));
1535
1536 if(a == bld->undef || b == bld->undef)
1537 return bld->undef;
1538
1539 if(a == b)
1540 return a;
1541
1542 if(bld->type.norm) {
1543 if(a == bld->one || b == bld->one)
1544 return bld->one;
1545 if (!bld->type.sign) {
1546 if (a == bld->zero) {
1547 return b;
1548 }
1549 if (b == bld->zero) {
1550 return a;
1551 }
1552 }
1553 }
1554
1555 return lp_build_max_simple(bld, a, b, nan_behavior);
1556 }
1557
1558 /**
1559 * Generate clamp(a, min, max)
1560 * NaN behavior (for any of a, min, max) is undefined.
1561 * Do checks for special cases.
1562 */
1563 LLVMValueRef
1564 lp_build_clamp(struct lp_build_context *bld,
1565 LLVMValueRef a,
1566 LLVMValueRef min,
1567 LLVMValueRef max)
1568 {
1569 assert(lp_check_value(bld->type, a));
1570 assert(lp_check_value(bld->type, min));
1571 assert(lp_check_value(bld->type, max));
1572
1573 a = lp_build_min(bld, a, max);
1574 a = lp_build_max(bld, a, min);
1575 return a;
1576 }
1577
1578
1579 /**
1580 * Generate clamp(a, 0, 1)
1581 * A NaN will get converted to zero.
1582 */
1583 LLVMValueRef
1584 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1585 LLVMValueRef a)
1586 {
1587 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1588 a = lp_build_min(bld, a, bld->one);
1589 return a;
1590 }
1591
1592
1593 /**
1594 * Generate abs(a)
1595 */
1596 LLVMValueRef
1597 lp_build_abs(struct lp_build_context *bld,
1598 LLVMValueRef a)
1599 {
1600 LLVMBuilderRef builder = bld->gallivm->builder;
1601 const struct lp_type type = bld->type;
1602 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1603
1604 assert(lp_check_value(type, a));
1605
1606 if(!type.sign)
1607 return a;
1608
1609 if(type.floating) {
1610 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1611 /* Workaround llvm.org/PR27332 */
1612 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1613 unsigned long long absMask = ~(1ULL << (type.width - 1));
1614 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1615 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1616 a = LLVMBuildAnd(builder, a, mask, "");
1617 a = LLVMBuildBitCast(builder, a, vec_type, "");
1618 return a;
1619 } else {
1620 char intrinsic[32];
1621 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1622 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1623 }
1624 }
1625
1626 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1627 switch(type.width) {
1628 case 8:
1629 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1630 case 16:
1631 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1632 case 32:
1633 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1634 }
1635 }
1636 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
1637 switch(type.width) {
1638 case 8:
1639 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1640 case 16:
1641 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1642 case 32:
1643 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1644 }
1645 }
1646 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1647 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1648 (type.width == 8 || type.width == 16 || type.width == 32)) {
1649 debug_printf("%s: inefficient code, should split vectors manually\n",
1650 __FUNCTION__);
1651 }
1652
1653 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1654 }
1655
1656
1657 LLVMValueRef
1658 lp_build_negate(struct lp_build_context *bld,
1659 LLVMValueRef a)
1660 {
1661 LLVMBuilderRef builder = bld->gallivm->builder;
1662
1663 assert(lp_check_value(bld->type, a));
1664
1665 if (bld->type.floating)
1666 a = LLVMBuildFNeg(builder, a, "");
1667 else
1668 a = LLVMBuildNeg(builder, a, "");
1669
1670 return a;
1671 }
1672
1673
1674 /** Return -1, 0 or +1 depending on the sign of a */
1675 LLVMValueRef
1676 lp_build_sgn(struct lp_build_context *bld,
1677 LLVMValueRef a)
1678 {
1679 LLVMBuilderRef builder = bld->gallivm->builder;
1680 const struct lp_type type = bld->type;
1681 LLVMValueRef cond;
1682 LLVMValueRef res;
1683
1684 assert(lp_check_value(type, a));
1685
1686 /* Handle non-zero case */
1687 if(!type.sign) {
1688 /* if not zero then sign must be positive */
1689 res = bld->one;
1690 }
1691 else if(type.floating) {
1692 LLVMTypeRef vec_type;
1693 LLVMTypeRef int_type;
1694 LLVMValueRef mask;
1695 LLVMValueRef sign;
1696 LLVMValueRef one;
1697 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1698
1699 int_type = lp_build_int_vec_type(bld->gallivm, type);
1700 vec_type = lp_build_vec_type(bld->gallivm, type);
1701 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1702
1703 /* Take the sign bit and OR it into the constant 1.0 */
1704 sign = LLVMBuildBitCast(builder, a, int_type, "");
1705 sign = LLVMBuildAnd(builder, sign, mask, "");
1706 one = LLVMConstBitCast(bld->one, int_type);
1707 res = LLVMBuildOr(builder, sign, one, "");
1708 res = LLVMBuildBitCast(builder, res, vec_type, "");
1709 }
1710 else
1711 {
1712 /* signed int/norm/fixed point */
1713 /* could use psign with sse3 and appropriate vectors here */
1714 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1715 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1716 res = lp_build_select(bld, cond, bld->one, minus_one);
1717 }
1718
1719 /* Handle zero */
1720 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1721 res = lp_build_select(bld, cond, bld->zero, res);
1722
1723 return res;
1724 }
1725
1726
1727 /**
1728 * Set the sign of float vector 'a' according to 'sign'.
1729 * If sign==0, return abs(a).
1730 * If sign==1, return -abs(a);
1731 * Other values for sign produce undefined results.
1732 */
1733 LLVMValueRef
1734 lp_build_set_sign(struct lp_build_context *bld,
1735 LLVMValueRef a, LLVMValueRef sign)
1736 {
1737 LLVMBuilderRef builder = bld->gallivm->builder;
1738 const struct lp_type type = bld->type;
1739 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1740 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1741 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1742 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1743 ~((unsigned long long) 1 << (type.width - 1)));
1744 LLVMValueRef val, res;
1745
1746 assert(type.floating);
1747 assert(lp_check_value(type, a));
1748
1749 /* val = reinterpret_cast<int>(a) */
1750 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1751 /* val = val & mask */
1752 val = LLVMBuildAnd(builder, val, mask, "");
1753 /* sign = sign << shift */
1754 sign = LLVMBuildShl(builder, sign, shift, "");
1755 /* res = val | sign */
1756 res = LLVMBuildOr(builder, val, sign, "");
1757 /* res = reinterpret_cast<float>(res) */
1758 res = LLVMBuildBitCast(builder, res, vec_type, "");
1759
1760 return res;
1761 }
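
/*
 * Illustration only (hypothetical helper, not part of gallivm): scalar
 * equivalent of the bit manipulation above for 32-bit floats, with 'sign'
 * being 0 or 1.
 */
static inline float
lp_set_sign_f32_sketch(float a, unsigned sign)
{
   union { float f; uint32_t u; } v;
   v.f = a;
   v.u = (v.u & 0x7fffffffu) | ((uint32_t)sign << 31);   /* clear, then set, the sign bit */
   return v.f;
}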
1762
1763
1764 /**
1765 * Convert vector of (or scalar) int to vector of (or scalar) float.
1766 */
1767 LLVMValueRef
1768 lp_build_int_to_float(struct lp_build_context *bld,
1769 LLVMValueRef a)
1770 {
1771 LLVMBuilderRef builder = bld->gallivm->builder;
1772 const struct lp_type type = bld->type;
1773 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1774
1775 assert(type.floating);
1776
1777 return LLVMBuildSIToFP(builder, a, vec_type, "");
1778 }
1779
1780 static boolean
1781 arch_rounding_available(const struct lp_type type)
1782 {
1783 if ((util_cpu_caps.has_sse4_1 &&
1784 (type.length == 1 || type.width*type.length == 128)) ||
1785 (util_cpu_caps.has_avx && type.width*type.length == 256))
1786 return TRUE;
1787 else if ((util_cpu_caps.has_altivec &&
1788 (type.width == 32 && type.length == 4)))
1789 return TRUE;
1790
1791 return FALSE;
1792 }
1793
1794 enum lp_build_round_mode
1795 {
1796 LP_BUILD_ROUND_NEAREST = 0,
1797 LP_BUILD_ROUND_FLOOR = 1,
1798 LP_BUILD_ROUND_CEIL = 2,
1799 LP_BUILD_ROUND_TRUNCATE = 3
1800 };
1801
1802 static inline LLVMValueRef
1803 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1804 LLVMValueRef a)
1805 {
1806 LLVMBuilderRef builder = bld->gallivm->builder;
1807 const struct lp_type type = bld->type;
1808 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1809 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1810 const char *intrinsic;
1811 LLVMValueRef res;
1812
1813 assert(type.floating);
1814 /* using the double precision conversions is a bit more complicated */
1815 assert(type.width == 32);
1816
1817 assert(lp_check_value(type, a));
1818 assert(util_cpu_caps.has_sse2);
1819
1820 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1821 if (type.length == 1) {
1822 LLVMTypeRef vec_type;
1823 LLVMValueRef undef;
1824 LLVMValueRef arg;
1825 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1826
1827 vec_type = LLVMVectorType(bld->elem_type, 4);
1828
1829 intrinsic = "llvm.x86.sse.cvtss2si";
1830
1831 undef = LLVMGetUndef(vec_type);
1832
1833 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1834
1835 res = lp_build_intrinsic_unary(builder, intrinsic,
1836 ret_type, arg);
1837 }
1838 else {
1839 if (type.width* type.length == 128) {
1840 intrinsic = "llvm.x86.sse2.cvtps2dq";
1841 }
1842 else {
1843 assert(type.width*type.length == 256);
1844 assert(util_cpu_caps.has_avx);
1845
1846 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1847 }
1848 res = lp_build_intrinsic_unary(builder, intrinsic,
1849 ret_type, a);
1850 }
1851
1852 return res;
1853 }
1854
1855
1856 /* Round to an integral float value using the AltiVec vrfi* instructions.
1857 */
1858 static inline LLVMValueRef
1859 lp_build_round_altivec(struct lp_build_context *bld,
1860 LLVMValueRef a,
1861 enum lp_build_round_mode mode)
1862 {
1863 LLVMBuilderRef builder = bld->gallivm->builder;
1864 const struct lp_type type = bld->type;
1865 const char *intrinsic = NULL;
1866
1867 assert(type.floating);
1868
1869 assert(lp_check_value(type, a));
1870 assert(util_cpu_caps.has_altivec);
1871
1872 (void)type;
1873
1874 switch (mode) {
1875 case LP_BUILD_ROUND_NEAREST:
1876 intrinsic = "llvm.ppc.altivec.vrfin";
1877 break;
1878 case LP_BUILD_ROUND_FLOOR:
1879 intrinsic = "llvm.ppc.altivec.vrfim";
1880 break;
1881 case LP_BUILD_ROUND_CEIL:
1882 intrinsic = "llvm.ppc.altivec.vrfip";
1883 break;
1884 case LP_BUILD_ROUND_TRUNCATE:
1885 intrinsic = "llvm.ppc.altivec.vrfiz";
1886 break;
1887 }
1888
1889 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1890 }
1891
1892 static inline LLVMValueRef
1893 lp_build_round_arch(struct lp_build_context *bld,
1894 LLVMValueRef a,
1895 enum lp_build_round_mode mode)
1896 {
1897 if (util_cpu_caps.has_sse4_1) {
1898 LLVMBuilderRef builder = bld->gallivm->builder;
1899 const struct lp_type type = bld->type;
1900 const char *intrinsic_root;
1901 char intrinsic[32];
1902
1903 assert(type.floating);
1904 assert(lp_check_value(type, a));
1905 (void)type;
1906
1907 switch (mode) {
1908 case LP_BUILD_ROUND_NEAREST:
1909 intrinsic_root = "llvm.nearbyint";
1910 break;
1911 case LP_BUILD_ROUND_FLOOR:
1912 intrinsic_root = "llvm.floor";
1913 break;
1914 case LP_BUILD_ROUND_CEIL:
1915 intrinsic_root = "llvm.ceil";
1916 break;
1917 case LP_BUILD_ROUND_TRUNCATE:
1918 intrinsic_root = "llvm.trunc";
1919 break;
1920 }
1921
1922 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1923 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1924 }
1925 else /* (util_cpu_caps.has_altivec) */
1926 return lp_build_round_altivec(bld, a, mode);
1927 }
1928
1929 /**
1930 * Return the integer part of a float (vector) value (== round toward zero).
1931 * The returned value is a float (vector).
1932 * Ex: trunc(-1.5) = -1.0
1933 */
1934 LLVMValueRef
1935 lp_build_trunc(struct lp_build_context *bld,
1936 LLVMValueRef a)
1937 {
1938 LLVMBuilderRef builder = bld->gallivm->builder;
1939 const struct lp_type type = bld->type;
1940
1941 assert(type.floating);
1942 assert(lp_check_value(type, a));
1943
1944 if (arch_rounding_available(type)) {
1945 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1946 }
1947 else {
1948 const struct lp_type type = bld->type;
1949 struct lp_type inttype;
1950 struct lp_build_context intbld;
1951 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1952 LLVMValueRef trunc, res, anosign, mask;
1953 LLVMTypeRef int_vec_type = bld->int_vec_type;
1954 LLVMTypeRef vec_type = bld->vec_type;
1955
1956 assert(type.width == 32); /* might want to handle doubles at some point */
1957
1958 inttype = type;
1959 inttype.floating = 0;
1960 lp_build_context_init(&intbld, bld->gallivm, inttype);
1961
1962 /* round by truncation */
1963 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1964 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1965
1966 /* mask out sign bit */
1967 anosign = lp_build_abs(bld, a);
1968 /*
1969 * mask out all values if anosign > 2^24
1970 * This should work both for large ints (all rounding is no-op for them
1971 * because such floats are always exact) as well as special cases like
1972 * NaNs, Infs (taking advantage of the fact they use max exponent).
1973        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1974 */
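      /*
       * Worked example: 2^24 = 16777216.0f and the next representable float is
       * 16777218.0f, so any float above the threshold is already an integer and
       * truncation would be a no-op; selecting 'a' itself also keeps NaN/Inf and
       * huge values intact, where the FPToSI result above is not meaningful.
       */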
1975 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1976 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1977 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1978 return lp_build_select(bld, mask, a, res);
1979 }
1980 }
1981
1982
1983 /**
1984 * Return float (vector) rounded to nearest integer (vector). The returned
1985 * value is a float (vector).
1986 * Ex: round(0.9) = 1.0
1987 * Ex: round(-1.5) = -2.0
1988 */
1989 LLVMValueRef
1990 lp_build_round(struct lp_build_context *bld,
1991 LLVMValueRef a)
1992 {
1993 LLVMBuilderRef builder = bld->gallivm->builder;
1994 const struct lp_type type = bld->type;
1995
1996 assert(type.floating);
1997 assert(lp_check_value(type, a));
1998
1999 if (arch_rounding_available(type)) {
2000 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2001 }
2002 else {
2003 const struct lp_type type = bld->type;
2004 struct lp_type inttype;
2005 struct lp_build_context intbld;
2006 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2007 LLVMValueRef res, anosign, mask;
2008 LLVMTypeRef int_vec_type = bld->int_vec_type;
2009 LLVMTypeRef vec_type = bld->vec_type;
2010
2011 assert(type.width == 32); /* might want to handle doubles at some point */
2012
2013 inttype = type;
2014 inttype.floating = 0;
2015 lp_build_context_init(&intbld, bld->gallivm, inttype);
2016
2017 res = lp_build_iround(bld, a);
2018 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2019
2020 /* mask out sign bit */
2021 anosign = lp_build_abs(bld, a);
2022 /*
2023 * mask out all values if anosign > 2^24
2024 * This should work both for large ints (all rounding is no-op for them
2025 * because such floats are always exact) as well as special cases like
2026 * NaNs, Infs (taking advantage of the fact they use max exponent).
2027        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2028 */
2029 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2030 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2031 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2032 return lp_build_select(bld, mask, a, res);
2033 }
2034 }
2035
2036
2037 /**
2038 * Return floor of float (vector), result is a float (vector)
2039 * Ex: floor(1.1) = 1.0
2040 * Ex: floor(-1.1) = -2.0
2041 */
2042 LLVMValueRef
2043 lp_build_floor(struct lp_build_context *bld,
2044 LLVMValueRef a)
2045 {
2046 LLVMBuilderRef builder = bld->gallivm->builder;
2047 const struct lp_type type = bld->type;
2048
2049 assert(type.floating);
2050 assert(lp_check_value(type, a));
2051
2052 if (arch_rounding_available(type)) {
2053 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2054 }
2055 else {
2056 const struct lp_type type = bld->type;
2057 struct lp_type inttype;
2058 struct lp_build_context intbld;
2059 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2060 LLVMValueRef trunc, res, anosign, mask;
2061 LLVMTypeRef int_vec_type = bld->int_vec_type;
2062 LLVMTypeRef vec_type = bld->vec_type;
2063
2064 if (type.width != 32) {
2065 char intrinsic[32];
2066 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2067 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2068 }
2069
2070 assert(type.width == 32); /* might want to handle doubles at some point */
2071
2072 inttype = type;
2073 inttype.floating = 0;
2074 lp_build_context_init(&intbld, bld->gallivm, inttype);
2075
2076 /* round by truncation */
2077 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2078 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2079
2080 if (type.sign) {
2081 LLVMValueRef tmp;
2082
2083 /*
2084 * fix values if rounding is wrong (for non-special cases)
2085 * - this is the case if trunc > a
2086 */
2087 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2088 /* tmp = trunc > a ? 1.0 : 0.0 */
2089 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2090 tmp = lp_build_and(&intbld, mask, tmp);
2091 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2092 res = lp_build_sub(bld, res, tmp);
2093 }
2094
2095 /* mask out sign bit */
2096 anosign = lp_build_abs(bld, a);
2097 /*
2098 * mask out all values if anosign > 2^24
2099 * This should work both for large ints (all rounding is no-op for them
2100 * because such floats are always exact) as well as special cases like
2101 * NaNs, Infs (taking advantage of the fact they use max exponent).
2102        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2103 */
2104 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2105 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2106 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2107 return lp_build_select(bld, mask, a, res);
2108 }
2109 }
2110
2111
2112 /**
2113 * Return ceiling of float (vector), returning float (vector).
2114 * Ex: ceil( 1.1) = 2.0
2115 * Ex: ceil(-1.1) = -1.0
2116 */
2117 LLVMValueRef
2118 lp_build_ceil(struct lp_build_context *bld,
2119 LLVMValueRef a)
2120 {
2121 LLVMBuilderRef builder = bld->gallivm->builder;
2122 const struct lp_type type = bld->type;
2123
2124 assert(type.floating);
2125 assert(lp_check_value(type, a));
2126
2127 if (arch_rounding_available(type)) {
2128 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2129 }
2130 else {
2131 const struct lp_type type = bld->type;
2132 struct lp_type inttype;
2133 struct lp_build_context intbld;
2134 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2135 LLVMValueRef trunc, res, anosign, mask, tmp;
2136 LLVMTypeRef int_vec_type = bld->int_vec_type;
2137 LLVMTypeRef vec_type = bld->vec_type;
2138
2139 if (type.width != 32) {
2140 char intrinsic[32];
2141 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2142 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2143 }
2144
2145 assert(type.width == 32); /* might want to handle doubles at some point */
2146
2147 inttype = type;
2148 inttype.floating = 0;
2149 lp_build_context_init(&intbld, bld->gallivm, inttype);
2150
2151 /* round by truncation */
2152 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2153 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2154
2155 /*
2156 * fix values if rounding is wrong (for non-special cases)
2157 * - this is the case if trunc < a
2158 */
2159 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2160 /* tmp = trunc < a ? 1.0 : 0.0 */
2161 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2162 tmp = lp_build_and(&intbld, mask, tmp);
2163 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2164 res = lp_build_add(bld, trunc, tmp);
2165
2166 /* mask out sign bit */
2167 anosign = lp_build_abs(bld, a);
2168 /*
2169 * mask out all values if anosign > 2^24
2170 * This should work both for large ints (all rounding is no-op for them
2171 * because such floats are always exact) as well as special cases like
2172 * NaNs, Infs (taking advantage of the fact they use max exponent).
2173        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2174 */
2175 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2176 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2177 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2178 return lp_build_select(bld, mask, a, res);
2179 }
2180 }
2181
2182
2183 /**
2184 * Return fractional part of 'a' computed as a - floor(a)
2185 * Typically used in texture coord arithmetic.
2186 */
2187 LLVMValueRef
2188 lp_build_fract(struct lp_build_context *bld,
2189 LLVMValueRef a)
2190 {
2191 assert(bld->type.floating);
2192 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2193 }
2194
2195
2196 /**
2197 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2198 * against 0.99999(9). (Will also return that value for NaNs.)
2199 */
2200 static inline LLVMValueRef
2201 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2202 {
2203 LLVMValueRef max;
2204
2205 /* this is the largest number smaller than 1.0 representable as float */
2206 max = lp_build_const_vec(bld->gallivm, bld->type,
2207 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
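   /*
    * E.g. for 32-bit floats lp_mantissa() is 23, so max = 1.0 - 2^-24 =
    * 0.99999994..., the largest float strictly below 1.0.
    */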
2208 return lp_build_min_ext(bld, fract, max,
2209 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2210 }
2211
2212
2213 /**
2214 * Same as lp_build_fract, but guarantees that the result is always smaller
2215 * than one. Will also return the smaller-than-one value for infs, NaNs.
2216 */
2217 LLVMValueRef
2218 lp_build_fract_safe(struct lp_build_context *bld,
2219 LLVMValueRef a)
2220 {
2221 return clamp_fract(bld, lp_build_fract(bld, a));
2222 }
2223
2224
2225 /**
2226 * Return the integer part of a float (vector) value (== round toward zero).
2227 * The returned value is an integer (vector).
2228 * Ex: itrunc(-1.5) = -1
2229 */
2230 LLVMValueRef
2231 lp_build_itrunc(struct lp_build_context *bld,
2232 LLVMValueRef a)
2233 {
2234 LLVMBuilderRef builder = bld->gallivm->builder;
2235 const struct lp_type type = bld->type;
2236 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2237
2238 assert(type.floating);
2239 assert(lp_check_value(type, a));
2240
2241 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2242 }
2243
2244
2245 /**
2246 * Return float (vector) rounded to nearest integer (vector). The returned
2247 * value is an integer (vector).
2248 * Ex: iround(0.9) = 1
2249 * Ex: iround(-1.5) = -2
2250 */
2251 LLVMValueRef
2252 lp_build_iround(struct lp_build_context *bld,
2253 LLVMValueRef a)
2254 {
2255 LLVMBuilderRef builder = bld->gallivm->builder;
2256 const struct lp_type type = bld->type;
2257 LLVMTypeRef int_vec_type = bld->int_vec_type;
2258 LLVMValueRef res;
2259
2260 assert(type.floating);
2261
2262 assert(lp_check_value(type, a));
2263
2264 if ((util_cpu_caps.has_sse2 &&
2265 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2266 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2267 return lp_build_iround_nearest_sse2(bld, a);
2268 }
2269 if (arch_rounding_available(type)) {
2270 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2271 }
2272 else {
2273 LLVMValueRef half;
2274
2275 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2276
2277 if (type.sign) {
2278 LLVMTypeRef vec_type = bld->vec_type;
2279 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2280 (unsigned long long)1 << (type.width - 1));
2281 LLVMValueRef sign;
2282
2283 /* get sign bit */
2284 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2285 sign = LLVMBuildAnd(builder, sign, mask, "");
2286
2287 /* sign * 0.5 */
2288 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2289 half = LLVMBuildOr(builder, sign, half, "");
2290 half = LLVMBuildBitCast(builder, half, vec_type, "");
2291 }
2292
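      /*
       * Add +/-0.5 matching the sign of 'a' and truncate, i.e. round half away
       * from zero.  Worked example: a = 2.3 -> 2.8 -> 2; a = -1.5 -> -2.0 -> -2.
       */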
2293 res = LLVMBuildFAdd(builder, a, half, "");
2294 }
2295
2296 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2297
2298 return res;
2299 }
2300
2301
2302 /**
2303 * Return floor of float (vector), result is an int (vector)
2304  * Ex: ifloor(1.1) = 1
2305  * Ex: ifloor(-1.1) = -2
2306 */
2307 LLVMValueRef
2308 lp_build_ifloor(struct lp_build_context *bld,
2309 LLVMValueRef a)
2310 {
2311 LLVMBuilderRef builder = bld->gallivm->builder;
2312 const struct lp_type type = bld->type;
2313 LLVMTypeRef int_vec_type = bld->int_vec_type;
2314 LLVMValueRef res;
2315
2316 assert(type.floating);
2317 assert(lp_check_value(type, a));
2318
2319 res = a;
2320 if (type.sign) {
2321 if (arch_rounding_available(type)) {
2322 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2323 }
2324 else {
2325 struct lp_type inttype;
2326 struct lp_build_context intbld;
2327 LLVMValueRef trunc, itrunc, mask;
2328
2329 assert(type.floating);
2330 assert(lp_check_value(type, a));
2331
2332 inttype = type;
2333 inttype.floating = 0;
2334 lp_build_context_init(&intbld, bld->gallivm, inttype);
2335
2336 /* round by truncation */
2337 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2338 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2339
2340 /*
2341 * fix values if rounding is wrong (for non-special cases)
2342 * - this is the case if trunc > a
2343 * The results of doing this with NaNs, very large values etc.
2344 * are undefined but this seems to be the case anyway.
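          * Worked example: a = -1.1 -> itrunc = -1, trunc = -1.0 > a, so the
          * all-ones mask (-1) gets added below, giving -2 == ifloor(-1.1).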
2345 */
2346 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2347 /* cheapie minus one with mask since the mask is minus one / zero */
2348 return lp_build_add(&intbld, itrunc, mask);
2349 }
2350 }
2351
2352    /* round towards zero (truncate) */
2353 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2354
2355 return res;
2356 }
2357
2358
2359 /**
2360 * Return ceiling of float (vector), returning int (vector).
2361 * Ex: iceil( 1.1) = 2
2362 * Ex: iceil(-1.1) = -1
2363 */
2364 LLVMValueRef
2365 lp_build_iceil(struct lp_build_context *bld,
2366 LLVMValueRef a)
2367 {
2368 LLVMBuilderRef builder = bld->gallivm->builder;
2369 const struct lp_type type = bld->type;
2370 LLVMTypeRef int_vec_type = bld->int_vec_type;
2371 LLVMValueRef res;
2372
2373 assert(type.floating);
2374 assert(lp_check_value(type, a));
2375
2376 if (arch_rounding_available(type)) {
2377 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2378 }
2379 else {
2380 struct lp_type inttype;
2381 struct lp_build_context intbld;
2382 LLVMValueRef trunc, itrunc, mask;
2383
2384 assert(type.floating);
2385 assert(lp_check_value(type, a));
2386
2387 inttype = type;
2388 inttype.floating = 0;
2389 lp_build_context_init(&intbld, bld->gallivm, inttype);
2390
2391 /* round by truncation */
2392 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2393 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2394
2395 /*
2396 * fix values if rounding is wrong (for non-special cases)
2397 * - this is the case if trunc < a
2398 * The results of doing this with NaNs, very large values etc.
2399 * are undefined but this seems to be the case anyway.
2400 */
2401 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2402 /* cheapie plus one with mask since the mask is minus one / zero */
2403 return lp_build_sub(&intbld, itrunc, mask);
2404 }
2405
2406    /* round towards zero (truncate) */
2407 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2408
2409 return res;
2410 }
2411
2412
2413 /**
2414 * Combined ifloor() & fract().
2415 *
2416 * Preferred to calling the functions separately, as it will ensure that the
2417 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2418 */
2419 void
2420 lp_build_ifloor_fract(struct lp_build_context *bld,
2421 LLVMValueRef a,
2422 LLVMValueRef *out_ipart,
2423 LLVMValueRef *out_fpart)
2424 {
2425 LLVMBuilderRef builder = bld->gallivm->builder;
2426 const struct lp_type type = bld->type;
2427 LLVMValueRef ipart;
2428
2429 assert(type.floating);
2430 assert(lp_check_value(type, a));
2431
2432 if (arch_rounding_available(type)) {
2433 /*
2434 * floor() is easier.
2435 */
2436
2437 ipart = lp_build_floor(bld, a);
2438 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2439 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2440 }
2441 else {
2442 /*
2443 * ifloor() is easier.
2444 */
2445
2446 *out_ipart = lp_build_ifloor(bld, a);
2447 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2448 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2449 }
2450 }
2451
2452
2453 /**
2454 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2455 * always smaller than one.
2456 */
2457 void
2458 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2459 LLVMValueRef a,
2460 LLVMValueRef *out_ipart,
2461 LLVMValueRef *out_fpart)
2462 {
2463 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2464 *out_fpart = clamp_fract(bld, *out_fpart);
2465 }
2466
2467
2468 LLVMValueRef
2469 lp_build_sqrt(struct lp_build_context *bld,
2470 LLVMValueRef a)
2471 {
2472 LLVMBuilderRef builder = bld->gallivm->builder;
2473 const struct lp_type type = bld->type;
2474 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2475 char intrinsic[32];
2476
2477 assert(lp_check_value(type, a));
2478
2479 assert(type.floating);
2480 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2481
2482 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2483 }
2484
2485
2486 /**
2487  * Do one Newton-Raphson step to improve reciprocal precision:
2488 *
2489 * x_{i+1} = x_i * (2 - a * x_i)
2490 *
2491  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2492  * +/-Inf, giving NaN instead.  Certain applications rely on the conformant
2493  * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
2494  * the Earth's halo.  It would be necessary to clamp the argument to prevent this.
2495 *
2496 * See also:
2497 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2498 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
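 *
 * Worked example (illustrative numbers only): with a = 4.0 and a seed
 * x_0 = 0.2, one step gives x_1 = 0.2 * (2 - 4*0.2) = 0.24 and a second
 * gives x_2 = 0.24 * (2 - 4*0.24) = 0.2496, converging quadratically
 * towards 1/4 = 0.25.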
2499 */
2500 static inline LLVMValueRef
2501 lp_build_rcp_refine(struct lp_build_context *bld,
2502 LLVMValueRef a,
2503 LLVMValueRef rcp_a)
2504 {
2505 LLVMBuilderRef builder = bld->gallivm->builder;
2506 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2507 LLVMValueRef res;
2508
2509 res = LLVMBuildFMul(builder, a, rcp_a, "");
2510 res = LLVMBuildFSub(builder, two, res, "");
2511 res = LLVMBuildFMul(builder, rcp_a, res, "");
2512
2513 return res;
2514 }
2515
2516
2517 LLVMValueRef
2518 lp_build_rcp(struct lp_build_context *bld,
2519 LLVMValueRef a)
2520 {
2521 LLVMBuilderRef builder = bld->gallivm->builder;
2522 const struct lp_type type = bld->type;
2523
2524 assert(lp_check_value(type, a));
2525
2526 if(a == bld->zero)
2527 return bld->undef;
2528 if(a == bld->one)
2529 return bld->one;
2530 if(a == bld->undef)
2531 return bld->undef;
2532
2533 assert(type.floating);
2534
2535 if(LLVMIsConstant(a))
2536 return LLVMConstFDiv(bld->one, a);
2537
2538 /*
2539 * We don't use RCPPS because:
2540     * - it only has 10 bits of precision
2541     * - it doesn't even get the reciprocal of 1.0 exactly
2542     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2543     * - for recent processors the benefit over DIVPS is marginal and case
2544     *   dependent
2545 *
2546 * We could still use it on certain processors if benchmarks show that the
2547     * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
2548     * particular uses that require fewer workarounds.
2549 */
2550
2551 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2552 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2553 const unsigned num_iterations = 0;
2554 LLVMValueRef res;
2555 unsigned i;
2556 const char *intrinsic = NULL;
2557
2558 if (type.length == 4) {
2559 intrinsic = "llvm.x86.sse.rcp.ps";
2560 }
2561 else {
2562 intrinsic = "llvm.x86.avx.rcp.ps.256";
2563 }
2564
2565 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2566
2567 for (i = 0; i < num_iterations; ++i) {
2568 res = lp_build_rcp_refine(bld, a, res);
2569 }
2570
2571 return res;
2572 }
2573
2574 return LLVMBuildFDiv(builder, bld->one, a, "");
2575 }
2576
2577
2578 /**
2579 * Do one Newton-Raphson step to improve rsqrt precision:
2580 *
2581 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2582 *
2583 * See also Intel 64 and IA-32 Architectures Optimization Manual.
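 *
 * Worked example (illustrative numbers only): with a = 4.0 and a seed
 * x_0 = 0.4, one step gives x_1 = 0.5 * 0.4 * (3 - 4*0.16) = 0.472 and a
 * second gives x_2 ~= 0.4977, converging towards 1/sqrt(4) = 0.5.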
2584 */
2585 static inline LLVMValueRef
2586 lp_build_rsqrt_refine(struct lp_build_context *bld,
2587 LLVMValueRef a,
2588 LLVMValueRef rsqrt_a)
2589 {
2590 LLVMBuilderRef builder = bld->gallivm->builder;
2591 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2592 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2593 LLVMValueRef res;
2594
2595 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2596 res = LLVMBuildFMul(builder, a, res, "");
2597 res = LLVMBuildFSub(builder, three, res, "");
2598 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2599 res = LLVMBuildFMul(builder, half, res, "");
2600
2601 return res;
2602 }
2603
2604
2605 /**
2606 * Generate 1/sqrt(a).
2607 * Result is undefined for values < 0, infinity for +0.
2608 */
2609 LLVMValueRef
2610 lp_build_rsqrt(struct lp_build_context *bld,
2611 LLVMValueRef a)
2612 {
2613 const struct lp_type type = bld->type;
2614
2615 assert(lp_check_value(type, a));
2616
2617 assert(type.floating);
2618
2619 /*
2620 * This should be faster but all denormals will end up as infinity.
2621 */
2622 if (0 && lp_build_fast_rsqrt_available(type)) {
2623 const unsigned num_iterations = 1;
2624 LLVMValueRef res;
2625 unsigned i;
2626
2627 /* rsqrt(1.0) != 1.0 here */
2628 res = lp_build_fast_rsqrt(bld, a);
2629
2630 if (num_iterations) {
2631 /*
2632 * Newton-Raphson will result in NaN instead of infinity for zero,
2633 * and NaN instead of zero for infinity.
2634 * Also, need to ensure rsqrt(1.0) == 1.0.
2635 * All numbers smaller than FLT_MIN will result in +infinity
2636 * (rsqrtps treats all denormals as zero).
2637 */
2638 LLVMValueRef cmp;
2639 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2640 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2641
2642 for (i = 0; i < num_iterations; ++i) {
2643 res = lp_build_rsqrt_refine(bld, a, res);
2644 }
2645 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2646 res = lp_build_select(bld, cmp, inf, res);
2647 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2648 res = lp_build_select(bld, cmp, bld->zero, res);
2649 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2650 res = lp_build_select(bld, cmp, bld->one, res);
2651 }
2652
2653 return res;
2654 }
2655
2656 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2657 }
2658
2659 /**
2660  * Returns whether a fast (but inaccurate) rsqrt instruction is available.
2661  * Callers may want to avoid lp_build_fast_rsqrt() when it isn't: e.g.
2662  * x^0.5 can be computed as rsqrt_fast(x) * x, but if no fast rsqrt exists
2663  * that falls back to sqrt/div/mul, so it is obviously better to just call
2664  * sqrt directly, skipping both the div and the mul.
2665 */
2666 boolean
2667 lp_build_fast_rsqrt_available(struct lp_type type)
2668 {
2669 assert(type.floating);
2670
2671 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2672 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2673 return true;
2674 }
2675 return false;
2676 }
2677
2678
2679 /**
2680 * Generate 1/sqrt(a).
2681 * Result is undefined for values < 0, infinity for +0.
2682 * Precision is limited, only ~10 bits guaranteed
2683 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2684 */
2685 LLVMValueRef
2686 lp_build_fast_rsqrt(struct lp_build_context *bld,
2687 LLVMValueRef a)
2688 {
2689 LLVMBuilderRef builder = bld->gallivm->builder;
2690 const struct lp_type type = bld->type;
2691
2692 assert(lp_check_value(type, a));
2693
2694 if (lp_build_fast_rsqrt_available(type)) {
2695 const char *intrinsic = NULL;
2696
2697 if (type.length == 4) {
2698 intrinsic = "llvm.x86.sse.rsqrt.ps";
2699 }
2700 else {
2701 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2702 }
2703 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2704 }
2705 else {
2706 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2707 }
2708 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2709 }
2710
2711
2712 /**
2713 * Generate sin(a) or cos(a) using polynomial approximation.
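 *
 * The argument is range-reduced to roughly [-Pi/4, Pi/4] by subtracting
 * multiples of Pi/2 with extended-precision constants, one of two minimax
 * polynomials is evaluated on the reduced value, and the sign is patched up
 * from the quadrant bits; see the step-by-step comments in the body.
 *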
2714  * TODO: it might be worth recognizing sin and cos with the same source
2715  * (i.e. the d3d10 sincos opcode); doing both at the same time would be way
2716  * cheaper than calculating (nearly) everything twice.  Not sure it's common
2717  * enough to be worth bothering, however; the scs opcode could also benefit
2718  * from calculating both.
2719 */
2720 static LLVMValueRef
2721 lp_build_sin_or_cos(struct lp_build_context *bld,
2722 LLVMValueRef a,
2723 boolean cos)
2724 {
2725 struct gallivm_state *gallivm = bld->gallivm;
2726 LLVMBuilderRef b = gallivm->builder;
2727 struct lp_type int_type = lp_int_type(bld->type);
2728
2729 /*
2730 * take the absolute value,
2731 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2732 */
2733
2734 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2735 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2736
2737 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2738 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2739
2740 /*
2741 * scale by 4/Pi
2742 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2743 */
2744
2745 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2746 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2747
2748 /*
2749 * store the integer part of y in mm0
2750 * emm2 = _mm_cvttps_epi32(y);
2751 */
2752
2753 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2754
2755 /*
2756 * j=(j+1) & (~1) (see the cephes sources)
2757 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2758 */
2759
2760 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2761 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2762 /*
2763 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2764 */
2765 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2766 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2767
2768 /*
2769 * y = _mm_cvtepi32_ps(emm2);
2770 */
2771 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2772
2773 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2774 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2775 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2776 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2777
2778 /*
2779 * Argument used for poly selection and sign bit determination
2780 * is different for sin vs. cos.
2781 */
2782 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2783 emm2_and;
2784
2785 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2786 LLVMBuildNot(b, emm2_2, ""), ""),
2787 const_29, "sign_bit") :
2788 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2789 LLVMBuildShl(b, emm2_add,
2790 const_29, ""), ""),
2791 sign_mask, "sign_bit");
2792
2793 /*
2794     * get the polynomial selection mask
2795     * there is one polynomial for 0 <= x <= Pi/4
2796     * and another one for Pi/4 < x <= Pi/2
2797 * Both branches will be computed.
2798 *
2799 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2800 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2801 */
2802
2803 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2804 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2805 int_type, PIPE_FUNC_EQUAL,
2806 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2807
2808 /*
2809 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2810 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2811 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2812 */
2813 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2814 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2815 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2816
2817 /*
2818 * The magic pass: "Extended precision modular arithmetic"
2819 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2820 */
2821 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2822 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2823 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2824
2825 /*
2826     * Evaluate the first polynomial (0 <= x <= Pi/4)
2827 *
2828 * z = _mm_mul_ps(x,x);
2829 */
2830 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2831
2832 /*
2833 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2834 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2835 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2836 */
2837 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2838 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2839 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2840
2841 /*
2842 * y = *(v4sf*)_ps_coscof_p0;
2843 * y = _mm_mul_ps(y, z);
2844 */
2845 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2846 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2847 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2848 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2849
2850
2851 /*
2852 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2853 * y = _mm_sub_ps(y, tmp);
2854 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2855 */
2856 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2857 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2858    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2859 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2860    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2861
2862 /*
2863 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2864 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2865 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2866 */
2867 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2868 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2869 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2870
2871 /*
2872     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2873 *
2874 * y2 = *(v4sf*)_ps_sincof_p0;
2875 * y2 = _mm_mul_ps(y2, z);
2876 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2877 * y2 = _mm_mul_ps(y2, z);
2878 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2879 * y2 = _mm_mul_ps(y2, z);
2880 * y2 = _mm_mul_ps(y2, x);
2881 * y2 = _mm_add_ps(y2, x);
2882 */
2883
2884 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2885 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2886 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2887 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2888
2889 /*
2890     * select the correct result from the two polynomials
2891 * xmm3 = poly_mask;
2892 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2893 * y = _mm_andnot_ps(xmm3, y);
2894 * y = _mm_or_ps(y,y2);
2895 */
2896 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2897 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2898 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2899 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2900 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2901 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2902
2903 /*
2904 * update the sign
2905 * y = _mm_xor_ps(y, sign_bit);
2906 */
2907 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2908 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2909
2910 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2911
2912 /* clamp output to be within [-1, 1] */
2913 y_result = lp_build_clamp(bld, y_result,
2914 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2915 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2916 /* If a is -inf, inf or NaN then return NaN */
2917 y_result = lp_build_select(bld, isfinite, y_result,
2918 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2919 return y_result;
2920 }
2921
2922
2923 /**
2924 * Generate sin(a)
2925 */
2926 LLVMValueRef
2927 lp_build_sin(struct lp_build_context *bld,
2928 LLVMValueRef a)
2929 {
2930 return lp_build_sin_or_cos(bld, a, FALSE);
2931 }
2932
2933
2934 /**
2935 * Generate cos(a)
2936 */
2937 LLVMValueRef
2938 lp_build_cos(struct lp_build_context *bld,
2939 LLVMValueRef a)
2940 {
2941 return lp_build_sin_or_cos(bld, a, TRUE);
2942 }
2943
2944
2945 /**
2946 * Generate pow(x, y)
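 * Computed as exp2(y * log2(x)); e.g. pow(2.0, 10.0) = exp2(10.0 * 1.0) = 1024.0.
 * Note this inherits lp_build_log2()'s undefined behaviour for x <= 0.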
2947 */
2948 LLVMValueRef
2949 lp_build_pow(struct lp_build_context *bld,
2950 LLVMValueRef x,
2951 LLVMValueRef y)
2952 {
2953 /* TODO: optimize the constant case */
2954 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2955 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2956 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2957 __FUNCTION__);
2958 }
2959
2960 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2961 }
2962
2963
2964 /**
2965 * Generate exp(x)
2966 */
2967 LLVMValueRef
2968 lp_build_exp(struct lp_build_context *bld,
2969 LLVMValueRef x)
2970 {
2971 /* log2(e) = 1/log(2) */
2972 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2973 1.4426950408889634);
2974
2975 assert(lp_check_value(bld->type, x));
2976
2977 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2978 }
2979
2980
2981 /**
2982 * Generate log(x)
2983 * Behavior is undefined with infs, 0s and nans
2984 */
2985 LLVMValueRef
2986 lp_build_log(struct lp_build_context *bld,
2987 LLVMValueRef x)
2988 {
2989 /* log(2) */
2990 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2991 0.69314718055994529);
2992
2993 assert(lp_check_value(bld->type, x));
2994
2995 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2996 }
2997
2998 /**
2999 * Generate log(x) that handles edge cases (infs, 0s and nans)
3000 */
3001 LLVMValueRef
3002 lp_build_log_safe(struct lp_build_context *bld,
3003 LLVMValueRef x)
3004 {
3005 /* log(2) */
3006 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3007 0.69314718055994529);
3008
3009 assert(lp_check_value(bld->type, x));
3010
3011 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3012 }
3013
3014
3015 /**
3016 * Generate polynomial.
3017 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3018 */
3019 LLVMValueRef
3020 lp_build_polynomial(struct lp_build_context *bld,
3021 LLVMValueRef x,
3022 const double *coeffs,
3023 unsigned num_coeffs)
3024 {
3025 const struct lp_type type = bld->type;
3026 LLVMValueRef even = NULL, odd = NULL;
3027 LLVMValueRef x2;
3028 unsigned i;
3029
3030 assert(lp_check_value(bld->type, x));
3031
3032 /* TODO: optimize the constant case */
3033 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3034 LLVMIsConstant(x)) {
3035 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3036 __FUNCTION__);
3037 }
3038
3039 /*
3040     * Calculate odd and even terms separately to decrease data dependency
3041 * Ex:
3042 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3043 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3044 */
3045 x2 = lp_build_mul(bld, x, x);
3046
3047 for (i = num_coeffs; i--; ) {
3048 LLVMValueRef coeff;
3049
3050 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3051
3052 if (i % 2 == 0) {
3053 if (even)
3054 even = lp_build_mad(bld, x2, even, coeff);
3055 else
3056 even = coeff;
3057 } else {
3058 if (odd)
3059 odd = lp_build_mad(bld, x2, odd, coeff);
3060 else
3061 odd = coeff;
3062 }
3063 }
3064
3065 if (odd)
3066 return lp_build_mad(bld, odd, x, even);
3067 else if (even)
3068 return even;
3069 else
3070 return bld->undef;
3071 }
3072
3073
3074 /**
3075 * Minimax polynomial fit of 2**x, in range [0, 1[
3076 */
3077 const double lp_build_exp2_polynomial[] = {
3078 #if EXP_POLY_DEGREE == 5
3079 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3080 0.693153073200168932794,
3081 0.240153617044375388211,
3082 0.0558263180532956664775,
3083 0.00898934009049466391101,
3084 0.00187757667519147912699
3085 #elif EXP_POLY_DEGREE == 4
3086 1.00000259337069434683,
3087 0.693003834469974940458,
3088 0.24144275689150793076,
3089 0.0520114606103070150235,
3090 0.0135341679161270268764
3091 #elif EXP_POLY_DEGREE == 3
3092 0.999925218562710312959,
3093 0.695833540494823811697,
3094 0.226067155427249155588,
3095 0.0780245226406372992967
3096 #elif EXP_POLY_DEGREE == 2
3097 1.00172476321474503578,
3098 0.657636275736077639316,
3099 0.33718943461968720704
3100 #else
3101 #error
3102 #endif
3103 };
3104
3105
3106 LLVMValueRef
3107 lp_build_exp2(struct lp_build_context *bld,
3108 LLVMValueRef x)
3109 {
3110 LLVMBuilderRef builder = bld->gallivm->builder;
3111 const struct lp_type type = bld->type;
3112 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3113 LLVMValueRef ipart = NULL;
3114 LLVMValueRef fpart = NULL;
3115 LLVMValueRef expipart = NULL;
3116 LLVMValueRef expfpart = NULL;
3117 LLVMValueRef res = NULL;
3118
3119 assert(lp_check_value(bld->type, x));
3120
3121 /* TODO: optimize the constant case */
3122 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3123 LLVMIsConstant(x)) {
3124 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3125 __FUNCTION__);
3126 }
3127
3128 assert(type.floating && type.width == 32);
3129
3130    /* We want to preserve NaN, and make sure that for exp2 if x > 128
3131     * the result is INF and if x is smaller than -126.9 the result is 0 */
3132 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3133 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3134 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3135 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3136
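   /*
    * Split x so that 2^x = 2^ipart * 2^fpart with integral ipart and fpart
    * in [0, 1): 2^ipart is assembled by writing (ipart + 127) into the
    * exponent field of a float, 2^fpart is approximated by the polynomial.
    * Worked example: x = 3.5 -> ipart = 3, fpart = 0.5, expipart = 8.0,
    * expfpart ~= 1.41421, so the result is ~11.3137 (= 2^3.5).
    */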
3137 /* ipart = floor(x) */
3138 /* fpart = x - ipart */
3139 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3140
3141 /* expipart = (float) (1 << ipart) */
3142 expipart = LLVMBuildAdd(builder, ipart,
3143 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3144 expipart = LLVMBuildShl(builder, expipart,
3145 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3146 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3147
3148 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3149 ARRAY_SIZE(lp_build_exp2_polynomial));
3150
3151 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3152
3153 return res;
3154 }
3155
3156
3157
3158 /**
3159  * Extract the exponent of an IEEE-754 floating point value.
3160 *
3161 * Optionally apply an integer bias.
3162 *
3163 * Result is an integer value with
3164 *
3165 * ifloor(log2(x)) + bias
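 *
 * Worked example: x = 6.0f is 0x40C00000, whose biased exponent field is
 * 0x81 = 129, so with bias = 0 the result is 129 - 127 = 2 = ifloor(log2(6)).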
3166 */
3167 LLVMValueRef
3168 lp_build_extract_exponent(struct lp_build_context *bld,
3169 LLVMValueRef x,
3170 int bias)
3171 {
3172 LLVMBuilderRef builder = bld->gallivm->builder;
3173 const struct lp_type type = bld->type;
3174 unsigned mantissa = lp_mantissa(type);
3175 LLVMValueRef res;
3176
3177 assert(type.floating);
3178
3179 assert(lp_check_value(bld->type, x));
3180
3181 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3182
3183 res = LLVMBuildLShr(builder, x,
3184 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3185 res = LLVMBuildAnd(builder, res,
3186 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3187 res = LLVMBuildSub(builder, res,
3188 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3189
3190 return res;
3191 }
3192
3193
3194 /**
3195  * Extract the mantissa of a floating point value.
3196 *
3197 * Result is a floating point value with
3198 *
3199  *   x / 2**floor(log2(x))
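 *
 * Worked example: x = 6.0 -> 6.0 / 2^2 = 1.5 (the result is always in [1, 2)).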
3200 */
3201 LLVMValueRef
3202 lp_build_extract_mantissa(struct lp_build_context *bld,
3203 LLVMValueRef x)
3204 {
3205 LLVMBuilderRef builder = bld->gallivm->builder;
3206 const struct lp_type type = bld->type;
3207 unsigned mantissa = lp_mantissa(type);
3208 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3209 (1ULL << mantissa) - 1);
3210 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3211 LLVMValueRef res;
3212
3213 assert(lp_check_value(bld->type, x));
3214
3215 assert(type.floating);
3216
3217 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3218
3219 /* res = x / 2**ipart */
3220 res = LLVMBuildAnd(builder, x, mantmask, "");
3221 res = LLVMBuildOr(builder, res, one, "");
3222 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3223
3224 return res;
3225 }
3226
3227
3228
3229 /**
3230  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3231  * These coefficients can be generated with
3232 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3233 */
3234 const double lp_build_log2_polynomial[] = {
3235 #if LOG_POLY_DEGREE == 5
3236 2.88539008148777786488L,
3237 0.961796878841293367824L,
3238 0.577058946784739859012L,
3239 0.412914355135828735411L,
3240 0.308591899232910175289L,
3241 0.352376952300281371868L,
3242 #elif LOG_POLY_DEGREE == 4
3243 2.88539009343309178325L,
3244 0.961791550404184197881L,
3245 0.577440339438736392009L,
3246 0.403343858251329912514L,
3247 0.406718052498846252698L,
3248 #elif LOG_POLY_DEGREE == 3
3249 2.88538959748872753838L,
3250 0.961932915889597772928L,
3251 0.571118517972136195241L,
3252 0.493997535084709500285L,
3253 #else
3254 #error
3255 #endif
3256 };
3257
3258 /**
3259 * See http://www.devmaster.net/forums/showthread.php?p=43580
3260 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3261 * http://www.nezumi.demon.co.uk/consult/logx.htm
3262 *
3263 * If handle_edge_cases is true the function will perform computations
3264 * to match the required D3D10+ behavior for each of the edge cases.
3265 * That means that if input is:
3266  * - less than zero (down to and including -inf) then NaN will be returned
3267 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3268 * - +infinity, then +infinity will be returned
3269 * - NaN, then NaN will be returned
3270 *
3271 * Those checks are fairly expensive so if you don't need them make sure
3272 * handle_edge_cases is false.
3273 */
3274 void
3275 lp_build_log2_approx(struct lp_build_context *bld,
3276 LLVMValueRef x,
3277 LLVMValueRef *p_exp,
3278 LLVMValueRef *p_floor_log2,
3279 LLVMValueRef *p_log2,
3280 boolean handle_edge_cases)
3281 {
3282 LLVMBuilderRef builder = bld->gallivm->builder;
3283 const struct lp_type type = bld->type;
3284 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3285 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3286
3287 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3288 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3289 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3290
3291 LLVMValueRef i = NULL;
3292 LLVMValueRef y = NULL;
3293 LLVMValueRef z = NULL;
3294 LLVMValueRef exp = NULL;
3295 LLVMValueRef mant = NULL;
3296 LLVMValueRef logexp = NULL;
3297 LLVMValueRef p_z = NULL;
3298 LLVMValueRef res = NULL;
3299
3300 assert(lp_check_value(bld->type, x));
3301
3302 if(p_exp || p_floor_log2 || p_log2) {
3303 /* TODO: optimize the constant case */
3304 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3305 LLVMIsConstant(x)) {
3306 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3307 __FUNCTION__);
3308 }
3309
3310 assert(type.floating && type.width == 32);
3311
3312 /*
3313 * We don't explicitly handle denormalized numbers. They will yield a
3314        * result in the neighbourhood of -127, which appears to be
3315        * adequate.
3316 */
3317
3318 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3319
3320 /* exp = (float) exponent(x) */
3321 exp = LLVMBuildAnd(builder, i, expmask, "");
3322 }
3323
3324 if(p_floor_log2 || p_log2) {
3325 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3326 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3327 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3328 }
3329
3330 if (p_log2) {
3331 /* mant = 1 + (float) mantissa(x) */
3332 mant = LLVMBuildAnd(builder, i, mantmask, "");
3333 mant = LLVMBuildOr(builder, mant, one, "");
3334 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3335
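      /*
       * log2(x) = logexp + log2(mant).  With y = (mant - 1) / (mant + 1) we
       * have mant = (1 + y) / (1 - y), hence
       * log2(mant) = (2/ln(2)) * (y + y^3/3 + y^5/5 + ...), which is what the
       * minimax polynomial P(z), z = y^2, approximates below as y * P(z).
       */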
3336 /* y = (mant - 1) / (mant + 1) */
3337 y = lp_build_div(bld,
3338 lp_build_sub(bld, mant, bld->one),
3339 lp_build_add(bld, mant, bld->one)
3340 );
3341
3342 /* z = y^2 */
3343 z = lp_build_mul(bld, y, y);
3344
3345 /* compute P(z) */
3346 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3347 ARRAY_SIZE(lp_build_log2_polynomial));
3348
3349 /* y * P(z) + logexp */
3350 res = lp_build_mad(bld, y, p_z, logexp);
3351
3352 if (type.floating && handle_edge_cases) {
3353 LLVMValueRef negmask, infmask, zmask;
3354 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3355 lp_build_const_vec(bld->gallivm, type, 0.0f));
3356 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3357 lp_build_const_vec(bld->gallivm, type, 0.0f));
3358 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3359 lp_build_const_vec(bld->gallivm, type, INFINITY));
3360
3361          /* If x is equal to inf make sure we return inf */
3362 res = lp_build_select(bld, infmask,
3363 lp_build_const_vec(bld->gallivm, type, INFINITY),
3364 res);
3365          /* If x is equal to 0, return -inf */
3366 res = lp_build_select(bld, zmask,
3367 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3368 res);
3369 /* If x is nan or less than 0, return nan */
3370 res = lp_build_select(bld, negmask,
3371 lp_build_const_vec(bld->gallivm, type, NAN),
3372 res);
3373 }
3374 }
3375
3376 if (p_exp) {
3377 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3378 *p_exp = exp;
3379 }
3380
3381 if (p_floor_log2)
3382 *p_floor_log2 = logexp;
3383
3384 if (p_log2)
3385 *p_log2 = res;
3386 }
3387
3388
3389 /*
3390 * log2 implementation which doesn't have special code to
3391 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3392 * the results for those cases are undefined.
3393 */
3394 LLVMValueRef
3395 lp_build_log2(struct lp_build_context *bld,
3396 LLVMValueRef x)
3397 {
3398 LLVMValueRef res;
3399 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3400 return res;
3401 }
3402
3403 /*
3404 * Version of log2 which handles all edge cases.
3405 * Look at documentation of lp_build_log2_approx for
3406 * description of the behavior for each of the edge cases.
3407 */
3408 LLVMValueRef
3409 lp_build_log2_safe(struct lp_build_context *bld,
3410 LLVMValueRef x)
3411 {
3412 LLVMValueRef res;
3413 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3414 return res;
3415 }
3416
3417
3418 /**
3419 * Faster (and less accurate) log2.
3420 *
3421 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3422 *
3423 * Piece-wise linear approximation, with exact results when x is a
3424 * power of two.
3425 *
3426 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
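 *
 * Worked example: x = 8.0 gives 2 + 8/8 = 3.0 (exact at powers of two);
 * x = 6.0 gives 1 + 6/4 = 2.5 versus the true log2(6) ~= 2.585.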
3427 */
3428 LLVMValueRef
3429 lp_build_fast_log2(struct lp_build_context *bld,
3430 LLVMValueRef x)
3431 {
3432 LLVMBuilderRef builder = bld->gallivm->builder;
3433 LLVMValueRef ipart;
3434 LLVMValueRef fpart;
3435
3436 assert(lp_check_value(bld->type, x));
3437
3438 assert(bld->type.floating);
3439
3440 /* ipart = floor(log2(x)) - 1 */
3441 ipart = lp_build_extract_exponent(bld, x, -1);
3442 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3443
3444 /* fpart = x / 2**ipart */
3445 fpart = lp_build_extract_mantissa(bld, x);
3446
3447 /* ipart + fpart */
3448 return LLVMBuildFAdd(builder, ipart, fpart, "");
3449 }
3450
3451
3452 /**
3453 * Fast implementation of iround(log2(x)).
3454 *
3455 * Not an approximation -- it should give accurate results all the time.
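 *
 * Multiplying by sqrt(2) before extracting the exponent turns floor(log2(x))
 * into round(log2(x)).  Worked example: x = 6.0 -> log2(6) ~= 2.585, so
 * iround gives 3; 6.0 * sqrt(2) ~= 8.49 and floor(log2(8.49)) = 3.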
3456 */
3457 LLVMValueRef
3458 lp_build_ilog2(struct lp_build_context *bld,
3459 LLVMValueRef x)
3460 {
3461 LLVMBuilderRef builder = bld->gallivm->builder;
3462 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3463 LLVMValueRef ipart;
3464
3465 assert(bld->type.floating);
3466
3467 assert(lp_check_value(bld->type, x));
3468
3469    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3470 x = LLVMBuildFMul(builder, x, sqrt2, "");
3471
3472 /* ipart = floor(log2(x) + 0.5) */
3473 ipart = lp_build_extract_exponent(bld, x, 0);
3474
3475 return ipart;
3476 }
3477
3478 LLVMValueRef
3479 lp_build_mod(struct lp_build_context *bld,
3480 LLVMValueRef x,
3481 LLVMValueRef y)
3482 {
3483 LLVMBuilderRef builder = bld->gallivm->builder;
3484 LLVMValueRef res;
3485 const struct lp_type type = bld->type;
3486
3487 assert(lp_check_value(type, x));
3488 assert(lp_check_value(type, y));
3489
3490 if (type.floating)
3491 res = LLVMBuildFRem(builder, x, y, "");
3492 else if (type.sign)
3493 res = LLVMBuildSRem(builder, x, y, "");
3494 else
3495 res = LLVMBuildURem(builder, x, y, "");
3496 return res;
3497 }
3498
3499
3500 /*
3501 * For floating inputs it creates and returns a mask
3502 * which is all 1's for channels which are NaN.
3503 * Channels inside x which are not NaN will be 0.
3504 */
3505 LLVMValueRef
3506 lp_build_isnan(struct lp_build_context *bld,
3507 LLVMValueRef x)
3508 {
3509 LLVMValueRef mask;
3510 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3511
3512 assert(bld->type.floating);
3513 assert(lp_check_value(bld->type, x));
3514
3515 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3516 "isnotnan");
3517 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3518 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3519 return mask;
3520 }
3521
3522 /* Returns all 1's for floating point numbers that are
3523  * finite, and returns all zeros for -inf, +inf
3524  * and NaNs. */
3525 LLVMValueRef
3526 lp_build_isfinite(struct lp_build_context *bld,
3527 LLVMValueRef x)
3528 {
3529 LLVMBuilderRef builder = bld->gallivm->builder;
3530 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3531 struct lp_type int_type = lp_int_type(bld->type);
3532 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3533 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3534 0x7f800000);
3535
3536 if (!bld->type.floating) {
3537 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3538 }
3539 assert(bld->type.floating);
3540 assert(lp_check_value(bld->type, x));
3541 assert(bld->type.width == 32);
3542
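   /*
    * Inf and NaN are exactly the encodings whose exponent field is all ones,
    * so x is finite iff (x & 0x7f800000) != 0x7f800000.
    */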
3543 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3544 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3545 intx, infornan32);
3546 }
3547
3548 /*
3549 * Returns true if the number is nan or inf and false otherwise.
3550 * The input has to be a floating point vector.
3551 */
3552 LLVMValueRef
3553 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3554 const struct lp_type type,
3555 LLVMValueRef x)
3556 {
3557 LLVMBuilderRef builder = gallivm->builder;
3558 struct lp_type int_type = lp_int_type(type);
3559 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3560 0x7f800000);
3561 LLVMValueRef ret;
3562
3563 assert(type.floating);
3564
3565 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3566 ret = LLVMBuildAnd(builder, ret, const0, "");
3567 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3568 ret, const0);
3569
3570 return ret;
3571 }
3572
3573
3574 LLVMValueRef
3575 lp_build_fpstate_get(struct gallivm_state *gallivm)
3576 {
3577 if (util_cpu_caps.has_sse) {
3578 LLVMBuilderRef builder = gallivm->builder;
3579 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3580 gallivm,
3581 LLVMInt32TypeInContext(gallivm->context),
3582 "mxcsr_ptr");
3583 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3584 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3585 lp_build_intrinsic(builder,
3586 "llvm.x86.sse.stmxcsr",
3587 LLVMVoidTypeInContext(gallivm->context),
3588 &mxcsr_ptr8, 1, 0);
3589 return mxcsr_ptr;
3590 }
3591 return 0;
3592 }
3593
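/*
 * Typical usage sketch (illustrative only): save the FP state, force
 * flush-to-zero / denormals-are-zero around a denormal-sensitive section,
 * then restore it:
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that must not see denormals ...
 *    lp_build_fpstate_set(gallivm, fpstate);
 */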
3594 void
3595 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3596 boolean zero)
3597 {
3598 if (util_cpu_caps.has_sse) {
3599 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3600 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3601
3602 LLVMBuilderRef builder = gallivm->builder;
3603 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3604 LLVMValueRef mxcsr =
3605 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3606
3607 if (util_cpu_caps.has_daz) {
3608          /* Enable denormals-are-zero mode */
3609 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3610 }
3611 if (zero) {
3612 mxcsr = LLVMBuildOr(builder, mxcsr,
3613 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3614 } else {
3615 mxcsr = LLVMBuildAnd(builder, mxcsr,
3616 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3617 }
3618
3619 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3620 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3621 }
3622 }
3623
3624 void
3625 lp_build_fpstate_set(struct gallivm_state *gallivm,
3626 LLVMValueRef mxcsr_ptr)
3627 {
3628 if (util_cpu_caps.has_sse) {
3629 LLVMBuilderRef builder = gallivm->builder;
3630 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3631 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3632 lp_build_intrinsic(builder,
3633 "llvm.x86.sse.ldmxcsr",
3634 LLVMVoidTypeInContext(gallivm->context),
3635 &mxcsr_ptr, 1, 0);
3636 }
3637 }