gallivm: Workaround LLVM PR 27332.
[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31  * Helper arithmetic functions.
32  *
33  * LLVM IR doesn't support all the basic arithmetic operations we care about (most
34  * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36  * these implementation details from the other modules.
37  *
38  * We also do simple expression simplification here. The reasons are:
39  * - it is very easy, given that we have all the necessary information readily available
40  * - LLVM optimization passes fail to simplify several vector expressions
41  * - we often know value constraints which the optimization passes have no way
42  * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67 #if defined(PIPE_ARCH_SSE)
68 #include <xmmintrin.h>
69 #endif
70
71 #ifndef _MM_DENORMALS_ZERO_MASK
72 #define _MM_DENORMALS_ZERO_MASK 0x0040
73 #endif
74
75 #ifndef _MM_FLUSH_ZERO_MASK
76 #define _MM_FLUSH_ZERO_MASK 0x8000
77 #endif
78
79 #define EXP_POLY_DEGREE 5
80
81 #define LOG_POLY_DEGREE 4
82
83
84 /**
85 * Generate min(a, b)
86  * No checks are done for the special-case values a or b equal to 1 or 0.
87  * NaNs are handled according to the behavior specified by the
88 * nan_behavior argument.
89 */
90 static LLVMValueRef
91 lp_build_min_simple(struct lp_build_context *bld,
92 LLVMValueRef a,
93 LLVMValueRef b,
94 enum gallivm_nan_behavior nan_behavior)
95 {
96 const struct lp_type type = bld->type;
97 const char *intrinsic = NULL;
98 unsigned intr_size = 0;
99 LLVMValueRef cond;
100
101 assert(lp_check_value(type, a));
102 assert(lp_check_value(type, b));
103
104 /* TODO: optimize the constant case */
105
106 if (type.floating && util_cpu_caps.has_sse) {
107 if (type.width == 32) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse.min.ss";
110 intr_size = 128;
111 }
112 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse.min.ps";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.ps.256";
118 intr_size = 256;
119 }
120 }
121 if (type.width == 64 && util_cpu_caps.has_sse2) {
122 if (type.length == 1) {
123 intrinsic = "llvm.x86.sse2.min.sd";
124 intr_size = 128;
125 }
126 else if (type.length == 2 || !util_cpu_caps.has_avx) {
127 intrinsic = "llvm.x86.sse2.min.pd";
128 intr_size = 128;
129 }
130 else {
131 intrinsic = "llvm.x86.avx.min.pd.256";
132 intr_size = 256;
133 }
134 }
135 }
136 else if (type.floating && util_cpu_caps.has_altivec) {
137 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
138 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140 __FUNCTION__);
141 }
142 if (type.width == 32 && type.length == 4) {
143 intrinsic = "llvm.ppc.altivec.vminfp";
144 intr_size = 128;
145 }
146 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
147 intr_size = 128;
148 if ((type.width == 8 || type.width == 16) &&
149 (type.width * type.length <= 64) &&
150 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
151 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
152 __FUNCTION__);
153 }
154 if (type.width == 8 && !type.sign) {
155 intrinsic = "llvm.x86.sse2.pminu.b";
156 }
157 else if (type.width == 16 && type.sign) {
158 intrinsic = "llvm.x86.sse2.pmins.w";
159 }
160 if (util_cpu_caps.has_sse4_1) {
161 if (type.width == 8 && type.sign) {
162 intrinsic = "llvm.x86.sse41.pminsb";
163 }
164 if (type.width == 16 && !type.sign) {
165 intrinsic = "llvm.x86.sse41.pminuw";
166 }
167 if (type.width == 32 && !type.sign) {
168 intrinsic = "llvm.x86.sse41.pminud";
169 }
170 if (type.width == 32 && type.sign) {
171 intrinsic = "llvm.x86.sse41.pminsd";
172 }
173 }
174 } else if (util_cpu_caps.has_altivec) {
175 intr_size = 128;
176 if (type.width == 8) {
177 if (!type.sign) {
178 intrinsic = "llvm.ppc.altivec.vminub";
179 } else {
180 intrinsic = "llvm.ppc.altivec.vminsb";
181 }
182 } else if (type.width == 16) {
183 if (!type.sign) {
184 intrinsic = "llvm.ppc.altivec.vminuh";
185 } else {
186 intrinsic = "llvm.ppc.altivec.vminsh";
187 }
188 } else if (type.width == 32) {
189 if (!type.sign) {
190 intrinsic = "llvm.ppc.altivec.vminuw";
191 } else {
192 intrinsic = "llvm.ppc.altivec.vminsw";
193 }
194 }
195 }
196
197 if (intrinsic) {
198       /* We need to handle NaNs for floating point numbers. If one of the
199        * inputs is a NaN the other should be returned (required by both D3D10+
200        * and OpenCL).
201        * The SSE intrinsics return the second operand in case of a NaN by
202        * default, so we need special code to handle those cases.
203        */
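      /* Illustrative sketch of the SSE semantics relied on here (minps/maxps
       * and friends return their second operand whenever either input is NaN):
       *   min(NaN, x) -> x      min(x, NaN) -> NaN
       * hence the explicit isnan/select fixup below on top of the intrinsic
       * result.
       */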
204 if (util_cpu_caps.has_sse && type.floating &&
205 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
206 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
207 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
208 LLVMValueRef isnan, min;
209 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
210 type,
211 intr_size, a, b);
212 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
213 isnan = lp_build_isnan(bld, b);
214 return lp_build_select(bld, isnan, a, min);
215 } else {
216 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
217 isnan = lp_build_isnan(bld, a);
218 return lp_build_select(bld, isnan, a, min);
219 }
220 } else {
221 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
222 type,
223 intr_size, a, b);
224 }
225 }
226
227 if (type.floating) {
228 switch (nan_behavior) {
229 case GALLIVM_NAN_RETURN_NAN: {
230 LLVMValueRef isnan = lp_build_isnan(bld, b);
231 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
232 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
233 return lp_build_select(bld, cond, a, b);
234 }
235 break;
236 case GALLIVM_NAN_RETURN_OTHER: {
237 LLVMValueRef isnan = lp_build_isnan(bld, a);
238 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
239 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
240 return lp_build_select(bld, cond, a, b);
241 }
242 break;
243 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
244 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
245 return lp_build_select(bld, cond, a, b);
246 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
248 return lp_build_select(bld, cond, b, a);
249 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
250 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
251 return lp_build_select(bld, cond, a, b);
252 break;
253 default:
254 assert(0);
255 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
256 return lp_build_select(bld, cond, a, b);
257 }
258 } else {
259 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
260 return lp_build_select(bld, cond, a, b);
261 }
262 }
263
264
265 /**
266 * Generate max(a, b)
267  * No checks are done for the special-case values a or b equal to 1 or 0.
268  * NaNs are handled according to the behavior specified by the
269 * nan_behavior argument.
270 */
271 static LLVMValueRef
272 lp_build_max_simple(struct lp_build_context *bld,
273 LLVMValueRef a,
274 LLVMValueRef b,
275 enum gallivm_nan_behavior nan_behavior)
276 {
277 const struct lp_type type = bld->type;
278 const char *intrinsic = NULL;
279 unsigned intr_size = 0;
280 LLVMValueRef cond;
281
282 assert(lp_check_value(type, a));
283 assert(lp_check_value(type, b));
284
285 /* TODO: optimize the constant case */
286
287 if (type.floating && util_cpu_caps.has_sse) {
288 if (type.width == 32) {
289 if (type.length == 1) {
290 intrinsic = "llvm.x86.sse.max.ss";
291 intr_size = 128;
292 }
293 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
294 intrinsic = "llvm.x86.sse.max.ps";
295 intr_size = 128;
296 }
297 else {
298 intrinsic = "llvm.x86.avx.max.ps.256";
299 intr_size = 256;
300 }
301 }
302 if (type.width == 64 && util_cpu_caps.has_sse2) {
303 if (type.length == 1) {
304 intrinsic = "llvm.x86.sse2.max.sd";
305 intr_size = 128;
306 }
307 else if (type.length == 2 || !util_cpu_caps.has_avx) {
308 intrinsic = "llvm.x86.sse2.max.pd";
309 intr_size = 128;
310 }
311 else {
312 intrinsic = "llvm.x86.avx.max.pd.256";
313 intr_size = 256;
314 }
315 }
316 }
317 else if (type.floating && util_cpu_caps.has_altivec) {
318 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
319 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
320 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
321 __FUNCTION__);
322 }
323       if (type.width == 32 && type.length == 4) {
324 intrinsic = "llvm.ppc.altivec.vmaxfp";
325 intr_size = 128;
326 }
327 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
328 intr_size = 128;
329 if ((type.width == 8 || type.width == 16) &&
330 (type.width * type.length <= 64) &&
331 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
332 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
333 __FUNCTION__);
334 }
335 if (type.width == 8 && !type.sign) {
336 intrinsic = "llvm.x86.sse2.pmaxu.b";
337 intr_size = 128;
338 }
339 else if (type.width == 16 && type.sign) {
340 intrinsic = "llvm.x86.sse2.pmaxs.w";
341 }
342 if (util_cpu_caps.has_sse4_1) {
343 if (type.width == 8 && type.sign) {
344 intrinsic = "llvm.x86.sse41.pmaxsb";
345 }
346 if (type.width == 16 && !type.sign) {
347 intrinsic = "llvm.x86.sse41.pmaxuw";
348 }
349 if (type.width == 32 && !type.sign) {
350 intrinsic = "llvm.x86.sse41.pmaxud";
351 }
352 if (type.width == 32 && type.sign) {
353 intrinsic = "llvm.x86.sse41.pmaxsd";
354 }
355 }
356 } else if (util_cpu_caps.has_altivec) {
357 intr_size = 128;
358 if (type.width == 8) {
359 if (!type.sign) {
360 intrinsic = "llvm.ppc.altivec.vmaxub";
361 } else {
362 intrinsic = "llvm.ppc.altivec.vmaxsb";
363 }
364 } else if (type.width == 16) {
365 if (!type.sign) {
366 intrinsic = "llvm.ppc.altivec.vmaxuh";
367 } else {
368 intrinsic = "llvm.ppc.altivec.vmaxsh";
369 }
370 } else if (type.width == 32) {
371 if (!type.sign) {
372 intrinsic = "llvm.ppc.altivec.vmaxuw";
373 } else {
374 intrinsic = "llvm.ppc.altivec.vmaxsw";
375 }
376 }
377 }
378
379 if (intrinsic) {
380 if (util_cpu_caps.has_sse && type.floating &&
381 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
382 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
383 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
384 LLVMValueRef isnan, max;
385 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
386 type,
387 intr_size, a, b);
388 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
389 isnan = lp_build_isnan(bld, b);
390 return lp_build_select(bld, isnan, a, max);
391 } else {
392 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
393 isnan = lp_build_isnan(bld, a);
394 return lp_build_select(bld, isnan, a, max);
395 }
396 } else {
397 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
398 type,
399 intr_size, a, b);
400 }
401 }
402
403 if (type.floating) {
404 switch (nan_behavior) {
405 case GALLIVM_NAN_RETURN_NAN: {
406 LLVMValueRef isnan = lp_build_isnan(bld, b);
407 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
408 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
409 return lp_build_select(bld, cond, a, b);
410 }
411 break;
412 case GALLIVM_NAN_RETURN_OTHER: {
413 LLVMValueRef isnan = lp_build_isnan(bld, a);
414 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
415 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
416 return lp_build_select(bld, cond, a, b);
417 }
418 break;
419 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
420 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
421 return lp_build_select(bld, cond, a, b);
422 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
423 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
424 return lp_build_select(bld, cond, b, a);
425 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
426 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
427 return lp_build_select(bld, cond, a, b);
428 break;
429 default:
430 assert(0);
431 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
432 return lp_build_select(bld, cond, a, b);
433 }
434 } else {
435 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 }
438 }
439
440
441 /**
442 * Generate 1 - a, or ~a depending on bld->type.
443 */
444 LLVMValueRef
445 lp_build_comp(struct lp_build_context *bld,
446 LLVMValueRef a)
447 {
448 LLVMBuilderRef builder = bld->gallivm->builder;
449 const struct lp_type type = bld->type;
450
451 assert(lp_check_value(type, a));
452
453 if(a == bld->one)
454 return bld->zero;
455 if(a == bld->zero)
456 return bld->one;
457
458 if(type.norm && !type.floating && !type.fixed && !type.sign) {
459 if(LLVMIsConstant(a))
460 return LLVMConstNot(a);
461 else
462 return LLVMBuildNot(builder, a, "");
463 }
464
465 if(LLVMIsConstant(a))
466 if (type.floating)
467 return LLVMConstFSub(bld->one, a);
468 else
469 return LLVMConstSub(bld->one, a);
470 else
471 if (type.floating)
472 return LLVMBuildFSub(builder, bld->one, a, "");
473 else
474 return LLVMBuildSub(builder, bld->one, a, "");
475 }
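/*
 * Note on the ~a shortcut above: for an unsigned normalized type, 1.0 is
 * represented by the all-ones bit pattern, so 1 - a and ~a coincide.
 * E.g. for an 8-bit unorm value (purely illustrative): one = 0xff and
 * 0xff - a == ~a for every a in [0x00, 0xff].
 */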
476
477
478 /**
479 * Generate a + b
480 */
481 LLVMValueRef
482 lp_build_add(struct lp_build_context *bld,
483 LLVMValueRef a,
484 LLVMValueRef b)
485 {
486 LLVMBuilderRef builder = bld->gallivm->builder;
487 const struct lp_type type = bld->type;
488 LLVMValueRef res;
489
490 assert(lp_check_value(type, a));
491 assert(lp_check_value(type, b));
492
493 if(a == bld->zero)
494 return b;
495 if(b == bld->zero)
496 return a;
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(bld->type.norm) {
501 const char *intrinsic = NULL;
502
503 if(a == bld->one || b == bld->one)
504 return bld->one;
505
506 if (type.width * type.length == 128 &&
507 !type.floating && !type.fixed) {
508 if(util_cpu_caps.has_sse2) {
509 if(type.width == 8)
510 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
511 if(type.width == 16)
512 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
513 } else if (util_cpu_caps.has_altivec) {
514 if(type.width == 8)
515 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
516 if(type.width == 16)
517 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
518 }
519 }
520
521 if (intrinsic)
522 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
523 }
524
525 if(type.norm && !type.floating && !type.fixed) {
526 if (type.sign) {
527 uint64_t sign = (uint64_t)1 << (type.width - 1);
528 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
529 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
530 /* a_clamp_max is the maximum a for positive b,
531 a_clamp_min is the minimum a for negative b. */
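         /* For example (8-bit signed, illustrative values): with a = 100 and
          * b = 50, max_val = 127, so a_clamp_max = min(100, 127 - 50) = 77,
          * and the add below then yields 77 + 50 = 127, i.e. the correctly
          * saturated result.
          */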
532 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
533 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
534 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
535 } else {
536 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
537 }
538 }
539
540 if(LLVMIsConstant(a) && LLVMIsConstant(b))
541 if (type.floating)
542 res = LLVMConstFAdd(a, b);
543 else
544 res = LLVMConstAdd(a, b);
545 else
546 if (type.floating)
547 res = LLVMBuildFAdd(builder, a, b, "");
548 else
549 res = LLVMBuildAdd(builder, a, b, "");
550
551 /* clamp to ceiling of 1.0 */
552 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
553 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
554
555 /* XXX clamp to floor of -1 or 0??? */
556
557 return res;
558 }
559
560
561 /** Return the scalar sum of the elements of a.
562  * Callers should avoid this operation whenever possible.
563 */
564 LLVMValueRef
565 lp_build_horizontal_add(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 LLVMBuilderRef builder = bld->gallivm->builder;
569 const struct lp_type type = bld->type;
570 LLVMValueRef index, res;
571 unsigned i, length;
572 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
573 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
574 LLVMValueRef vecres, elem2;
575
576 assert(lp_check_value(type, a));
577
578 if (type.length == 1) {
579 return a;
580 }
581
582 assert(!bld->type.norm);
583
584 /*
585     * For byte vectors we could do much better with psadbw.
586     * Using repeated shuffle/adds here. Note that with multiple vectors
587     * this can be done more efficiently as outlined in the Intel
588     * optimization manual.
589 * Note: could cause data rearrangement if used with smaller element
590 * sizes.
591 */
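   /* Illustrative reduction for a 4-wide vector a = <a0 a1 a2 a3>:
    *   vec1 = <a0 a1>, vec2 = <a2 a3>, vecres = <a0+a2  a1+a3>
    * and the final extract/add below then yields (a0+a2) + (a1+a3).
    */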
592
593 vecres = a;
594 length = type.length / 2;
595 while (length > 1) {
596 LLVMValueRef vec1, vec2;
597 for (i = 0; i < length; i++) {
598 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
599 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
600 }
601 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
602 LLVMConstVector(shuffles1, length), "");
603 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
604 LLVMConstVector(shuffles2, length), "");
605 if (type.floating) {
606 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
607 }
608 else {
609 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
610 }
611 length = length >> 1;
612 }
613
614 /* always have vector of size 2 here */
615 assert(length == 1);
616
617 index = lp_build_const_int32(bld->gallivm, 0);
618 res = LLVMBuildExtractElement(builder, vecres, index, "");
619 index = lp_build_const_int32(bld->gallivm, 1);
620 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
621
622 if (type.floating)
623 res = LLVMBuildFAdd(builder, res, elem2, "");
624 else
625 res = LLVMBuildAdd(builder, res, elem2, "");
626
627 return res;
628 }
629
630 /**
631 * Return the horizontal sums of 4 float vectors as a float4 vector.
632  * This uses the technique outlined in the Intel Optimization Manual.
633 */
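/* Data-flow sketch, assuming src[i] = <i0 i1 i2 i3> for i = x, y, z, w:
 *   tmp[0] = <x0 x1 y0 y1>   tmp[1] = <x2 x3 y2 y3>
 *   tmp[2] = <z0 z1 w0 w1>   tmp[3] = <z2 z3 w2 w3>
 *   sumtmp[0] = <x0+x2  x1+x3  y0+y2  y1+y3>
 *   sumtmp[1] = <z0+z2  z1+z3  w0+w2  w1+w3>
 *   shuftmp[0] + shuftmp[1] = <sum(x) sum(y) sum(z) sum(w)>
 */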
634 static LLVMValueRef
635 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
636 LLVMValueRef src[4])
637 {
638 struct gallivm_state *gallivm = bld->gallivm;
639 LLVMBuilderRef builder = gallivm->builder;
640 LLVMValueRef shuffles[4];
641 LLVMValueRef tmp[4];
642 LLVMValueRef sumtmp[2], shuftmp[2];
643
644 /* lower half of regs */
645 shuffles[0] = lp_build_const_int32(gallivm, 0);
646 shuffles[1] = lp_build_const_int32(gallivm, 1);
647 shuffles[2] = lp_build_const_int32(gallivm, 4);
648 shuffles[3] = lp_build_const_int32(gallivm, 5);
649 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
650 LLVMConstVector(shuffles, 4), "");
651 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
652 LLVMConstVector(shuffles, 4), "");
653
654 /* upper half of regs */
655 shuffles[0] = lp_build_const_int32(gallivm, 2);
656 shuffles[1] = lp_build_const_int32(gallivm, 3);
657 shuffles[2] = lp_build_const_int32(gallivm, 6);
658 shuffles[3] = lp_build_const_int32(gallivm, 7);
659 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
660 LLVMConstVector(shuffles, 4), "");
661 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
662 LLVMConstVector(shuffles, 4), "");
663
664 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
665 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
666
667 shuffles[0] = lp_build_const_int32(gallivm, 0);
668 shuffles[1] = lp_build_const_int32(gallivm, 2);
669 shuffles[2] = lp_build_const_int32(gallivm, 4);
670 shuffles[3] = lp_build_const_int32(gallivm, 6);
671 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
672 LLVMConstVector(shuffles, 4), "");
673
674 shuffles[0] = lp_build_const_int32(gallivm, 1);
675 shuffles[1] = lp_build_const_int32(gallivm, 3);
676 shuffles[2] = lp_build_const_int32(gallivm, 5);
677 shuffles[3] = lp_build_const_int32(gallivm, 7);
678 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
679 LLVMConstVector(shuffles, 4), "");
680
681 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
682 }
683
684
685 /*
686 * partially horizontally add 2-4 float vectors with length nx4,
687 * i.e. only four adjacent values in each vector will be added,
688 * assuming values are really grouped in 4 which also determines
689 * output order.
690 *
691 * Return a vector of the same length as the initial vectors,
692 * with the excess elements (if any) being undefined.
693 * The element order is independent of number of input vectors.
694 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
695 * the output order thus will be
696  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
697 */
698 LLVMValueRef
699 lp_build_hadd_partial4(struct lp_build_context *bld,
700 LLVMValueRef vectors[],
701 unsigned num_vecs)
702 {
703 struct gallivm_state *gallivm = bld->gallivm;
704 LLVMBuilderRef builder = gallivm->builder;
705 LLVMValueRef ret_vec;
706 LLVMValueRef tmp[4];
707 const char *intrinsic = NULL;
708
709 assert(num_vecs >= 2 && num_vecs <= 4);
710 assert(bld->type.floating);
711
712 /* only use this with at least 2 vectors, as it is sort of expensive
713 * (depending on cpu) and we always need two horizontal adds anyway,
714 * so a shuffle/add approach might be better.
715 */
716
717 tmp[0] = vectors[0];
718 tmp[1] = vectors[1];
719
720 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
721 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
722
723 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
724 bld->type.length == 4) {
725 intrinsic = "llvm.x86.sse3.hadd.ps";
726 }
727 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
728 bld->type.length == 8) {
729 intrinsic = "llvm.x86.avx.hadd.ps.256";
730 }
731 if (intrinsic) {
732 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
733 lp_build_vec_type(gallivm, bld->type),
734 tmp[0], tmp[1]);
735 if (num_vecs > 2) {
736 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
737 lp_build_vec_type(gallivm, bld->type),
738 tmp[2], tmp[3]);
739 }
740 else {
741 tmp[1] = tmp[0];
742 }
743 return lp_build_intrinsic_binary(builder, intrinsic,
744 lp_build_vec_type(gallivm, bld->type),
745 tmp[0], tmp[1]);
746 }
747
748 if (bld->type.length == 4) {
749 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
750 }
751 else {
752 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
753 unsigned j;
754 unsigned num_iter = bld->type.length / 4;
755 struct lp_type parttype = bld->type;
756 parttype.length = 4;
757 for (j = 0; j < num_iter; j++) {
758 LLVMValueRef partsrc[4];
759 unsigned i;
760 for (i = 0; i < 4; i++) {
761 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
762 }
763 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
764 }
765 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
766 }
767 return ret_vec;
768 }
769
770 /**
771 * Generate a - b
772 */
773 LLVMValueRef
774 lp_build_sub(struct lp_build_context *bld,
775 LLVMValueRef a,
776 LLVMValueRef b)
777 {
778 LLVMBuilderRef builder = bld->gallivm->builder;
779 const struct lp_type type = bld->type;
780 LLVMValueRef res;
781
782 assert(lp_check_value(type, a));
783 assert(lp_check_value(type, b));
784
785 if(b == bld->zero)
786 return a;
787 if(a == bld->undef || b == bld->undef)
788 return bld->undef;
789 if(a == b)
790 return bld->zero;
791
792 if(bld->type.norm) {
793 const char *intrinsic = NULL;
794
795 if(b == bld->one)
796 return bld->zero;
797
798 if (type.width * type.length == 128 &&
799 !type.floating && !type.fixed) {
800 if (util_cpu_caps.has_sse2) {
801 if(type.width == 8)
802 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
803 if(type.width == 16)
804 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
805 } else if (util_cpu_caps.has_altivec) {
806 if(type.width == 8)
807 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
808 if(type.width == 16)
809 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
810 }
811 }
812
813 if (intrinsic)
814 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
815 }
816
817 if(type.norm && !type.floating && !type.fixed) {
818 if (type.sign) {
819 uint64_t sign = (uint64_t)1 << (type.width - 1);
820 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
821 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
822 /* a_clamp_max is the maximum a for negative b,
823 a_clamp_min is the minimum a for positive b. */
824 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
825 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
826 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
827 } else {
828 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
829 }
830 }
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b))
833 if (type.floating)
834 res = LLVMConstFSub(a, b);
835 else
836 res = LLVMConstSub(a, b);
837 else
838 if (type.floating)
839 res = LLVMBuildFSub(builder, a, b, "");
840 else
841 res = LLVMBuildSub(builder, a, b, "");
842
843 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
844 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
845
846 return res;
847 }
848
849
850
851 /**
852 * Normalized multiplication.
853 *
854  * There are several approaches (using 8-bit normalized multiplication as
855  * an example):
856 *
857 * - alpha plus one
858 *
859 * makes the following approximation to the division (Sree)
860 *
861  *       a*b/255 ~= (a*(b + 1)) >> 8
862 *
863 * which is the fastest method that satisfies the following OpenGL criteria of
864 *
865 * 0*0 = 0 and 255*255 = 255
866 *
867 * - geometric series
868 *
869 * takes the geometric series approximation to the division
870 *
871 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
872 *
873  *     in this case just the first two terms are taken, to fit in 16-bit arithmetic
874 *
875 * t/255 ~= (t + (t >> 8)) >> 8
876 *
877  *     note that just by itself it doesn't satisfy the OpenGL criteria, as it
878  *     gives 255*255 = 254, so the special case b = 255 must be accounted for,
879  *     or rounding must be used.
880 *
881 * - geometric series plus rounding
882 *
883  *     when using the geometric series division, instead of truncating the
884  *     result use rounding in the approximation (Jim Blinn)
885  *
886  *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
887  *
888  *     which achieves exact results.
889 *
890 *
891 *
892 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
893 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
894 * @sa Michael Herf, The "double blend trick", May 2000,
895 * http://www.stereopsis.com/doubleblend.html
896 */
897 static LLVMValueRef
898 lp_build_mul_norm(struct gallivm_state *gallivm,
899 struct lp_type wide_type,
900 LLVMValueRef a, LLVMValueRef b)
901 {
902 LLVMBuilderRef builder = gallivm->builder;
903 struct lp_build_context bld;
904 unsigned n;
905 LLVMValueRef half;
906 LLVMValueRef ab;
907
908 assert(!wide_type.floating);
909 assert(lp_check_value(wide_type, a));
910 assert(lp_check_value(wide_type, b));
911
912 lp_build_context_init(&bld, gallivm, wide_type);
913
914 n = wide_type.width / 2;
915 if (wide_type.sign) {
916 --n;
917 }
918
919 /*
920 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
921 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
922 */
923
924 /*
925 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
926 */
927
928 ab = LLVMBuildMul(builder, a, b, "");
929 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
930
931 /*
932 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
933 */
934
935 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
936 if (wide_type.sign) {
937 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
938 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
939 half = lp_build_select(&bld, sign, minus_half, half);
940 }
941 ab = LLVMBuildAdd(builder, ab, half, "");
942
943 /* Final division */
944 ab = lp_build_shr_imm(&bld, ab, n);
945
946 return ab;
947 }
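/*
 * Worked example for the rounded geometric-series formula used above,
 * with n = 8 (unsigned 8-bit normalized values, purely illustrative):
 *
 *   a = b = 255:  a*b = 65025
 *                 65025 + (65025 >> 8) + 0x80 = 65025 + 254 + 128 = 65407
 *                 65407 >> 8 = 255            (exact: 65025/255 = 255)
 *
 *   a = b = 128:  a*b = 16384
 *                 16384 + (16384 >> 8) + 0x80 = 16384 + 64 + 128 = 16576
 *                 16576 >> 8 = 64             (exact: 16384/255 ~= 64.25)
 */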
948
949 /**
950 * Generate a * b
951 */
952 LLVMValueRef
953 lp_build_mul(struct lp_build_context *bld,
954 LLVMValueRef a,
955 LLVMValueRef b)
956 {
957 LLVMBuilderRef builder = bld->gallivm->builder;
958 const struct lp_type type = bld->type;
959 LLVMValueRef shift;
960 LLVMValueRef res;
961
962 assert(lp_check_value(type, a));
963 assert(lp_check_value(type, b));
964
965 if(a == bld->zero)
966 return bld->zero;
967 if(a == bld->one)
968 return b;
969 if(b == bld->zero)
970 return bld->zero;
971 if(b == bld->one)
972 return a;
973 if(a == bld->undef || b == bld->undef)
974 return bld->undef;
975
976 if (!type.floating && !type.fixed && type.norm) {
977 struct lp_type wide_type = lp_wider_type(type);
978 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
979
980 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
981 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
982
983 /* PMULLW, PSRLW, PADDW */
984 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
985 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
986
987 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
988
989 return ab;
990 }
991
992 if(type.fixed)
993 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
994 else
995 shift = NULL;
996
997 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
998 if (type.floating)
999 res = LLVMConstFMul(a, b);
1000 else
1001 res = LLVMConstMul(a, b);
1002 if(shift) {
1003 if(type.sign)
1004 res = LLVMConstAShr(res, shift);
1005 else
1006 res = LLVMConstLShr(res, shift);
1007 }
1008 }
1009 else {
1010 if (type.floating)
1011 res = LLVMBuildFMul(builder, a, b, "");
1012 else
1013 res = LLVMBuildMul(builder, a, b, "");
1014 if(shift) {
1015 if(type.sign)
1016 res = LLVMBuildAShr(builder, res, shift, "");
1017 else
1018 res = LLVMBuildLShr(builder, res, shift, "");
1019 }
1020 }
1021
1022 return res;
1023 }
1024
1025
1026 /**
1027 * Small vector x scale multiplication optimization.
1028 */
1029 LLVMValueRef
1030 lp_build_mul_imm(struct lp_build_context *bld,
1031 LLVMValueRef a,
1032 int b)
1033 {
1034 LLVMBuilderRef builder = bld->gallivm->builder;
1035 LLVMValueRef factor;
1036
1037 assert(lp_check_value(bld->type, a));
1038
1039 if(b == 0)
1040 return bld->zero;
1041
1042 if(b == 1)
1043 return a;
1044
1045 if(b == -1)
1046 return lp_build_negate(bld, a);
1047
1048 if(b == 2 && bld->type.floating)
1049 return lp_build_add(bld, a, a);
1050
1051 if(util_is_power_of_two(b)) {
1052 unsigned shift = ffs(b) - 1;
1053
1054 if(bld->type.floating) {
1055 #if 0
1056 /*
1057 * Power of two multiplication by directly manipulating the exponent.
1058 *
1059          * XXX: This might not always be faster, it will introduce a small error
1060 * for multiplication by zero, and it will produce wrong results
1061 * for Inf and NaN.
1062 */
1063 unsigned mantissa = lp_mantissa(bld->type);
1064 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1065 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1066 a = LLVMBuildAdd(builder, a, factor, "");
1067 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1068 return a;
1069 #endif
1070 }
1071 else {
1072 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1073 return LLVMBuildShl(builder, a, factor, "");
1074 }
1075 }
1076
1077 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1078 return lp_build_mul(bld, a, factor);
1079 }
1080
1081
1082 /**
1083 * Generate a / b
1084 */
1085 LLVMValueRef
1086 lp_build_div(struct lp_build_context *bld,
1087 LLVMValueRef a,
1088 LLVMValueRef b)
1089 {
1090 LLVMBuilderRef builder = bld->gallivm->builder;
1091 const struct lp_type type = bld->type;
1092
1093 assert(lp_check_value(type, a));
1094 assert(lp_check_value(type, b));
1095
1096 if(a == bld->zero)
1097 return bld->zero;
1098 if(a == bld->one && type.floating)
1099 return lp_build_rcp(bld, b);
1100 if(b == bld->zero)
1101 return bld->undef;
1102 if(b == bld->one)
1103 return a;
1104 if(a == bld->undef || b == bld->undef)
1105 return bld->undef;
1106
1107 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1108 if (type.floating)
1109 return LLVMConstFDiv(a, b);
1110 else if (type.sign)
1111 return LLVMConstSDiv(a, b);
1112 else
1113 return LLVMConstUDiv(a, b);
1114 }
1115
1116 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1117 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1118 type.floating)
1119 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1120
1121 if (type.floating)
1122 return LLVMBuildFDiv(builder, a, b, "");
1123 else if (type.sign)
1124 return LLVMBuildSDiv(builder, a, b, "");
1125 else
1126 return LLVMBuildUDiv(builder, a, b, "");
1127 }
1128
1129
1130 /**
1131 * Linear interpolation helper.
1132 *
1133  * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1134  *        interpolating normalized values encoded in integers twice as wide.
1135 *
1136 * @sa http://www.stereopsis.com/doubleblend.html
1137 */
1138 static inline LLVMValueRef
1139 lp_build_lerp_simple(struct lp_build_context *bld,
1140 LLVMValueRef x,
1141 LLVMValueRef v0,
1142 LLVMValueRef v1,
1143 unsigned flags)
1144 {
1145 unsigned half_width = bld->type.width/2;
1146 LLVMBuilderRef builder = bld->gallivm->builder;
1147 LLVMValueRef delta;
1148 LLVMValueRef res;
1149
1150 assert(lp_check_value(bld->type, x));
1151 assert(lp_check_value(bld->type, v0));
1152 assert(lp_check_value(bld->type, v1));
1153
1154 delta = lp_build_sub(bld, v1, v0);
1155
1156 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1157 if (!bld->type.sign) {
1158 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1159 /*
1160 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1161           * most significant bit to the least significant bit, so that
1162 * later we can just divide by 2**n instead of 2**n - 1.
1163 */
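          /* E.g. with n = 8 (illustrative): x = 255 becomes
           * 255 + (255 >> 7) = 256, so (x * delta) >> 8 returns delta exactly,
           * while x = 0 stays 0.
           */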
1164
1165 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1166 }
1167
1168 /* (x * delta) >> n */
1169 res = lp_build_mul(bld, x, delta);
1170 res = lp_build_shr_imm(bld, res, half_width);
1171 } else {
1172 /*
1173 * The rescaling trick above doesn't work for signed numbers, so
1174           * use the 2**n - 1 division approximation in lp_build_mul_norm
1175 * instead.
1176 */
1177 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1178 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1179 }
1180 } else {
1181 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1182 res = lp_build_mul(bld, x, delta);
1183 }
1184
1185 res = lp_build_add(bld, v0, res);
1186
1187 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1188 bld->type.fixed) {
1189 /* We need to mask out the high order bits when lerping 8bit normalized colors stored on 16bits */
1190 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1191 * but it will be wrong for true fixed point use cases. Basically we need
1192 * a more powerful lp_type, capable of further distinguishing the values
1193 * interpretation from the value storage. */
1194 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1195 }
1196
1197 return res;
1198 }
1199
1200
1201 /**
1202 * Linear interpolation.
1203 */
1204 LLVMValueRef
1205 lp_build_lerp(struct lp_build_context *bld,
1206 LLVMValueRef x,
1207 LLVMValueRef v0,
1208 LLVMValueRef v1,
1209 unsigned flags)
1210 {
1211 const struct lp_type type = bld->type;
1212 LLVMValueRef res;
1213
1214 assert(lp_check_value(type, x));
1215 assert(lp_check_value(type, v0));
1216 assert(lp_check_value(type, v1));
1217
1218 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1219
1220 if (type.norm) {
1221 struct lp_type wide_type;
1222 struct lp_build_context wide_bld;
1223 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1224
1225 assert(type.length >= 2);
1226
1227 /*
1228 * Create a wider integer type, enough to hold the
1229 * intermediate result of the multiplication.
1230 */
1231 memset(&wide_type, 0, sizeof wide_type);
1232 wide_type.sign = type.sign;
1233 wide_type.width = type.width*2;
1234 wide_type.length = type.length/2;
1235
1236 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1237
1238 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1239 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1240 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1241
1242 /*
1243 * Lerp both halves.
1244 */
1245
1246 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1247
1248 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1249 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1250
1251 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1252 } else {
1253 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1254 }
1255
1256 return res;
1257 }
1258
1259
1260 /**
1261 * Bilinear interpolation.
1262 *
1263 * Values indices are in v_{yx}.
1264 */
1265 LLVMValueRef
1266 lp_build_lerp_2d(struct lp_build_context *bld,
1267 LLVMValueRef x,
1268 LLVMValueRef y,
1269 LLVMValueRef v00,
1270 LLVMValueRef v01,
1271 LLVMValueRef v10,
1272 LLVMValueRef v11,
1273 unsigned flags)
1274 {
1275 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1276 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1277 return lp_build_lerp(bld, y, v0, v1, flags);
1278 }
1279
1280
1281 LLVMValueRef
1282 lp_build_lerp_3d(struct lp_build_context *bld,
1283 LLVMValueRef x,
1284 LLVMValueRef y,
1285 LLVMValueRef z,
1286 LLVMValueRef v000,
1287 LLVMValueRef v001,
1288 LLVMValueRef v010,
1289 LLVMValueRef v011,
1290 LLVMValueRef v100,
1291 LLVMValueRef v101,
1292 LLVMValueRef v110,
1293 LLVMValueRef v111,
1294 unsigned flags)
1295 {
1296 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1297 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1298 return lp_build_lerp(bld, z, v0, v1, flags);
1299 }
1300
1301
1302 /**
1303 * Generate min(a, b)
1304 * Do checks for special cases but not for nans.
1305 */
1306 LLVMValueRef
1307 lp_build_min(struct lp_build_context *bld,
1308 LLVMValueRef a,
1309 LLVMValueRef b)
1310 {
1311 assert(lp_check_value(bld->type, a));
1312 assert(lp_check_value(bld->type, b));
1313
1314 if(a == bld->undef || b == bld->undef)
1315 return bld->undef;
1316
1317 if(a == b)
1318 return a;
1319
1320 if (bld->type.norm) {
1321 if (!bld->type.sign) {
1322 if (a == bld->zero || b == bld->zero) {
1323 return bld->zero;
1324 }
1325 }
1326 if(a == bld->one)
1327 return b;
1328 if(b == bld->one)
1329 return a;
1330 }
1331
1332 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1333 }
1334
1335
1336 /**
1337 * Generate min(a, b)
1338  * NaNs are handled according to the behavior specified by the
1339 * nan_behavior argument.
1340 */
1341 LLVMValueRef
1342 lp_build_min_ext(struct lp_build_context *bld,
1343 LLVMValueRef a,
1344 LLVMValueRef b,
1345 enum gallivm_nan_behavior nan_behavior)
1346 {
1347 assert(lp_check_value(bld->type, a));
1348 assert(lp_check_value(bld->type, b));
1349
1350 if(a == bld->undef || b == bld->undef)
1351 return bld->undef;
1352
1353 if(a == b)
1354 return a;
1355
1356 if (bld->type.norm) {
1357 if (!bld->type.sign) {
1358 if (a == bld->zero || b == bld->zero) {
1359 return bld->zero;
1360 }
1361 }
1362 if(a == bld->one)
1363 return b;
1364 if(b == bld->one)
1365 return a;
1366 }
1367
1368 return lp_build_min_simple(bld, a, b, nan_behavior);
1369 }
1370
1371 /**
1372 * Generate max(a, b)
1373 * Do checks for special cases, but NaN behavior is undefined.
1374 */
1375 LLVMValueRef
1376 lp_build_max(struct lp_build_context *bld,
1377 LLVMValueRef a,
1378 LLVMValueRef b)
1379 {
1380 assert(lp_check_value(bld->type, a));
1381 assert(lp_check_value(bld->type, b));
1382
1383 if(a == bld->undef || b == bld->undef)
1384 return bld->undef;
1385
1386 if(a == b)
1387 return a;
1388
1389 if(bld->type.norm) {
1390 if(a == bld->one || b == bld->one)
1391 return bld->one;
1392 if (!bld->type.sign) {
1393 if (a == bld->zero) {
1394 return b;
1395 }
1396 if (b == bld->zero) {
1397 return a;
1398 }
1399 }
1400 }
1401
1402 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1403 }
1404
1405
1406 /**
1407 * Generate max(a, b)
1408 * Checks for special cases.
1409  * NaNs are handled according to the behavior specified by the
1410 * nan_behavior argument.
1411 */
1412 LLVMValueRef
1413 lp_build_max_ext(struct lp_build_context *bld,
1414 LLVMValueRef a,
1415 LLVMValueRef b,
1416 enum gallivm_nan_behavior nan_behavior)
1417 {
1418 assert(lp_check_value(bld->type, a));
1419 assert(lp_check_value(bld->type, b));
1420
1421 if(a == bld->undef || b == bld->undef)
1422 return bld->undef;
1423
1424 if(a == b)
1425 return a;
1426
1427 if(bld->type.norm) {
1428 if(a == bld->one || b == bld->one)
1429 return bld->one;
1430 if (!bld->type.sign) {
1431 if (a == bld->zero) {
1432 return b;
1433 }
1434 if (b == bld->zero) {
1435 return a;
1436 }
1437 }
1438 }
1439
1440 return lp_build_max_simple(bld, a, b, nan_behavior);
1441 }
1442
1443 /**
1444 * Generate clamp(a, min, max)
1445 * NaN behavior (for any of a, min, max) is undefined.
1446 * Do checks for special cases.
1447 */
1448 LLVMValueRef
1449 lp_build_clamp(struct lp_build_context *bld,
1450 LLVMValueRef a,
1451 LLVMValueRef min,
1452 LLVMValueRef max)
1453 {
1454 assert(lp_check_value(bld->type, a));
1455 assert(lp_check_value(bld->type, min));
1456 assert(lp_check_value(bld->type, max));
1457
1458 a = lp_build_min(bld, a, max);
1459 a = lp_build_max(bld, a, min);
1460 return a;
1461 }
1462
1463
1464 /**
1465 * Generate clamp(a, 0, 1)
1466 * A NaN will get converted to zero.
1467 */
1468 LLVMValueRef
1469 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1470 LLVMValueRef a)
1471 {
1472 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1473 a = lp_build_min(bld, a, bld->one);
1474 return a;
1475 }
1476
1477
1478 /**
1479 * Generate abs(a)
1480 */
1481 LLVMValueRef
1482 lp_build_abs(struct lp_build_context *bld,
1483 LLVMValueRef a)
1484 {
1485 LLVMBuilderRef builder = bld->gallivm->builder;
1486 const struct lp_type type = bld->type;
1487 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1488
1489 assert(lp_check_value(type, a));
1490
1491 if(!type.sign)
1492 return a;
1493
1494 if(type.floating) {
1495 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1496 /* Workaround llvm.org/PR27332 */
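         /* For 32-bit floats (assumed here for illustration) this mask is
          * 0x7fffffff; clearing the sign bit in the integer view of the value
          * yields |a| without any floating point operation.
          */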
1497 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1498 unsigned long long absMask = ~(1ULL << (type.width - 1));
1499 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1500 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1501 a = LLVMBuildAnd(builder, a, mask, "");
1502 a = LLVMBuildBitCast(builder, a, vec_type, "");
1503 return a;
1504 } else {
1505 char intrinsic[32];
1506 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1507 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1508 }
1509 }
1510
1511 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1512 switch(type.width) {
1513 case 8:
1514 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1515 case 16:
1516 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1517 case 32:
1518 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1519 }
1520 }
1521 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1522 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1523 (type.width == 8 || type.width == 16 || type.width == 32)) {
1524 debug_printf("%s: inefficient code, should split vectors manually\n",
1525 __FUNCTION__);
1526 }
1527
1528 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1529 }
1530
1531
1532 LLVMValueRef
1533 lp_build_negate(struct lp_build_context *bld,
1534 LLVMValueRef a)
1535 {
1536 LLVMBuilderRef builder = bld->gallivm->builder;
1537
1538 assert(lp_check_value(bld->type, a));
1539
1540 if (bld->type.floating)
1541 a = LLVMBuildFNeg(builder, a, "");
1542 else
1543 a = LLVMBuildNeg(builder, a, "");
1544
1545 return a;
1546 }
1547
1548
1549 /** Return -1, 0 or +1 depending on the sign of a */
1550 LLVMValueRef
1551 lp_build_sgn(struct lp_build_context *bld,
1552 LLVMValueRef a)
1553 {
1554 LLVMBuilderRef builder = bld->gallivm->builder;
1555 const struct lp_type type = bld->type;
1556 LLVMValueRef cond;
1557 LLVMValueRef res;
1558
1559 assert(lp_check_value(type, a));
1560
1561 /* Handle non-zero case */
1562 if(!type.sign) {
1563 /* if not zero then sign must be positive */
1564 res = bld->one;
1565 }
1566 else if(type.floating) {
1567 LLVMTypeRef vec_type;
1568 LLVMTypeRef int_type;
1569 LLVMValueRef mask;
1570 LLVMValueRef sign;
1571 LLVMValueRef one;
1572 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1573
1574 int_type = lp_build_int_vec_type(bld->gallivm, type);
1575 vec_type = lp_build_vec_type(bld->gallivm, type);
1576 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1577
1578       /* Take the sign bit and OR it into the 1.0 constant */
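      /* E.g. (32-bit float, illustrative): a = -3.5 has sign bit 0x80000000;
       * OR-ing it into 1.0f (0x3f800000) gives 0xbf800000 = -1.0f, while a
       * positive a leaves the constant at +1.0f.
       */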
1579 sign = LLVMBuildBitCast(builder, a, int_type, "");
1580 sign = LLVMBuildAnd(builder, sign, mask, "");
1581 one = LLVMConstBitCast(bld->one, int_type);
1582 res = LLVMBuildOr(builder, sign, one, "");
1583 res = LLVMBuildBitCast(builder, res, vec_type, "");
1584 }
1585 else
1586 {
1587 /* signed int/norm/fixed point */
1588 /* could use psign with sse3 and appropriate vectors here */
1589 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1590 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1591 res = lp_build_select(bld, cond, bld->one, minus_one);
1592 }
1593
1594 /* Handle zero */
1595 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1596 res = lp_build_select(bld, cond, bld->zero, res);
1597
1598 return res;
1599 }
1600
1601
1602 /**
1603 * Set the sign of float vector 'a' according to 'sign'.
1604 * If sign==0, return abs(a).
1605  * If sign==1, return -abs(a).
1606 * Other values for sign produce undefined results.
1607 */
1608 LLVMValueRef
1609 lp_build_set_sign(struct lp_build_context *bld,
1610 LLVMValueRef a, LLVMValueRef sign)
1611 {
1612 LLVMBuilderRef builder = bld->gallivm->builder;
1613 const struct lp_type type = bld->type;
1614 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1615 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1616 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1617 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1618 ~((unsigned long long) 1 << (type.width - 1)));
1619 LLVMValueRef val, res;
1620
1621 assert(type.floating);
1622 assert(lp_check_value(type, a));
1623
1624 /* val = reinterpret_cast<int>(a) */
1625 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1626 /* val = val & mask */
1627 val = LLVMBuildAnd(builder, val, mask, "");
1628 /* sign = sign << shift */
1629 sign = LLVMBuildShl(builder, sign, shift, "");
1630 /* res = val | sign */
1631 res = LLVMBuildOr(builder, val, sign, "");
1632 /* res = reinterpret_cast<float>(res) */
1633 res = LLVMBuildBitCast(builder, res, vec_type, "");
1634
1635 return res;
1636 }
1637
1638
1639 /**
1640 * Convert vector of (or scalar) int to vector of (or scalar) float.
1641 */
1642 LLVMValueRef
1643 lp_build_int_to_float(struct lp_build_context *bld,
1644 LLVMValueRef a)
1645 {
1646 LLVMBuilderRef builder = bld->gallivm->builder;
1647 const struct lp_type type = bld->type;
1648 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1649
1650 assert(type.floating);
1651
1652 return LLVMBuildSIToFP(builder, a, vec_type, "");
1653 }
1654
1655 static boolean
1656 arch_rounding_available(const struct lp_type type)
1657 {
1658 if ((util_cpu_caps.has_sse4_1 &&
1659 (type.length == 1 || type.width*type.length == 128)) ||
1660 (util_cpu_caps.has_avx && type.width*type.length == 256))
1661 return TRUE;
1662 else if ((util_cpu_caps.has_altivec &&
1663 (type.width == 32 && type.length == 4)))
1664 return TRUE;
1665
1666 return FALSE;
1667 }
1668
1669 enum lp_build_round_mode
1670 {
1671 LP_BUILD_ROUND_NEAREST = 0,
1672 LP_BUILD_ROUND_FLOOR = 1,
1673 LP_BUILD_ROUND_CEIL = 2,
1674 LP_BUILD_ROUND_TRUNCATE = 3
1675 };
1676
1677 static inline LLVMValueRef
1678 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1679 LLVMValueRef a)
1680 {
1681 LLVMBuilderRef builder = bld->gallivm->builder;
1682 const struct lp_type type = bld->type;
1683 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1684 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1685 const char *intrinsic;
1686 LLVMValueRef res;
1687
1688 assert(type.floating);
1689 /* using the double precision conversions is a bit more complicated */
1690 assert(type.width == 32);
1691
1692 assert(lp_check_value(type, a));
1693 assert(util_cpu_caps.has_sse2);
1694
1695 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1696 if (type.length == 1) {
1697 LLVMTypeRef vec_type;
1698 LLVMValueRef undef;
1699 LLVMValueRef arg;
1700 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1701
1702 vec_type = LLVMVectorType(bld->elem_type, 4);
1703
1704 intrinsic = "llvm.x86.sse.cvtss2si";
1705
1706 undef = LLVMGetUndef(vec_type);
1707
1708 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1709
1710 res = lp_build_intrinsic_unary(builder, intrinsic,
1711 ret_type, arg);
1712 }
1713 else {
1714 if (type.width* type.length == 128) {
1715 intrinsic = "llvm.x86.sse2.cvtps2dq";
1716 }
1717 else {
1718 assert(type.width*type.length == 256);
1719 assert(util_cpu_caps.has_avx);
1720
1721 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1722 }
1723 res = lp_build_intrinsic_unary(builder, intrinsic,
1724 ret_type, a);
1725 }
1726
1727 return res;
1728 }
1729
1730
1731 /* Round to an integral value (kept as float), using the AltiVec vrfi*
1732  * intrinsics according to the given rounding mode. */
1733 static inline LLVMValueRef
1734 lp_build_round_altivec(struct lp_build_context *bld,
1735 LLVMValueRef a,
1736 enum lp_build_round_mode mode)
1737 {
1738 LLVMBuilderRef builder = bld->gallivm->builder;
1739 const struct lp_type type = bld->type;
1740 const char *intrinsic = NULL;
1741
1742 assert(type.floating);
1743
1744 assert(lp_check_value(type, a));
1745 assert(util_cpu_caps.has_altivec);
1746
1747 (void)type;
1748
1749 switch (mode) {
1750 case LP_BUILD_ROUND_NEAREST:
1751 intrinsic = "llvm.ppc.altivec.vrfin";
1752 break;
1753 case LP_BUILD_ROUND_FLOOR:
1754 intrinsic = "llvm.ppc.altivec.vrfim";
1755 break;
1756 case LP_BUILD_ROUND_CEIL:
1757 intrinsic = "llvm.ppc.altivec.vrfip";
1758 break;
1759 case LP_BUILD_ROUND_TRUNCATE:
1760 intrinsic = "llvm.ppc.altivec.vrfiz";
1761 break;
1762 }
1763
1764 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1765 }
1766
1767 static inline LLVMValueRef
1768 lp_build_round_arch(struct lp_build_context *bld,
1769 LLVMValueRef a,
1770 enum lp_build_round_mode mode)
1771 {
1772 if (util_cpu_caps.has_sse4_1) {
1773 LLVMBuilderRef builder = bld->gallivm->builder;
1774 const struct lp_type type = bld->type;
1775 const char *intrinsic_root;
1776 char intrinsic[32];
1777
1778 assert(type.floating);
1779 assert(lp_check_value(type, a));
1780 (void)type;
1781
1782 switch (mode) {
1783 case LP_BUILD_ROUND_NEAREST:
1784 intrinsic_root = "llvm.nearbyint";
1785 break;
1786 case LP_BUILD_ROUND_FLOOR:
1787 intrinsic_root = "llvm.floor";
1788 break;
1789 case LP_BUILD_ROUND_CEIL:
1790 intrinsic_root = "llvm.ceil";
1791 break;
1792 case LP_BUILD_ROUND_TRUNCATE:
1793 intrinsic_root = "llvm.trunc";
1794 break;
1795 }
1796
1797 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1798 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1799 }
1800 else /* (util_cpu_caps.has_altivec) */
1801 return lp_build_round_altivec(bld, a, mode);
1802 }
1803
1804 /**
1805 * Return the integer part of a float (vector) value (== round toward zero).
1806 * The returned value is a float (vector).
1807 * Ex: trunc(-1.5) = -1.0
1808 */
1809 LLVMValueRef
1810 lp_build_trunc(struct lp_build_context *bld,
1811 LLVMValueRef a)
1812 {
1813 LLVMBuilderRef builder = bld->gallivm->builder;
1814 const struct lp_type type = bld->type;
1815
1816 assert(type.floating);
1817 assert(lp_check_value(type, a));
1818
1819 if (arch_rounding_available(type)) {
1820 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1821 }
1822 else {
1823 const struct lp_type type = bld->type;
1824 struct lp_type inttype;
1825 struct lp_build_context intbld;
1826 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1827 LLVMValueRef trunc, res, anosign, mask;
1828 LLVMTypeRef int_vec_type = bld->int_vec_type;
1829 LLVMTypeRef vec_type = bld->vec_type;
1830
1831 assert(type.width == 32); /* might want to handle doubles at some point */
1832
1833 inttype = type;
1834 inttype.floating = 0;
1835 lp_build_context_init(&intbld, bld->gallivm, inttype);
1836
1837 /* round by truncation */
1838 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1839 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1840
1841 /* mask out sign bit */
1842 anosign = lp_build_abs(bld, a);
1843 /*
1844 * mask out all values if anosign > 2^24
1845 * This should work both for large ints (all rounding is no-op for them
1846 * because such floats are always exact) as well as special cases like
1847 * NaNs, Infs (taking advantage of the fact they use max exponent).
1848       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1849 */
1850 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1851 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1852 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1853 return lp_build_select(bld, mask, a, res);
1854 }
1855 }
1856
1857
1858 /**
1859 * Return float (vector) rounded to nearest integer (vector). The returned
1860 * value is a float (vector).
1861 * Ex: round(0.9) = 1.0
1862 * Ex: round(-1.5) = -2.0
1863 */
1864 LLVMValueRef
1865 lp_build_round(struct lp_build_context *bld,
1866 LLVMValueRef a)
1867 {
1868 LLVMBuilderRef builder = bld->gallivm->builder;
1869 const struct lp_type type = bld->type;
1870
1871 assert(type.floating);
1872 assert(lp_check_value(type, a));
1873
1874 if (arch_rounding_available(type)) {
1875 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1876 }
1877 else {
1878 const struct lp_type type = bld->type;
1879 struct lp_type inttype;
1880 struct lp_build_context intbld;
1881 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1882 LLVMValueRef res, anosign, mask;
1883 LLVMTypeRef int_vec_type = bld->int_vec_type;
1884 LLVMTypeRef vec_type = bld->vec_type;
1885
1886 assert(type.width == 32); /* might want to handle doubles at some point */
1887
1888 inttype = type;
1889 inttype.floating = 0;
1890 lp_build_context_init(&intbld, bld->gallivm, inttype);
1891
1892 res = lp_build_iround(bld, a);
1893 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1894
1895 /* mask out sign bit */
1896 anosign = lp_build_abs(bld, a);
1897 /*
1898 * mask out all values if anosign > 2^24
1899        * This should work both for large ints (rounding is a no-op for them
1900        * because such floats are always exact) as well as special cases like
1901        * NaNs, Infs (taking advantage of the fact they use max exponent).
1902        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1903 */
1904 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1905 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1906 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1907 return lp_build_select(bld, mask, a, res);
1908 }
1909 }
1910
1911
1912 /**
1913 * Return floor of float (vector), result is a float (vector)
1914 * Ex: floor(1.1) = 1.0
1915 * Ex: floor(-1.1) = -2.0
1916 */
1917 LLVMValueRef
1918 lp_build_floor(struct lp_build_context *bld,
1919 LLVMValueRef a)
1920 {
1921 LLVMBuilderRef builder = bld->gallivm->builder;
1922 const struct lp_type type = bld->type;
1923
1924 assert(type.floating);
1925 assert(lp_check_value(type, a));
1926
1927 if (arch_rounding_available(type)) {
1928 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1929 }
1930 else {
1931 const struct lp_type type = bld->type;
1932 struct lp_type inttype;
1933 struct lp_build_context intbld;
1934 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1935 LLVMValueRef trunc, res, anosign, mask;
1936 LLVMTypeRef int_vec_type = bld->int_vec_type;
1937 LLVMTypeRef vec_type = bld->vec_type;
1938
1939 if (type.width != 32) {
1940 char intrinsic[32];
1941 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
1942 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1943 }
1944
1945 assert(type.width == 32); /* might want to handle doubles at some point */
1946
1947 inttype = type;
1948 inttype.floating = 0;
1949 lp_build_context_init(&intbld, bld->gallivm, inttype);
1950
1951 /* round by truncation */
1952 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1953 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1954
1955 if (type.sign) {
1956 LLVMValueRef tmp;
1957
1958 /*
1959 * fix values if rounding is wrong (for non-special cases)
1960 * - this is the case if trunc > a
1961 */
1962 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1963 /* tmp = trunc > a ? 1.0 : 0.0 */
1964 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1965 tmp = lp_build_and(&intbld, mask, tmp);
1966 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1967 res = lp_build_sub(bld, res, tmp);
1968 }
1969
1970 /* mask out sign bit */
1971 anosign = lp_build_abs(bld, a);
1972 /*
1973 * mask out all values if anosign > 2^24
1974        * This should work both for large ints (rounding is a no-op for them
1975        * because such floats are always exact) as well as special cases like
1976        * NaNs, Infs (taking advantage of the fact they use max exponent).
1977        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1978 */
1979 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1980 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1981 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1982 return lp_build_select(bld, mask, a, res);
1983 }
1984 }
1985
1986
1987 /**
1988 * Return ceiling of float (vector), returning float (vector).
1989 * Ex: ceil( 1.1) = 2.0
1990 * Ex: ceil(-1.1) = -1.0
1991 */
1992 LLVMValueRef
1993 lp_build_ceil(struct lp_build_context *bld,
1994 LLVMValueRef a)
1995 {
1996 LLVMBuilderRef builder = bld->gallivm->builder;
1997 const struct lp_type type = bld->type;
1998
1999 assert(type.floating);
2000 assert(lp_check_value(type, a));
2001
2002 if (arch_rounding_available(type)) {
2003 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2004 }
2005 else {
2006 const struct lp_type type = bld->type;
2007 struct lp_type inttype;
2008 struct lp_build_context intbld;
2009 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2010 LLVMValueRef trunc, res, anosign, mask, tmp;
2011 LLVMTypeRef int_vec_type = bld->int_vec_type;
2012 LLVMTypeRef vec_type = bld->vec_type;
2013
2014 if (type.width != 32) {
2015 char intrinsic[32];
2016 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2017 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2018 }
2019
2020 assert(type.width == 32); /* might want to handle doubles at some point */
2021
2022 inttype = type;
2023 inttype.floating = 0;
2024 lp_build_context_init(&intbld, bld->gallivm, inttype);
2025
2026 /* round by truncation */
2027 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2028 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2029
2030 /*
2031 * fix values if rounding is wrong (for non-special cases)
2032 * - this is the case if trunc < a
2033 */
2034 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2035 /* tmp = trunc < a ? 1.0 : 0.0 */
2036 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2037 tmp = lp_build_and(&intbld, mask, tmp);
2038 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2039 res = lp_build_add(bld, trunc, tmp);
2040
2041 /* mask out sign bit */
2042 anosign = lp_build_abs(bld, a);
2043 /*
2044 * mask out all values if anosign > 2^24
2045        * This should work both for large ints (rounding is a no-op for them
2046        * because such floats are always exact) as well as special cases like
2047        * NaNs, Infs (taking advantage of the fact they use max exponent).
2048        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2049 */
2050 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2051 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2052 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2053 return lp_build_select(bld, mask, a, res);
2054 }
2055 }
2056
2057
2058 /**
2059 * Return fractional part of 'a' computed as a - floor(a)
2060 * Typically used in texture coord arithmetic.
2061 */
2062 LLVMValueRef
2063 lp_build_fract(struct lp_build_context *bld,
2064 LLVMValueRef a)
2065 {
2066 assert(bld->type.floating);
2067 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2068 }
2069
2070
2071 /**
2072 * Prevent returning a fractional part of 1.0 for very small negative values of
2073 * 'a' by clamping against 0.99999(9).
2074 */
2075 static inline LLVMValueRef
2076 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2077 {
2078 LLVMValueRef max;
2079
2080 /* this is the largest number smaller than 1.0 representable as float */
2081 max = lp_build_const_vec(bld->gallivm, bld->type,
2082 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2083 return lp_build_min(bld, fract, max);
2084 }
2085
2086
2087 /**
2088 * Same as lp_build_fract, but guarantees that the result is always smaller
2089 * than one.
2090 */
2091 LLVMValueRef
2092 lp_build_fract_safe(struct lp_build_context *bld,
2093 LLVMValueRef a)
2094 {
2095 return clamp_fract(bld, lp_build_fract(bld, a));
2096 }
2097
2098
2099 /**
2100 * Return the integer part of a float (vector) value (== round toward zero).
2101 * The returned value is an integer (vector).
2102 * Ex: itrunc(-1.5) = -1
2103 */
2104 LLVMValueRef
2105 lp_build_itrunc(struct lp_build_context *bld,
2106 LLVMValueRef a)
2107 {
2108 LLVMBuilderRef builder = bld->gallivm->builder;
2109 const struct lp_type type = bld->type;
2110 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2111
2112 assert(type.floating);
2113 assert(lp_check_value(type, a));
2114
2115 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2116 }
2117
2118
2119 /**
2120 * Return float (vector) rounded to nearest integer (vector). The returned
2121 * value is an integer (vector).
2122 * Ex: iround(0.9) = 1
2123 * Ex: iround(-1.5) = -2
2124 */
2125 LLVMValueRef
2126 lp_build_iround(struct lp_build_context *bld,
2127 LLVMValueRef a)
2128 {
2129 LLVMBuilderRef builder = bld->gallivm->builder;
2130 const struct lp_type type = bld->type;
2131 LLVMTypeRef int_vec_type = bld->int_vec_type;
2132 LLVMValueRef res;
2133
2134 assert(type.floating);
2135
2136 assert(lp_check_value(type, a));
2137
2138 if ((util_cpu_caps.has_sse2 &&
2139 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2140 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2141 return lp_build_iround_nearest_sse2(bld, a);
2142 }
2143 if (arch_rounding_available(type)) {
2144 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2145 }
2146 else {
2147 LLVMValueRef half;
2148
2149 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2150
2151 if (type.sign) {
2152 LLVMTypeRef vec_type = bld->vec_type;
2153 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2154 (unsigned long long)1 << (type.width - 1));
2155 LLVMValueRef sign;
2156
2157 /* get sign bit */
2158 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2159 sign = LLVMBuildAnd(builder, sign, mask, "");
2160
2161 /* sign * 0.5 */
2162 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2163 half = LLVMBuildOr(builder, sign, half, "");
2164 half = LLVMBuildBitCast(builder, half, vec_type, "");
2165 }
2166
2167 res = LLVMBuildFAdd(builder, a, half, "");
2168 }
2169
2170 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2171
2172 return res;
2173 }
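
/*
 * Rough scalar equivalent of the generic path above (sketch only): bias by
 * half a unit in the direction of the sign, then truncate.
 *
 *    int iround_sketch(float a)
 *    {
 *       float half = copysignf(0.5f, a);   // 0.5 carrying the sign of a
 *       return (int)(a + half);            // truncation finishes the rounding
 *    }
 *
 * The vector code builds "half" by OR-ing the sign bit of a into 0.5.
 */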
2174
2175
2176 /**
2177 * Return floor of float (vector), result is an int (vector)
2178  * Ex: ifloor(1.1) = 1
2179  * Ex: ifloor(-1.1) = -2
2180 */
2181 LLVMValueRef
2182 lp_build_ifloor(struct lp_build_context *bld,
2183 LLVMValueRef a)
2184 {
2185 LLVMBuilderRef builder = bld->gallivm->builder;
2186 const struct lp_type type = bld->type;
2187 LLVMTypeRef int_vec_type = bld->int_vec_type;
2188 LLVMValueRef res;
2189
2190 assert(type.floating);
2191 assert(lp_check_value(type, a));
2192
2193 res = a;
2194 if (type.sign) {
2195 if (arch_rounding_available(type)) {
2196 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2197 }
2198 else {
2199 struct lp_type inttype;
2200 struct lp_build_context intbld;
2201 LLVMValueRef trunc, itrunc, mask;
2202
2203 assert(type.floating);
2204 assert(lp_check_value(type, a));
2205
2206 inttype = type;
2207 inttype.floating = 0;
2208 lp_build_context_init(&intbld, bld->gallivm, inttype);
2209
2210 /* round by truncation */
2211 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2212 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2213
2214 /*
2215 * fix values if rounding is wrong (for non-special cases)
2216 * - this is the case if trunc > a
2217 * The results of doing this with NaNs, very large values etc.
2218 * are undefined but this seems to be the case anyway.
2219 */
2220 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2221 /* cheapie minus one with mask since the mask is minus one / zero */
2222 return lp_build_add(&intbld, itrunc, mask);
2223 }
2224 }
2225
2226    /* convert to integer, rounding toward zero */
2227 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2228
2229 return res;
2230 }
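
/*
 * The "cheapie minus one" above relies on lp_build_cmp() returning all ones
 * (i.e. -1) for true lanes and 0 for false lanes, so adding the mask
 * subtracts one exactly where truncation rounded the wrong way.  Scalar
 * sketch (hypothetical helper, not part of this file):
 *
 *    int ifloor_sketch(float a)
 *    {
 *       int itrunc = (int)a;
 *       int mask = ((float)itrunc > a) ? -1 : 0;
 *       return itrunc + mask;     // itrunc - 1 where truncation went up
 *    }
 */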
2231
2232
2233 /**
2234 * Return ceiling of float (vector), returning int (vector).
2235 * Ex: iceil( 1.1) = 2
2236 * Ex: iceil(-1.1) = -1
2237 */
2238 LLVMValueRef
2239 lp_build_iceil(struct lp_build_context *bld,
2240 LLVMValueRef a)
2241 {
2242 LLVMBuilderRef builder = bld->gallivm->builder;
2243 const struct lp_type type = bld->type;
2244 LLVMTypeRef int_vec_type = bld->int_vec_type;
2245 LLVMValueRef res;
2246
2247 assert(type.floating);
2248 assert(lp_check_value(type, a));
2249
2250 if (arch_rounding_available(type)) {
2251 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2252 }
2253 else {
2254 struct lp_type inttype;
2255 struct lp_build_context intbld;
2256 LLVMValueRef trunc, itrunc, mask;
2257
2258 assert(type.floating);
2259 assert(lp_check_value(type, a));
2260
2261 inttype = type;
2262 inttype.floating = 0;
2263 lp_build_context_init(&intbld, bld->gallivm, inttype);
2264
2265 /* round by truncation */
2266 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2267 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2268
2269 /*
2270 * fix values if rounding is wrong (for non-special cases)
2271 * - this is the case if trunc < a
2272 * The results of doing this with NaNs, very large values etc.
2273 * are undefined but this seems to be the case anyway.
2274 */
2275 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2276 /* cheapie plus one with mask since the mask is minus one / zero */
2277 return lp_build_sub(&intbld, itrunc, mask);
2278 }
2279
2280    /* convert to integer, rounding toward zero */
2281 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2282
2283 return res;
2284 }
2285
2286
2287 /**
2288 * Combined ifloor() & fract().
2289 *
2290  * Preferred over calling the functions separately, as it will pick whichever
2291  * strategy (floor() vs. ifloor()) results in less redundant work.
2292 */
2293 void
2294 lp_build_ifloor_fract(struct lp_build_context *bld,
2295 LLVMValueRef a,
2296 LLVMValueRef *out_ipart,
2297 LLVMValueRef *out_fpart)
2298 {
2299 LLVMBuilderRef builder = bld->gallivm->builder;
2300 const struct lp_type type = bld->type;
2301 LLVMValueRef ipart;
2302
2303 assert(type.floating);
2304 assert(lp_check_value(type, a));
2305
2306 if (arch_rounding_available(type)) {
2307 /*
2308 * floor() is easier.
2309 */
2310
2311 ipart = lp_build_floor(bld, a);
2312 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2313 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2314 }
2315 else {
2316 /*
2317 * ifloor() is easier.
2318 */
2319
2320 *out_ipart = lp_build_ifloor(bld, a);
2321 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2322 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2323 }
2324 }
2325
2326
2327 /**
2328 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2329 * always smaller than one.
2330 */
2331 void
2332 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2333 LLVMValueRef a,
2334 LLVMValueRef *out_ipart,
2335 LLVMValueRef *out_fpart)
2336 {
2337 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2338 *out_fpart = clamp_fract(bld, *out_fpart);
2339 }
2340
2341
2342 LLVMValueRef
2343 lp_build_sqrt(struct lp_build_context *bld,
2344 LLVMValueRef a)
2345 {
2346 LLVMBuilderRef builder = bld->gallivm->builder;
2347 const struct lp_type type = bld->type;
2348 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2349 char intrinsic[32];
2350
2351 assert(lp_check_value(type, a));
2352
2353 assert(type.floating);
2354 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2355
2356 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2357 }
2358
2359
2360 /**
2361  * Do one Newton-Raphson step to improve reciprocal precision:
2362 *
2363 * x_{i+1} = x_i * (2 - a * x_i)
2364 *
2365 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2366 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2367  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2368 * halo. It would be necessary to clamp the argument to prevent this.
2369 *
2370 * See also:
2371 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2372 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2373 */
2374 static inline LLVMValueRef
2375 lp_build_rcp_refine(struct lp_build_context *bld,
2376 LLVMValueRef a,
2377 LLVMValueRef rcp_a)
2378 {
2379 LLVMBuilderRef builder = bld->gallivm->builder;
2380 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2381 LLVMValueRef res;
2382
2383 res = LLVMBuildFMul(builder, a, rcp_a, "");
2384 res = LLVMBuildFSub(builder, two, res, "");
2385 res = LLVMBuildFMul(builder, rcp_a, res, "");
2386
2387 return res;
2388 }
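
/*
 * Numeric sanity check of the step above (illustrative, not from the
 * source): for a = 3 and an estimate x0 = 0.33, one iteration gives
 *
 *    x1 = 0.33 * (2 - 3 * 0.33) = 0.33 * 1.01 = 0.3333
 *
 * roughly doubling the number of correct digits toward 1/3.
 */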
2389
2390
2391 LLVMValueRef
2392 lp_build_rcp(struct lp_build_context *bld,
2393 LLVMValueRef a)
2394 {
2395 LLVMBuilderRef builder = bld->gallivm->builder;
2396 const struct lp_type type = bld->type;
2397
2398 assert(lp_check_value(type, a));
2399
2400 if(a == bld->zero)
2401 return bld->undef;
2402 if(a == bld->one)
2403 return bld->one;
2404 if(a == bld->undef)
2405 return bld->undef;
2406
2407 assert(type.floating);
2408
2409 if(LLVMIsConstant(a))
2410 return LLVMConstFDiv(bld->one, a);
2411
2412 /*
2413 * We don't use RCPPS because:
2414     * - it only has 10 bits of precision
2415     * - it doesn't even get the reciprocal of 1.0 exactly
2416     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2417     * - for recent processors the benefit over DIVPS is marginal and case
2418     *   dependent
2419     *
2420     * We could still use it on certain processors if benchmarks show that the
2421     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2422     * particular uses that require fewer workarounds.
2423 */
2424
2425 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2426 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2427 const unsigned num_iterations = 0;
2428 LLVMValueRef res;
2429 unsigned i;
2430 const char *intrinsic = NULL;
2431
2432 if (type.length == 4) {
2433 intrinsic = "llvm.x86.sse.rcp.ps";
2434 }
2435 else {
2436 intrinsic = "llvm.x86.avx.rcp.ps.256";
2437 }
2438
2439 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2440
2441 for (i = 0; i < num_iterations; ++i) {
2442 res = lp_build_rcp_refine(bld, a, res);
2443 }
2444
2445 return res;
2446 }
2447
2448 return LLVMBuildFDiv(builder, bld->one, a, "");
2449 }
2450
2451
2452 /**
2453 * Do one Newton-Raphson step to improve rsqrt precision:
2454 *
2455 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2456 *
2457 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2458 */
2459 static inline LLVMValueRef
2460 lp_build_rsqrt_refine(struct lp_build_context *bld,
2461 LLVMValueRef a,
2462 LLVMValueRef rsqrt_a)
2463 {
2464 LLVMBuilderRef builder = bld->gallivm->builder;
2465 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2466 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2467 LLVMValueRef res;
2468
2469 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2470 res = LLVMBuildFMul(builder, a, res, "");
2471 res = LLVMBuildFSub(builder, three, res, "");
2472 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2473 res = LLVMBuildFMul(builder, half, res, "");
2474
2475 return res;
2476 }
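
/*
 * Numeric sanity check (illustrative): for a = 4 and an estimate x0 = 0.4,
 * one iteration gives
 *
 *    x1 = 0.5 * 0.4 * (3 - 4 * 0.4 * 0.4) = 0.5 * 0.4 * 2.36 = 0.472
 *
 * moving toward the exact value 0.5.
 */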
2477
2478
2479 /**
2480 * Generate 1/sqrt(a).
2481 * Result is undefined for values < 0, infinity for +0.
2482 */
2483 LLVMValueRef
2484 lp_build_rsqrt(struct lp_build_context *bld,
2485 LLVMValueRef a)
2486 {
2487 const struct lp_type type = bld->type;
2488
2489 assert(lp_check_value(type, a));
2490
2491 assert(type.floating);
2492
2493 /*
2494 * This should be faster but all denormals will end up as infinity.
2495 */
2496 if (0 && lp_build_fast_rsqrt_available(type)) {
2497 const unsigned num_iterations = 1;
2498 LLVMValueRef res;
2499 unsigned i;
2500
2501 /* rsqrt(1.0) != 1.0 here */
2502 res = lp_build_fast_rsqrt(bld, a);
2503
2504 if (num_iterations) {
2505 /*
2506 * Newton-Raphson will result in NaN instead of infinity for zero,
2507 * and NaN instead of zero for infinity.
2508 * Also, need to ensure rsqrt(1.0) == 1.0.
2509 * All numbers smaller than FLT_MIN will result in +infinity
2510 * (rsqrtps treats all denormals as zero).
2511 */
2512 LLVMValueRef cmp;
2513 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2514 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2515
2516 for (i = 0; i < num_iterations; ++i) {
2517 res = lp_build_rsqrt_refine(bld, a, res);
2518 }
2519 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2520 res = lp_build_select(bld, cmp, inf, res);
2521 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2522 res = lp_build_select(bld, cmp, bld->zero, res);
2523 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2524 res = lp_build_select(bld, cmp, bld->one, res);
2525 }
2526
2527 return res;
2528 }
2529
2530 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2531 }
2532
2533 /**
2534  * Returns true if a fast (but inaccurate) rsqrt instruction is available.
2535  * (Callers may want to avoid calling rsqrt_fast if it's not available:
2536  * e.g. for calculating x^0.5 they may do rsqrt_fast(x) * x, but if that is
2537  * unavailable it would turn into sqrt/div/mul, so it is obviously
2538  * much better to just call sqrt, skipping both div and mul.)
2539 */
2540 boolean
2541 lp_build_fast_rsqrt_available(struct lp_type type)
2542 {
2543 assert(type.floating);
2544
2545 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2546 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2547 return true;
2548 }
2549 return false;
2550 }
2551
2552
2553 /**
2554 * Generate 1/sqrt(a).
2555 * Result is undefined for values < 0, infinity for +0.
2556 * Precision is limited, only ~10 bits guaranteed
2557 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2558 */
2559 LLVMValueRef
2560 lp_build_fast_rsqrt(struct lp_build_context *bld,
2561 LLVMValueRef a)
2562 {
2563 LLVMBuilderRef builder = bld->gallivm->builder;
2564 const struct lp_type type = bld->type;
2565
2566 assert(lp_check_value(type, a));
2567
2568 if (lp_build_fast_rsqrt_available(type)) {
2569 const char *intrinsic = NULL;
2570
2571 if (type.length == 4) {
2572 intrinsic = "llvm.x86.sse.rsqrt.ps";
2573 }
2574 else {
2575 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2576 }
2577 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2578 }
2579 else {
2580 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2581 }
2582 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2583 }
2584
2585
2586 /**
2587 * Generate sin(a) or cos(a) using polynomial approximation.
2588  * TODO: it might be worth recognizing sin and cos with the same source
2589  * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2590  * much cheaper than calculating (nearly) everything twice...
2591  * Not sure it's common enough to be worth bothering, however; the scs
2592  * opcode could also benefit from calculating both.
2593 */
2594 static LLVMValueRef
2595 lp_build_sin_or_cos(struct lp_build_context *bld,
2596 LLVMValueRef a,
2597 boolean cos)
2598 {
2599 struct gallivm_state *gallivm = bld->gallivm;
2600 LLVMBuilderRef b = gallivm->builder;
2601 struct lp_type int_type = lp_int_type(bld->type);
2602
2603 /*
2604 * take the absolute value,
2605 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2606 */
2607
2608 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2609 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2610
2611 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2612 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2613
2614 /*
2615 * scale by 4/Pi
2616 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2617 */
2618
2619 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2620 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2621
2622 /*
2623 * store the integer part of y in mm0
2624 * emm2 = _mm_cvttps_epi32(y);
2625 */
2626
2627 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2628
2629 /*
2630 * j=(j+1) & (~1) (see the cephes sources)
2631 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2632 */
2633
2634 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2635 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2636 /*
2637 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2638 */
2639 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2640 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2641
2642 /*
2643 * y = _mm_cvtepi32_ps(emm2);
2644 */
2645 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2646
2647 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2648 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2649 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2650 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2651
2652 /*
2653 * Argument used for poly selection and sign bit determination
2654 * is different for sin vs. cos.
2655 */
2656 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2657 emm2_and;
2658
2659 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2660 LLVMBuildNot(b, emm2_2, ""), ""),
2661 const_29, "sign_bit") :
2662 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2663 LLVMBuildShl(b, emm2_add,
2664 const_29, ""), ""),
2665 sign_mask, "sign_bit");
2666
2667 /*
2668     * get the polynomial selection mask
2669     * there is one polynomial for 0 <= x <= Pi/4
2670     * and another one for Pi/4 < x <= Pi/2
2671 * Both branches will be computed.
2672 *
2673 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2674 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2675 */
2676
2677 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2678 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2679 int_type, PIPE_FUNC_EQUAL,
2680 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2681
2682 /*
2683 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2684 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2685 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2686 */
2687 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2688 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2689 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2690
2691 /*
2692 * The magic pass: "Extended precision modular arithmetic"
2693 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2694 * xmm1 = _mm_mul_ps(y, xmm1);
2695 * xmm2 = _mm_mul_ps(y, xmm2);
2696 * xmm3 = _mm_mul_ps(y, xmm3);
2697 */
2698 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2699 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2700 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2701
2702 /*
2703 * x = _mm_add_ps(x, xmm1);
2704 * x = _mm_add_ps(x, xmm2);
2705 * x = _mm_add_ps(x, xmm3);
2706 */
2707
2708 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2709 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2710 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2711
2712 /*
2713     * Evaluate the first polynomial (0 <= x <= Pi/4)
2714 *
2715 * z = _mm_mul_ps(x,x);
2716 */
2717 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2718
2719 /*
2720 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2721 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2722 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2723 */
2724 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2725 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2726 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2727
2728 /*
2729 * y = *(v4sf*)_ps_coscof_p0;
2730 * y = _mm_mul_ps(y, z);
2731 */
2732 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2733 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2734 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2735 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2736 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2737 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2738
2739
2740 /*
2741 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2742 * y = _mm_sub_ps(y, tmp);
2743 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2744 */
2745 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2746 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2747 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2748 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2749 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2750
2751 /*
2752 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2753 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2754 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2755 */
2756 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2757 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2758 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2759
2760 /*
2761     * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2762 *
2763 * y2 = *(v4sf*)_ps_sincof_p0;
2764 * y2 = _mm_mul_ps(y2, z);
2765 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2766 * y2 = _mm_mul_ps(y2, z);
2767 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2768 * y2 = _mm_mul_ps(y2, z);
2769 * y2 = _mm_mul_ps(y2, x);
2770 * y2 = _mm_add_ps(y2, x);
2771 */
2772
2773 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2774 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2775 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2776 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2777 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2778 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2779 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2780
2781 /*
2782     * select the correct result from the two polynomials
2783 * xmm3 = poly_mask;
2784 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2785 * y = _mm_andnot_ps(xmm3, y);
2786 * y = _mm_or_ps(y,y2);
2787 */
2788 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2789 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2790 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2791 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2792 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2793 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2794
2795 /*
2796 * update the sign
2797 * y = _mm_xor_ps(y, sign_bit);
2798 */
2799 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2800 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2801
2802 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2803
2804 /* clamp output to be within [-1, 1] */
2805 y_result = lp_build_clamp(bld, y_result,
2806 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2807 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2808 /* If a is -inf, inf or NaN then return NaN */
2809 y_result = lp_build_select(bld, isfinite, y_result,
2810 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2811 return y_result;
2812 }
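
/*
 * Rough scalar outline of the cephes-style scheme above (sketch only, the
 * octant and sign bookkeeping is simplified):
 *
 *    x = fabsf(a);
 *    j = ((int)(x * 4.0/M_PI) + 1) & ~1;   // even number of Pi/4 units
 *    x = x - j * (float)(M_PI/4.0);        // done in three steps (DP1..DP3)
 *    z = x * x;
 *    r = pick_octant(j) ? cos_poly(z) : x * sin_poly(z);
 *    return restore_sign(r, j, a);         // from the octant and sign of a
 */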
2813
2814
2815 /**
2816 * Generate sin(a)
2817 */
2818 LLVMValueRef
2819 lp_build_sin(struct lp_build_context *bld,
2820 LLVMValueRef a)
2821 {
2822 return lp_build_sin_or_cos(bld, a, FALSE);
2823 }
2824
2825
2826 /**
2827 * Generate cos(a)
2828 */
2829 LLVMValueRef
2830 lp_build_cos(struct lp_build_context *bld,
2831 LLVMValueRef a)
2832 {
2833 return lp_build_sin_or_cos(bld, a, TRUE);
2834 }
2835
2836
2837 /**
2838 * Generate pow(x, y)
2839 */
2840 LLVMValueRef
2841 lp_build_pow(struct lp_build_context *bld,
2842 LLVMValueRef x,
2843 LLVMValueRef y)
2844 {
2845 /* TODO: optimize the constant case */
2846 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2847 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2848 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2849 __FUNCTION__);
2850 }
2851
2852 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2853 }
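
/*
 * For reference, the identity used above is pow(x, y) = 2^(y * log2(x)),
 * e.g. pow(2, 10) = exp2(10 * log2(2)) = exp2(10) = 1024.  Like the
 * underlying log2, it is only well defined for x > 0.
 */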
2854
2855
2856 /**
2857 * Generate exp(x)
2858 */
2859 LLVMValueRef
2860 lp_build_exp(struct lp_build_context *bld,
2861 LLVMValueRef x)
2862 {
2863 /* log2(e) = 1/log(2) */
2864 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2865 1.4426950408889634);
2866
2867 assert(lp_check_value(bld->type, x));
2868
2869 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2870 }
2871
2872
2873 /**
2874 * Generate log(x)
2875 * Behavior is undefined with infs, 0s and nans
2876 */
2877 LLVMValueRef
2878 lp_build_log(struct lp_build_context *bld,
2879 LLVMValueRef x)
2880 {
2881 /* log(2) */
2882 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2883 0.69314718055994529);
2884
2885 assert(lp_check_value(bld->type, x));
2886
2887 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2888 }
2889
2890 /**
2891 * Generate log(x) that handles edge cases (infs, 0s and nans)
2892 */
2893 LLVMValueRef
2894 lp_build_log_safe(struct lp_build_context *bld,
2895 LLVMValueRef x)
2896 {
2897 /* log(2) */
2898 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2899 0.69314718055994529);
2900
2901 assert(lp_check_value(bld->type, x));
2902
2903 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2904 }
2905
2906
2907 /**
2908 * Generate polynomial.
2909 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2910 */
2911 LLVMValueRef
2912 lp_build_polynomial(struct lp_build_context *bld,
2913 LLVMValueRef x,
2914 const double *coeffs,
2915 unsigned num_coeffs)
2916 {
2917 const struct lp_type type = bld->type;
2918 LLVMValueRef even = NULL, odd = NULL;
2919 LLVMValueRef x2;
2920 unsigned i;
2921
2922 assert(lp_check_value(bld->type, x));
2923
2924 /* TODO: optimize the constant case */
2925 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2926 LLVMIsConstant(x)) {
2927 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2928 __FUNCTION__);
2929 }
2930
2931 /*
2932     * Calculate odd and even terms separately to decrease data dependency
2933 * Ex:
2934 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2935 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2936 */
2937 x2 = lp_build_mul(bld, x, x);
2938
2939 for (i = num_coeffs; i--; ) {
2940 LLVMValueRef coeff;
2941
2942 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2943
2944 if (i % 2 == 0) {
2945 if (even)
2946 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2947 else
2948 even = coeff;
2949 } else {
2950 if (odd)
2951 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2952 else
2953 odd = coeff;
2954 }
2955 }
2956
2957 if (odd)
2958 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2959 else if (even)
2960 return even;
2961 else
2962 return bld->undef;
2963 }
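
/*
 * For example, with four coefficients the loop above computes (sketch):
 *
 *    even = c[0] + x2 * c[2]
 *    odd  = c[1] + x2 * c[3]
 *    p(x) = odd * x + even
 *
 * i.e. a Horner evaluation split into two independent chains over x2 = x*x,
 * which shortens the dependency chain compared to plain Horner.
 */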
2964
2965
2966 /**
2967 * Minimax polynomial fit of 2**x, in range [0, 1[
2968 */
2969 const double lp_build_exp2_polynomial[] = {
2970 #if EXP_POLY_DEGREE == 5
2971 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
2972 0.693153073200168932794,
2973 0.240153617044375388211,
2974 0.0558263180532956664775,
2975 0.00898934009049466391101,
2976 0.00187757667519147912699
2977 #elif EXP_POLY_DEGREE == 4
2978 1.00000259337069434683,
2979 0.693003834469974940458,
2980 0.24144275689150793076,
2981 0.0520114606103070150235,
2982 0.0135341679161270268764
2983 #elif EXP_POLY_DEGREE == 3
2984 0.999925218562710312959,
2985 0.695833540494823811697,
2986 0.226067155427249155588,
2987 0.0780245226406372992967
2988 #elif EXP_POLY_DEGREE == 2
2989 1.00172476321474503578,
2990 0.657636275736077639316,
2991 0.33718943461968720704
2992 #else
2993 #error
2994 #endif
2995 };
2996
2997
2998 LLVMValueRef
2999 lp_build_exp2(struct lp_build_context *bld,
3000 LLVMValueRef x)
3001 {
3002 LLVMBuilderRef builder = bld->gallivm->builder;
3003 const struct lp_type type = bld->type;
3004 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3005 LLVMValueRef ipart = NULL;
3006 LLVMValueRef fpart = NULL;
3007 LLVMValueRef expipart = NULL;
3008 LLVMValueRef expfpart = NULL;
3009 LLVMValueRef res = NULL;
3010
3011 assert(lp_check_value(bld->type, x));
3012
3013 /* TODO: optimize the constant case */
3014 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3015 LLVMIsConstant(x)) {
3016 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3017 __FUNCTION__);
3018 }
3019
3020 assert(type.floating && type.width == 32);
3021
3022    /* We want to preserve NaN and make sure that for exp2 if x > 128,
3023 * the result is INF and if it's smaller than -126.9 the result is 0 */
3024 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3025 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3026 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3027 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3028
3029 /* ipart = floor(x) */
3030 /* fpart = x - ipart */
3031 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3032
3033 /* expipart = (float) (1 << ipart) */
3034 expipart = LLVMBuildAdd(builder, ipart,
3035 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3036 expipart = LLVMBuildShl(builder, expipart,
3037 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3038 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3039
3040 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3041 Elements(lp_build_exp2_polynomial));
3042
3043 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3044
3045 return res;
3046 }
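
/*
 * Outline of the decomposition above (sketch only, bits_to_float() is a
 * hypothetical helper): 2^x = 2^ipart * 2^fpart with ipart = floor(x) and
 * fpart in [0, 1[.  2^ipart is built directly in the IEEE-754 exponent
 * field and 2^fpart comes from the minimax polynomial:
 *
 *    ipart    = ifloor(x);
 *    fpart    = x - ipart;
 *    expipart = bits_to_float((ipart + 127) << 23);  // exact power of two
 *    result   = expipart * poly(fpart);
 */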
3047
3048
3049
3050 /**
3051 * Extract the exponent of a IEEE-754 floating point value.
3052 *
3053 * Optionally apply an integer bias.
3054 *
3055 * Result is an integer value with
3056 *
3057 * ifloor(log2(x)) + bias
3058 */
3059 LLVMValueRef
3060 lp_build_extract_exponent(struct lp_build_context *bld,
3061 LLVMValueRef x,
3062 int bias)
3063 {
3064 LLVMBuilderRef builder = bld->gallivm->builder;
3065 const struct lp_type type = bld->type;
3066 unsigned mantissa = lp_mantissa(type);
3067 LLVMValueRef res;
3068
3069 assert(type.floating);
3070
3071 assert(lp_check_value(bld->type, x));
3072
3073 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3074
3075 res = LLVMBuildLShr(builder, x,
3076 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3077 res = LLVMBuildAnd(builder, res,
3078 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3079 res = LLVMBuildSub(builder, res,
3080 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3081
3082 return res;
3083 }
3084
3085
3086 /**
3087  * Extract the mantissa of a floating point value.
3088 *
3089 * Result is a floating point value with
3090 *
3091  *   x / 2**floor(log2(x))
3092 */
3093 LLVMValueRef
3094 lp_build_extract_mantissa(struct lp_build_context *bld,
3095 LLVMValueRef x)
3096 {
3097 LLVMBuilderRef builder = bld->gallivm->builder;
3098 const struct lp_type type = bld->type;
3099 unsigned mantissa = lp_mantissa(type);
3100 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3101 (1ULL << mantissa) - 1);
3102 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3103 LLVMValueRef res;
3104
3105 assert(lp_check_value(bld->type, x));
3106
3107 assert(type.floating);
3108
3109 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3110
3111 /* res = x / 2**ipart */
3112 res = LLVMBuildAnd(builder, x, mantmask, "");
3113 res = LLVMBuildOr(builder, res, one, "");
3114 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3115
3116 return res;
3117 }
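
/*
 * Both helpers above just pick apart the IEEE-754 single precision layout
 * (1 sign bit, 8 exponent bits, 23 mantissa bits).  Scalar sketch with
 * hypothetical bits_of()/bits_to_float() helpers:
 *
 *    exponent(x) = (int)((bits_of(x) >> 23) & 0xff) - 127
 *    mantissa(x) = bits_to_float((bits_of(x) & 0x7fffff) | bits_of(1.0f))
 *
 * so for normalized positive x, exponent(x) is floor(log2(x)) and
 * mantissa(x) is x scaled into [1, 2[.
 */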
3118
3119
3120
3121 /**
3122  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3123  * These coefficients can be generated with
3124 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3125 */
3126 const double lp_build_log2_polynomial[] = {
3127 #if LOG_POLY_DEGREE == 5
3128 2.88539008148777786488L,
3129 0.961796878841293367824L,
3130 0.577058946784739859012L,
3131 0.412914355135828735411L,
3132 0.308591899232910175289L,
3133 0.352376952300281371868L,
3134 #elif LOG_POLY_DEGREE == 4
3135 2.88539009343309178325L,
3136 0.961791550404184197881L,
3137 0.577440339438736392009L,
3138 0.403343858251329912514L,
3139 0.406718052498846252698L,
3140 #elif LOG_POLY_DEGREE == 3
3141 2.88538959748872753838L,
3142 0.961932915889597772928L,
3143 0.571118517972136195241L,
3144 0.493997535084709500285L,
3145 #else
3146 #error
3147 #endif
3148 };
3149
3150 /**
3151 * See http://www.devmaster.net/forums/showthread.php?p=43580
3152 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3153 * http://www.nezumi.demon.co.uk/consult/logx.htm
3154 *
3155 * If handle_edge_cases is true the function will perform computations
3156 * to match the required D3D10+ behavior for each of the edge cases.
3157 * That means that if input is:
3158  * - less than zero (down to and including -inf), then NaN will be returned
3159 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3160 * - +infinity, then +infinity will be returned
3161 * - NaN, then NaN will be returned
3162 *
3163 * Those checks are fairly expensive so if you don't need them make sure
3164 * handle_edge_cases is false.
3165 */
3166 void
3167 lp_build_log2_approx(struct lp_build_context *bld,
3168 LLVMValueRef x,
3169 LLVMValueRef *p_exp,
3170 LLVMValueRef *p_floor_log2,
3171 LLVMValueRef *p_log2,
3172 boolean handle_edge_cases)
3173 {
3174 LLVMBuilderRef builder = bld->gallivm->builder;
3175 const struct lp_type type = bld->type;
3176 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3177 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3178
3179 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3180 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3181 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3182
3183 LLVMValueRef i = NULL;
3184 LLVMValueRef y = NULL;
3185 LLVMValueRef z = NULL;
3186 LLVMValueRef exp = NULL;
3187 LLVMValueRef mant = NULL;
3188 LLVMValueRef logexp = NULL;
3189 LLVMValueRef logmant = NULL;
3190 LLVMValueRef res = NULL;
3191
3192 assert(lp_check_value(bld->type, x));
3193
3194 if(p_exp || p_floor_log2 || p_log2) {
3195 /* TODO: optimize the constant case */
3196 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3197 LLVMIsConstant(x)) {
3198 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3199 __FUNCTION__);
3200 }
3201
3202 assert(type.floating && type.width == 32);
3203
3204 /*
3205 * We don't explicitly handle denormalized numbers. They will yield a
3206        * result in the neighbourhood of -127, which appears to be
3207        * adequate.
3208 */
3209
3210 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3211
3212 /* exp = (float) exponent(x) */
3213 exp = LLVMBuildAnd(builder, i, expmask, "");
3214 }
3215
3216 if(p_floor_log2 || p_log2) {
3217 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3218 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3219 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3220 }
3221
3222 if (p_log2) {
3223 /* mant = 1 + (float) mantissa(x) */
3224 mant = LLVMBuildAnd(builder, i, mantmask, "");
3225 mant = LLVMBuildOr(builder, mant, one, "");
3226 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3227
3228 /* y = (mant - 1) / (mant + 1) */
3229 y = lp_build_div(bld,
3230 lp_build_sub(bld, mant, bld->one),
3231 lp_build_add(bld, mant, bld->one)
3232 );
3233
3234 /* z = y^2 */
3235 z = lp_build_mul(bld, y, y);
3236
3237 /* compute P(z) */
3238 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3239 Elements(lp_build_log2_polynomial));
3240
3241 /* logmant = y * P(z) */
3242 logmant = lp_build_mul(bld, y, logmant);
3243
3244 res = lp_build_add(bld, logmant, logexp);
3245
3246 if (type.floating && handle_edge_cases) {
3247 LLVMValueRef negmask, infmask, zmask;
3248 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3249 lp_build_const_vec(bld->gallivm, type, 0.0f));
3250 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3251 lp_build_const_vec(bld->gallivm, type, 0.0f));
3252 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3253 lp_build_const_vec(bld->gallivm, type, INFINITY));
3254
3255          /* If x is equal to inf make sure we return inf */
3256 res = lp_build_select(bld, infmask,
3257 lp_build_const_vec(bld->gallivm, type, INFINITY),
3258 res);
3259          /* If x is equal to 0, return -inf */
3260 res = lp_build_select(bld, zmask,
3261 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3262 res);
3263 /* If x is nan or less than 0, return nan */
3264 res = lp_build_select(bld, negmask,
3265 lp_build_const_vec(bld->gallivm, type, NAN),
3266 res);
3267 }
3268 }
3269
3270 if (p_exp) {
3271 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3272 *p_exp = exp;
3273 }
3274
3275 if (p_floor_log2)
3276 *p_floor_log2 = logexp;
3277
3278 if (p_log2)
3279 *p_log2 = res;
3280 }
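
/*
 * The mantissa part above uses the atanh-style series: for m in [1, 2[ and
 * y = (m - 1)/(m + 1) (so y^2 < 1/9, matching the polynomial's fit range),
 *
 *    log2(m) = 2/ln(2) * atanh(y) ~= y * P(y^2)
 *
 * with P the minimax polynomial defined above, and the final result is
 * log2(x) = exponent(x) + log2(mantissa(x)).
 */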
3281
3282
3283 /*
3284 * log2 implementation which doesn't have special code to
3285 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3286 * the results for those cases are undefined.
3287 */
3288 LLVMValueRef
3289 lp_build_log2(struct lp_build_context *bld,
3290 LLVMValueRef x)
3291 {
3292 LLVMValueRef res;
3293 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3294 return res;
3295 }
3296
3297 /*
3298 * Version of log2 which handles all edge cases.
3299 * Look at documentation of lp_build_log2_approx for
3300 * description of the behavior for each of the edge cases.
3301 */
3302 LLVMValueRef
3303 lp_build_log2_safe(struct lp_build_context *bld,
3304 LLVMValueRef x)
3305 {
3306 LLVMValueRef res;
3307 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3308 return res;
3309 }
3310
3311
3312 /**
3313 * Faster (and less accurate) log2.
3314 *
3315 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3316 *
3317 * Piece-wise linear approximation, with exact results when x is a
3318 * power of two.
3319 *
3320 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3321 */
3322 LLVMValueRef
3323 lp_build_fast_log2(struct lp_build_context *bld,
3324 LLVMValueRef x)
3325 {
3326 LLVMBuilderRef builder = bld->gallivm->builder;
3327 LLVMValueRef ipart;
3328 LLVMValueRef fpart;
3329
3330 assert(lp_check_value(bld->type, x));
3331
3332 assert(bld->type.floating);
3333
3334 /* ipart = floor(log2(x)) - 1 */
3335 ipart = lp_build_extract_exponent(bld, x, -1);
3336 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3337
3338 /* fpart = x / 2**ipart */
3339 fpart = lp_build_extract_mantissa(bld, x);
3340
3341 /* ipart + fpart */
3342 return LLVMBuildFAdd(builder, ipart, fpart, "");
3343 }
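
/*
 * Example of the piecewise linear approximation above (illustrative): for
 * x = 3.0 the exponent is 1 and the mantissa is 1.5, so fast_log2(3.0) =
 * (1 - 1) + 1.5 = 1.5, versus the exact log2(3.0) ~= 1.585.  At powers of
 * two the mantissa is exactly 1.0 and the result is exact.
 */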
3344
3345
3346 /**
3347 * Fast implementation of iround(log2(x)).
3348 *
3349 * Not an approximation -- it should give accurate results all the time.
3350 */
3351 LLVMValueRef
3352 lp_build_ilog2(struct lp_build_context *bld,
3353 LLVMValueRef x)
3354 {
3355 LLVMBuilderRef builder = bld->gallivm->builder;
3356 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3357 LLVMValueRef ipart;
3358
3359 assert(bld->type.floating);
3360
3361 assert(lp_check_value(bld->type, x));
3362
3363 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3364 x = LLVMBuildFMul(builder, x, sqrt2, "");
3365
3366 /* ipart = floor(log2(x) + 0.5) */
3367 ipart = lp_build_extract_exponent(bld, x, 0);
3368
3369 return ipart;
3370 }
3371
3372 LLVMValueRef
3373 lp_build_mod(struct lp_build_context *bld,
3374 LLVMValueRef x,
3375 LLVMValueRef y)
3376 {
3377 LLVMBuilderRef builder = bld->gallivm->builder;
3378 LLVMValueRef res;
3379 const struct lp_type type = bld->type;
3380
3381 assert(lp_check_value(type, x));
3382 assert(lp_check_value(type, y));
3383
3384 if (type.floating)
3385 res = LLVMBuildFRem(builder, x, y, "");
3386 else if (type.sign)
3387 res = LLVMBuildSRem(builder, x, y, "");
3388 else
3389 res = LLVMBuildURem(builder, x, y, "");
3390 return res;
3391 }
3392
3393
3394 /*
3395 * For floating inputs it creates and returns a mask
3396 * which is all 1's for channels which are NaN.
3397 * Channels inside x which are not NaN will be 0.
3398 */
3399 LLVMValueRef
3400 lp_build_isnan(struct lp_build_context *bld,
3401 LLVMValueRef x)
3402 {
3403 LLVMValueRef mask;
3404 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3405
3406 assert(bld->type.floating);
3407 assert(lp_check_value(bld->type, x));
3408
3409 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3410 "isnotnan");
3411 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3412 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3413 return mask;
3414 }
3415
3416 /* Returns all 1's for floating point numbers that are
3417  * finite and returns all zeros for -inf,
3418  * inf and NaNs */
3419 LLVMValueRef
3420 lp_build_isfinite(struct lp_build_context *bld,
3421 LLVMValueRef x)
3422 {
3423 LLVMBuilderRef builder = bld->gallivm->builder;
3424 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3425 struct lp_type int_type = lp_int_type(bld->type);
3426 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3427 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3428 0x7f800000);
3429
3430 if (!bld->type.floating) {
3431 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3432 }
3433 assert(bld->type.floating);
3434 assert(lp_check_value(bld->type, x));
3435 assert(bld->type.width == 32);
3436
3437 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3438 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3439 intx, infornan32);
3440 }
3441
3442 /*
3443 * Returns true if the number is nan or inf and false otherwise.
3444 * The input has to be a floating point vector.
3445 */
3446 LLVMValueRef
3447 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3448 const struct lp_type type,
3449 LLVMValueRef x)
3450 {
3451 LLVMBuilderRef builder = gallivm->builder;
3452 struct lp_type int_type = lp_int_type(type);
3453 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3454 0x7f800000);
3455 LLVMValueRef ret;
3456
3457 assert(type.floating);
3458
3459 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3460 ret = LLVMBuildAnd(builder, ret, const0, "");
3461 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3462 ret, const0);
3463
3464 return ret;
3465 }
3466
3467
3468 LLVMValueRef
3469 lp_build_fpstate_get(struct gallivm_state *gallivm)
3470 {
3471 if (util_cpu_caps.has_sse) {
3472 LLVMBuilderRef builder = gallivm->builder;
3473 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3474 gallivm,
3475 LLVMInt32TypeInContext(gallivm->context),
3476 "mxcsr_ptr");
3477 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3478 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3479 lp_build_intrinsic(builder,
3480 "llvm.x86.sse.stmxcsr",
3481 LLVMVoidTypeInContext(gallivm->context),
3482 &mxcsr_ptr8, 1, 0);
3483 return mxcsr_ptr;
3484 }
3485 return 0;
3486 }
3487
3488 void
3489 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3490 boolean zero)
3491 {
3492 if (util_cpu_caps.has_sse) {
3493 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3494 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3495
3496 LLVMBuilderRef builder = gallivm->builder;
3497 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3498 LLVMValueRef mxcsr =
3499 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3500
3501 if (util_cpu_caps.has_daz) {
3502 /* Enable denormals are zero mode */
3503 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3504 }
3505 if (zero) {
3506 mxcsr = LLVMBuildOr(builder, mxcsr,
3507 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3508 } else {
3509 mxcsr = LLVMBuildAnd(builder, mxcsr,
3510 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3511 }
3512
3513 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3514 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3515 }
3516 }
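
/*
 * Host-side equivalent of the IR generated above, for reference (sketch
 * only, using the standard SSE control register intrinsics):
 *
 *    unsigned mxcsr = _mm_getcsr();
 *    unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
 *    _mm_setcsr(zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz));
 *
 * with the DAZ bit only included when the CPU actually supports it.
 */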
3517
3518 void
3519 lp_build_fpstate_set(struct gallivm_state *gallivm,
3520 LLVMValueRef mxcsr_ptr)
3521 {
3522 if (util_cpu_caps.has_sse) {
3523 LLVMBuilderRef builder = gallivm->builder;
3524 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3525 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3526 lp_build_intrinsic(builder,
3527 "llvm.x86.sse.ldmxcsr",
3528 LLVMVoidTypeInContext(gallivm->context),
3529 &mxcsr_ptr, 1, 0);
3530 }
3531 }