gallivm: Use llvm.fmuladd.*.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
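/*
 * Typical usage (an illustrative sketch, not part of the original file):
 * callers initialize an lp_build_context for the lp_type they operate on and
 * then use the lp_build_*() helpers below instead of raw LLVMBuild* calls.
 * The values `gallivm', `a', `b' and `c' are assumed to be provided by the
 * caller.
 *
 *    struct lp_type type;
 *    struct lp_build_context bld;
 *
 *    memset(&type, 0, sizeof type);
 *    type.floating = TRUE;
 *    type.width = 32;
 *    type.length = 4;
 *    lp_build_context_init(&bld, gallivm, type);
 *
 *    LLVMValueRef ab  = lp_build_mul(&bld, a, b);                  // a * b
 *    LLVMValueRef mad = lp_build_mad(&bld, a, b, c);               // a * b + c
 *    LLVMValueRef sat = lp_build_clamp(&bld, mad, bld.zero, bld.one);
 */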
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for special case values of a or b (such as 0 or 1) are done.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
146 intr_size = 128;
147 if ((type.width == 8 || type.width == 16) &&
148 (type.width * type.length <= 64) &&
149 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
150 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
151 __FUNCTION__);
152 }
153 if (type.width == 8 && !type.sign) {
154 intrinsic = "llvm.x86.sse2.pminu.b";
155 }
156 else if (type.width == 16 && type.sign) {
157 intrinsic = "llvm.x86.sse2.pmins.w";
158 }
159 if (util_cpu_caps.has_sse4_1) {
160 if (type.width == 8 && type.sign) {
161 intrinsic = "llvm.x86.sse41.pminsb";
162 }
163 if (type.width == 16 && !type.sign) {
164 intrinsic = "llvm.x86.sse41.pminuw";
165 }
166 if (type.width == 32 && !type.sign) {
167 intrinsic = "llvm.x86.sse41.pminud";
168 }
169 if (type.width == 32 && type.sign) {
170 intrinsic = "llvm.x86.sse41.pminsd";
171 }
172 }
173 } else if (util_cpu_caps.has_altivec) {
174 intr_size = 128;
175 if (type.width == 8) {
176 if (!type.sign) {
177 intrinsic = "llvm.ppc.altivec.vminub";
178 } else {
179 intrinsic = "llvm.ppc.altivec.vminsb";
180 }
181 } else if (type.width == 16) {
182 if (!type.sign) {
183 intrinsic = "llvm.ppc.altivec.vminuh";
184 } else {
185 intrinsic = "llvm.ppc.altivec.vminsh";
186 }
187 } else if (type.width == 32) {
188 if (!type.sign) {
189 intrinsic = "llvm.ppc.altivec.vminuw";
190 } else {
191 intrinsic = "llvm.ppc.altivec.vminsw";
192 }
193 }
194 }
195
196 if (intrinsic) {
197 /* We need to handle NaNs for floating point numbers. If one of the
198 * inputs is NaN the other should be returned (required by both D3D10+
199 * and OpenCL).
200 * The SSE intrinsics return the second operand in case of NaN by
201 * default, so we need special code to handle those.
202 */
203 if (util_cpu_caps.has_sse && type.floating &&
204 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
205 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
206 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
207 LLVMValueRef isnan, min;
208 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
209 type,
210 intr_size, a, b);
211 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
212 isnan = lp_build_isnan(bld, b);
213 return lp_build_select(bld, isnan, a, min);
214 } else {
215 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
216 isnan = lp_build_isnan(bld, a);
217 return lp_build_select(bld, isnan, a, min);
218 }
219 } else {
220 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
221 type,
222 intr_size, a, b);
223 }
224 }
225
226 if (type.floating) {
227 switch (nan_behavior) {
228 case GALLIVM_NAN_RETURN_NAN: {
229 LLVMValueRef isnan = lp_build_isnan(bld, b);
230 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
231 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
232 return lp_build_select(bld, cond, a, b);
233 }
234 break;
235 case GALLIVM_NAN_RETURN_OTHER: {
236 LLVMValueRef isnan = lp_build_isnan(bld, a);
237 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
238 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
239 return lp_build_select(bld, cond, a, b);
240 }
241 break;
242 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
243 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
244 return lp_build_select(bld, cond, a, b);
245 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
246 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
247 return lp_build_select(bld, cond, b, a);
248 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
249 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
250 return lp_build_select(bld, cond, a, b);
251 break;
252 default:
253 assert(0);
254 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
255 return lp_build_select(bld, cond, a, b);
256 }
257 } else {
258 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 }
261 }
262
263
264 LLVMValueRef
265 lp_build_fmuladd(LLVMBuilderRef builder,
266 LLVMValueRef a,
267 LLVMValueRef b,
268 LLVMValueRef c)
269 {
270 LLVMTypeRef type = LLVMTypeOf(a);
271 assert(type == LLVMTypeOf(b));
272 assert(type == LLVMTypeOf(c));
273 char intrinsic[32];
274 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
275 LLVMValueRef args[] = { a, b, c };
276 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
277 }
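
/*
 * Example (an illustrative sketch, not part of the original file): how a
 * caller might use lp_build_fmuladd() to accumulate a 4-component dot
 * product.  The `coef' and `src' arrays are hypothetical values assumed to
 * be provided by the caller and to have matching float (vector) types.
 */
#if 0
static LLVMValueRef
example_dot4(struct lp_build_context *bld,
             LLVMValueRef coef[4],
             LLVMValueRef src[4])
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sum;
   unsigned i;

   /* first term as a plain multiply ... */
   sum = LLVMBuildFMul(builder, coef[0], src[0], "");
   /* ... remaining terms emitted as llvm.fmuladd.* so the backend may fuse them */
   for (i = 1; i < 4; i++) {
      sum = lp_build_fmuladd(builder, coef[i], src[i], sum);
   }
   return sum;
}
#endif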
278
279
280 /**
281 * Generate max(a, b)
282 * No checks for special case values of a or b (such as 0 or 1) are done.
283 * NaNs are handled according to the behavior specified by the
284 * nan_behavior argument.
285 */
286 static LLVMValueRef
287 lp_build_max_simple(struct lp_build_context *bld,
288 LLVMValueRef a,
289 LLVMValueRef b,
290 enum gallivm_nan_behavior nan_behavior)
291 {
292 const struct lp_type type = bld->type;
293 const char *intrinsic = NULL;
294 unsigned intr_size = 0;
295 LLVMValueRef cond;
296
297 assert(lp_check_value(type, a));
298 assert(lp_check_value(type, b));
299
300 /* TODO: optimize the constant case */
301
302 if (type.floating && util_cpu_caps.has_sse) {
303 if (type.width == 32) {
304 if (type.length == 1) {
305 intrinsic = "llvm.x86.sse.max.ss";
306 intr_size = 128;
307 }
308 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
309 intrinsic = "llvm.x86.sse.max.ps";
310 intr_size = 128;
311 }
312 else {
313 intrinsic = "llvm.x86.avx.max.ps.256";
314 intr_size = 256;
315 }
316 }
317 if (type.width == 64 && util_cpu_caps.has_sse2) {
318 if (type.length == 1) {
319 intrinsic = "llvm.x86.sse2.max.sd";
320 intr_size = 128;
321 }
322 else if (type.length == 2 || !util_cpu_caps.has_avx) {
323 intrinsic = "llvm.x86.sse2.max.pd";
324 intr_size = 128;
325 }
326 else {
327 intrinsic = "llvm.x86.avx.max.pd.256";
328 intr_size = 256;
329 }
330 }
331 }
332 else if (type.floating && util_cpu_caps.has_altivec) {
333 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
334 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
335 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
336 __FUNCTION__);
337 }
338 if (type.width == 32 && type.length == 4) {
339 intrinsic = "llvm.ppc.altivec.vmaxfp";
340 intr_size = 128;
341 }
342 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
343 intr_size = 128;
344 if ((type.width == 8 || type.width == 16) &&
345 (type.width * type.length <= 64) &&
346 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
347 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
348 __FUNCTION__);
349 }
350 if (type.width == 8 && !type.sign) {
351 intrinsic = "llvm.x86.sse2.pmaxu.b";
352 intr_size = 128;
353 }
354 else if (type.width == 16 && type.sign) {
355 intrinsic = "llvm.x86.sse2.pmaxs.w";
356 }
357 if (util_cpu_caps.has_sse4_1) {
358 if (type.width == 8 && type.sign) {
359 intrinsic = "llvm.x86.sse41.pmaxsb";
360 }
361 if (type.width == 16 && !type.sign) {
362 intrinsic = "llvm.x86.sse41.pmaxuw";
363 }
364 if (type.width == 32 && !type.sign) {
365 intrinsic = "llvm.x86.sse41.pmaxud";
366 }
367 if (type.width == 32 && type.sign) {
368 intrinsic = "llvm.x86.sse41.pmaxsd";
369 }
370 }
371 } else if (util_cpu_caps.has_altivec) {
372 intr_size = 128;
373 if (type.width == 8) {
374 if (!type.sign) {
375 intrinsic = "llvm.ppc.altivec.vmaxub";
376 } else {
377 intrinsic = "llvm.ppc.altivec.vmaxsb";
378 }
379 } else if (type.width == 16) {
380 if (!type.sign) {
381 intrinsic = "llvm.ppc.altivec.vmaxuh";
382 } else {
383 intrinsic = "llvm.ppc.altivec.vmaxsh";
384 }
385 } else if (type.width == 32) {
386 if (!type.sign) {
387 intrinsic = "llvm.ppc.altivec.vmaxuw";
388 } else {
389 intrinsic = "llvm.ppc.altivec.vmaxsw";
390 }
391 }
392 }
393
394 if (intrinsic) {
395 if (util_cpu_caps.has_sse && type.floating &&
396 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
397 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
398 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
399 LLVMValueRef isnan, max;
400 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
401 type,
402 intr_size, a, b);
403 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
404 isnan = lp_build_isnan(bld, b);
405 return lp_build_select(bld, isnan, a, max);
406 } else {
407 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
408 isnan = lp_build_isnan(bld, a);
409 return lp_build_select(bld, isnan, a, max);
410 }
411 } else {
412 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
413 type,
414 intr_size, a, b);
415 }
416 }
417
418 if (type.floating) {
419 switch (nan_behavior) {
420 case GALLIVM_NAN_RETURN_NAN: {
421 LLVMValueRef isnan = lp_build_isnan(bld, b);
422 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
423 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
424 return lp_build_select(bld, cond, a, b);
425 }
426 break;
427 case GALLIVM_NAN_RETURN_OTHER: {
428 LLVMValueRef isnan = lp_build_isnan(bld, a);
429 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
430 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
431 return lp_build_select(bld, cond, a, b);
432 }
433 break;
434 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
435 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
436 return lp_build_select(bld, cond, a, b);
437 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
438 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
439 return lp_build_select(bld, cond, b, a);
440 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
441 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
442 return lp_build_select(bld, cond, a, b);
443 break;
444 default:
445 assert(0);
446 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
447 return lp_build_select(bld, cond, a, b);
448 }
449 } else {
450 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
451 return lp_build_select(bld, cond, a, b);
452 }
453 }
454
455
456 /**
457 * Generate 1 - a, or ~a depending on bld->type.
458 */
459 LLVMValueRef
460 lp_build_comp(struct lp_build_context *bld,
461 LLVMValueRef a)
462 {
463 LLVMBuilderRef builder = bld->gallivm->builder;
464 const struct lp_type type = bld->type;
465
466 assert(lp_check_value(type, a));
467
468 if(a == bld->one)
469 return bld->zero;
470 if(a == bld->zero)
471 return bld->one;
472
473 if(type.norm && !type.floating && !type.fixed && !type.sign) {
474 if(LLVMIsConstant(a))
475 return LLVMConstNot(a);
476 else
477 return LLVMBuildNot(builder, a, "");
478 }
479
480 if(LLVMIsConstant(a))
481 if (type.floating)
482 return LLVMConstFSub(bld->one, a);
483 else
484 return LLVMConstSub(bld->one, a);
485 else
486 if (type.floating)
487 return LLVMBuildFSub(builder, bld->one, a, "");
488 else
489 return LLVMBuildSub(builder, bld->one, a, "");
490 }
491
492
493 /**
494 * Generate a + b
495 */
496 LLVMValueRef
497 lp_build_add(struct lp_build_context *bld,
498 LLVMValueRef a,
499 LLVMValueRef b)
500 {
501 LLVMBuilderRef builder = bld->gallivm->builder;
502 const struct lp_type type = bld->type;
503 LLVMValueRef res;
504
505 assert(lp_check_value(type, a));
506 assert(lp_check_value(type, b));
507
508 if(a == bld->zero)
509 return b;
510 if(b == bld->zero)
511 return a;
512 if(a == bld->undef || b == bld->undef)
513 return bld->undef;
514
515 if(bld->type.norm) {
516 const char *intrinsic = NULL;
517
518 if(a == bld->one || b == bld->one)
519 return bld->one;
520
521 if (type.width * type.length == 128 &&
522 !type.floating && !type.fixed) {
523 if(util_cpu_caps.has_sse2) {
524 if(type.width == 8)
525 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
526 if(type.width == 16)
527 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
528 } else if (util_cpu_caps.has_altivec) {
529 if(type.width == 8)
530 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
531 if(type.width == 16)
532 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
533 }
534 }
535
536 if (intrinsic)
537 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
538 }
539
540 if(type.norm && !type.floating && !type.fixed) {
541 if (type.sign) {
542 uint64_t sign = (uint64_t)1 << (type.width - 1);
543 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
544 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
545 /* a_clamp_max is the maximum a for positive b,
546 a_clamp_min is the minimum a for negative b. */
547 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
548 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
549 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
550 } else {
551 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
552 }
553 }
554
555 if(LLVMIsConstant(a) && LLVMIsConstant(b))
556 if (type.floating)
557 res = LLVMConstFAdd(a, b);
558 else
559 res = LLVMConstAdd(a, b);
560 else
561 if (type.floating)
562 res = LLVMBuildFAdd(builder, a, b, "");
563 else
564 res = LLVMBuildAdd(builder, a, b, "");
565
566 /* clamp to ceiling of 1.0 */
567 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
568 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
569
570 /* XXX clamp to floor of -1 or 0??? */
571
572 return res;
573 }
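
/*
 * Worked example for the signed saturation above (an illustrative note, not
 * part of the original file): for 8-bit signed normalized values,
 * max_val = 127 and min_val = -128.  With a = 100 and b = 60 (b > 0) the
 * clamp gives a_clamp_max = min(100, 127 - 60) = 67, so the subsequent add
 * produces 67 + 60 = 127, i.e. the result saturates instead of wrapping.
 */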
574
575
576 /** Return the scalar sum of the elements of a.
577 * Should avoid this operation whenever possible.
578 */
579 LLVMValueRef
580 lp_build_horizontal_add(struct lp_build_context *bld,
581 LLVMValueRef a)
582 {
583 LLVMBuilderRef builder = bld->gallivm->builder;
584 const struct lp_type type = bld->type;
585 LLVMValueRef index, res;
586 unsigned i, length;
587 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
588 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
589 LLVMValueRef vecres, elem2;
590
591 assert(lp_check_value(type, a));
592
593 if (type.length == 1) {
594 return a;
595 }
596
597 assert(!bld->type.norm);
598
599 /*
600 * For byte vectors we could do much better with psadbw.
601 * We use repeated shuffle/adds here. Note that with multiple vectors
602 * this can be done more efficiently as outlined in the Intel
603 * optimization manual.
604 * Note: could cause data rearrangement if used with smaller element
605 * sizes.
606 */
607
608 vecres = a;
609 length = type.length / 2;
610 while (length > 1) {
611 LLVMValueRef vec1, vec2;
612 for (i = 0; i < length; i++) {
613 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
614 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
615 }
616 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
617 LLVMConstVector(shuffles1, length), "");
618 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
619 LLVMConstVector(shuffles2, length), "");
620 if (type.floating) {
621 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
622 }
623 else {
624 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
625 }
626 length = length >> 1;
627 }
628
629 /* always have vector of size 2 here */
630 assert(length == 1);
631
632 index = lp_build_const_int32(bld->gallivm, 0);
633 res = LLVMBuildExtractElement(builder, vecres, index, "");
634 index = lp_build_const_int32(bld->gallivm, 1);
635 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
636
637 if (type.floating)
638 res = LLVMBuildFAdd(builder, res, elem2, "");
639 else
640 res = LLVMBuildAdd(builder, res, elem2, "");
641
642 return res;
643 }
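
/*
 * Example (an illustrative note, not part of the original file): reducing a
 * 4 x float32 vector `v' (assumed to be provided by the caller and to match
 * bld->type) to a scalar sum:
 *
 *    LLVMValueRef sum = lp_build_horizontal_add(bld, v);
 *
 * which emits the log2(length) shuffle/add steps above and returns a scalar
 * of the element type.
 */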
644
645 /**
646 * Return the horizontal sums of 4 float vectors as a float4 vector.
647 * This uses the technique outlined in the Intel Optimization Manual.
648 */
649 static LLVMValueRef
650 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
651 LLVMValueRef src[4])
652 {
653 struct gallivm_state *gallivm = bld->gallivm;
654 LLVMBuilderRef builder = gallivm->builder;
655 LLVMValueRef shuffles[4];
656 LLVMValueRef tmp[4];
657 LLVMValueRef sumtmp[2], shuftmp[2];
658
659 /* lower half of regs */
660 shuffles[0] = lp_build_const_int32(gallivm, 0);
661 shuffles[1] = lp_build_const_int32(gallivm, 1);
662 shuffles[2] = lp_build_const_int32(gallivm, 4);
663 shuffles[3] = lp_build_const_int32(gallivm, 5);
664 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
665 LLVMConstVector(shuffles, 4), "");
666 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
667 LLVMConstVector(shuffles, 4), "");
668
669 /* upper half of regs */
670 shuffles[0] = lp_build_const_int32(gallivm, 2);
671 shuffles[1] = lp_build_const_int32(gallivm, 3);
672 shuffles[2] = lp_build_const_int32(gallivm, 6);
673 shuffles[3] = lp_build_const_int32(gallivm, 7);
674 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
675 LLVMConstVector(shuffles, 4), "");
676 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
677 LLVMConstVector(shuffles, 4), "");
678
679 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
680 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
681
682 shuffles[0] = lp_build_const_int32(gallivm, 0);
683 shuffles[1] = lp_build_const_int32(gallivm, 2);
684 shuffles[2] = lp_build_const_int32(gallivm, 4);
685 shuffles[3] = lp_build_const_int32(gallivm, 6);
686 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
687 LLVMConstVector(shuffles, 4), "");
688
689 shuffles[0] = lp_build_const_int32(gallivm, 1);
690 shuffles[1] = lp_build_const_int32(gallivm, 3);
691 shuffles[2] = lp_build_const_int32(gallivm, 5);
692 shuffles[3] = lp_build_const_int32(gallivm, 7);
693 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
694 LLVMConstVector(shuffles, 4), "");
695
696 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
697 }
698
699
700 /*
701 * partially horizontally add 2-4 float vectors with length nx4,
702 * i.e. only four adjacent values in each vector will be added,
703 * assuming values are really grouped in 4 which also determines
704 * output order.
705 *
706 * Return a vector of the same length as the initial vectors,
707 * with the excess elements (if any) being undefined.
708 * The element order is independent of number of input vectors.
709 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
710 * the output order thus will be
711 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
712 */
713 LLVMValueRef
714 lp_build_hadd_partial4(struct lp_build_context *bld,
715 LLVMValueRef vectors[],
716 unsigned num_vecs)
717 {
718 struct gallivm_state *gallivm = bld->gallivm;
719 LLVMBuilderRef builder = gallivm->builder;
720 LLVMValueRef ret_vec;
721 LLVMValueRef tmp[4];
722 const char *intrinsic = NULL;
723
724 assert(num_vecs >= 2 && num_vecs <= 4);
725 assert(bld->type.floating);
726
727 /* only use this with at least 2 vectors, as it is sort of expensive
728 * (depending on cpu) and we always need two horizontal adds anyway,
729 * so a shuffle/add approach might be better.
730 */
731
732 tmp[0] = vectors[0];
733 tmp[1] = vectors[1];
734
735 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
736 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
737
738 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
739 bld->type.length == 4) {
740 intrinsic = "llvm.x86.sse3.hadd.ps";
741 }
742 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
743 bld->type.length == 8) {
744 intrinsic = "llvm.x86.avx.hadd.ps.256";
745 }
746 if (intrinsic) {
747 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
748 lp_build_vec_type(gallivm, bld->type),
749 tmp[0], tmp[1]);
750 if (num_vecs > 2) {
751 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
752 lp_build_vec_type(gallivm, bld->type),
753 tmp[2], tmp[3]);
754 }
755 else {
756 tmp[1] = tmp[0];
757 }
758 return lp_build_intrinsic_binary(builder, intrinsic,
759 lp_build_vec_type(gallivm, bld->type),
760 tmp[0], tmp[1]);
761 }
762
763 if (bld->type.length == 4) {
764 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
765 }
766 else {
767 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
768 unsigned j;
769 unsigned num_iter = bld->type.length / 4;
770 struct lp_type parttype = bld->type;
771 parttype.length = 4;
772 for (j = 0; j < num_iter; j++) {
773 LLVMValueRef partsrc[4];
774 unsigned i;
775 for (i = 0; i < 4; i++) {
776 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
777 }
778 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
779 }
780 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
781 }
782 return ret_vec;
783 }
784
785 /**
786 * Generate a - b
787 */
788 LLVMValueRef
789 lp_build_sub(struct lp_build_context *bld,
790 LLVMValueRef a,
791 LLVMValueRef b)
792 {
793 LLVMBuilderRef builder = bld->gallivm->builder;
794 const struct lp_type type = bld->type;
795 LLVMValueRef res;
796
797 assert(lp_check_value(type, a));
798 assert(lp_check_value(type, b));
799
800 if(b == bld->zero)
801 return a;
802 if(a == bld->undef || b == bld->undef)
803 return bld->undef;
804 if(a == b)
805 return bld->zero;
806
807 if(bld->type.norm) {
808 const char *intrinsic = NULL;
809
810 if(b == bld->one)
811 return bld->zero;
812
813 if (type.width * type.length == 128 &&
814 !type.floating && !type.fixed) {
815 if (util_cpu_caps.has_sse2) {
816 if(type.width == 8)
817 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
818 if(type.width == 16)
819 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
820 } else if (util_cpu_caps.has_altivec) {
821 if(type.width == 8)
822 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
823 if(type.width == 16)
824 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
825 }
826 }
827
828 if (intrinsic)
829 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
830 }
831
832 if(type.norm && !type.floating && !type.fixed) {
833 if (type.sign) {
834 uint64_t sign = (uint64_t)1 << (type.width - 1);
835 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
836 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
837 /* a_clamp_max is the maximum a for negative b,
838 a_clamp_min is the minimum a for positive b. */
839 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
840 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
841 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
842 } else {
843 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
844 }
845 }
846
847 if(LLVMIsConstant(a) && LLVMIsConstant(b))
848 if (type.floating)
849 res = LLVMConstFSub(a, b);
850 else
851 res = LLVMConstSub(a, b);
852 else
853 if (type.floating)
854 res = LLVMBuildFSub(builder, a, b, "");
855 else
856 res = LLVMBuildSub(builder, a, b, "");
857
858 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
859 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
860
861 return res;
862 }
863
864
865
866 /**
867 * Normalized multiplication.
868 *
869 * There are several approaches (using 8-bit normalized multiplication as
870 * an example):
871 *
872 * - alpha plus one
873 *
874 * makes the following approximation to the division (Sree)
875 *
876 * a*b/255 ~= (a*(b + 1)) >> 8
877 *
878 * which is the fastest method that satisfies the following OpenGL criteria of
879 *
880 * 0*0 = 0 and 255*255 = 255
881 *
882 * - geometric series
883 *
884 * takes the geometric series approximation to the division
885 *
886 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
887 *
888 * in this case just the first two terms to fit in 16bit arithmetic
889 *
890 * t/255 ~= (t + (t >> 8)) >> 8
891 *
892 * note that just by itself it doesn't satisfy the OpenGL criteria, as
893 * 255*255 yields 254, so the special case b = 255 must be accounted for
894 * or roundoff must be used.
895 *
896 * - geometric series plus rounding
897 *
898 * when using a geometric series division, instead of truncating the result,
899 * use roundoff in the approximation (Jim Blinn)
900 *
901 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
902 *
903 * achieving exact results.
904 *
905 *
906 *
907 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
908 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
909 * @sa Michael Herf, The "double blend trick", May 2000,
910 * http://www.stereopsis.com/doubleblend.html
911 */
912 static LLVMValueRef
913 lp_build_mul_norm(struct gallivm_state *gallivm,
914 struct lp_type wide_type,
915 LLVMValueRef a, LLVMValueRef b)
916 {
917 LLVMBuilderRef builder = gallivm->builder;
918 struct lp_build_context bld;
919 unsigned n;
920 LLVMValueRef half;
921 LLVMValueRef ab;
922
923 assert(!wide_type.floating);
924 assert(lp_check_value(wide_type, a));
925 assert(lp_check_value(wide_type, b));
926
927 lp_build_context_init(&bld, gallivm, wide_type);
928
929 n = wide_type.width / 2;
930 if (wide_type.sign) {
931 --n;
932 }
933
934 /*
935 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
936 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
937 */
938
939 /*
940 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
941 */
942
943 ab = LLVMBuildMul(builder, a, b, "");
944 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
945
946 /*
947 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
948 */
949
950 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
951 if (wide_type.sign) {
952 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
953 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
954 half = lp_build_select(&bld, sign, minus_half, half);
955 }
956 ab = LLVMBuildAdd(builder, ab, half, "");
957
958 /* Final division */
959 ab = lp_build_shr_imm(&bld, ab, n);
960
961 return ab;
962 }
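
/*
 * Worked example for the formula above (an illustrative note, not part of
 * the original file): for 8-bit unsigned normalized values widened to
 * 16 bits, n = 8 and half = 0x80.  With a = b = 255:
 *
 *    ab             = 255 * 255     = 65025
 *    ab + (ab >> 8) = 65025 + 254   = 65279
 *    + half         = 65279 + 128   = 65407
 *    >> 8                           = 255
 *
 * so 255 * 255 maps back to 255 as required, and 0 * x trivially stays 0.
 */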
963
964 /**
965 * Generate a * b
966 */
967 LLVMValueRef
968 lp_build_mul(struct lp_build_context *bld,
969 LLVMValueRef a,
970 LLVMValueRef b)
971 {
972 LLVMBuilderRef builder = bld->gallivm->builder;
973 const struct lp_type type = bld->type;
974 LLVMValueRef shift;
975 LLVMValueRef res;
976
977 assert(lp_check_value(type, a));
978 assert(lp_check_value(type, b));
979
980 if(a == bld->zero)
981 return bld->zero;
982 if(a == bld->one)
983 return b;
984 if(b == bld->zero)
985 return bld->zero;
986 if(b == bld->one)
987 return a;
988 if(a == bld->undef || b == bld->undef)
989 return bld->undef;
990
991 if (!type.floating && !type.fixed && type.norm) {
992 struct lp_type wide_type = lp_wider_type(type);
993 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
994
995 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
996 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
997
998 /* PMULLW, PSRLW, PADDW */
999 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1000 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1001
1002 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
1003
1004 return ab;
1005 }
1006
1007 if(type.fixed)
1008 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1009 else
1010 shift = NULL;
1011
1012 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1013 if (type.floating)
1014 res = LLVMConstFMul(a, b);
1015 else
1016 res = LLVMConstMul(a, b);
1017 if(shift) {
1018 if(type.sign)
1019 res = LLVMConstAShr(res, shift);
1020 else
1021 res = LLVMConstLShr(res, shift);
1022 }
1023 }
1024 else {
1025 if (type.floating)
1026 res = LLVMBuildFMul(builder, a, b, "");
1027 else
1028 res = LLVMBuildMul(builder, a, b, "");
1029 if(shift) {
1030 if(type.sign)
1031 res = LLVMBuildAShr(builder, res, shift, "");
1032 else
1033 res = LLVMBuildLShr(builder, res, shift, "");
1034 }
1035 }
1036
1037 return res;
1038 }
1039
1040
1041 /* a * b + c */
1042 LLVMValueRef
1043 lp_build_mad(struct lp_build_context *bld,
1044 LLVMValueRef a,
1045 LLVMValueRef b,
1046 LLVMValueRef c)
1047 {
1048 const struct lp_type type = bld->type;
1049 if (type.floating) {
1050 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1051 } else {
1052 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1053 }
1054 }
1055
1056
1057 /**
1058 * Small vector x scale multiplication optimization.
1059 */
1060 LLVMValueRef
1061 lp_build_mul_imm(struct lp_build_context *bld,
1062 LLVMValueRef a,
1063 int b)
1064 {
1065 LLVMBuilderRef builder = bld->gallivm->builder;
1066 LLVMValueRef factor;
1067
1068 assert(lp_check_value(bld->type, a));
1069
1070 if(b == 0)
1071 return bld->zero;
1072
1073 if(b == 1)
1074 return a;
1075
1076 if(b == -1)
1077 return lp_build_negate(bld, a);
1078
1079 if(b == 2 && bld->type.floating)
1080 return lp_build_add(bld, a, a);
1081
1082 if(util_is_power_of_two(b)) {
1083 unsigned shift = ffs(b) - 1;
1084
1085 if(bld->type.floating) {
1086 #if 0
1087 /*
1088 * Power of two multiplication by directly manipulating the exponent.
1089 *
1090 * XXX: This might not be always faster, it will introduce a small error
1091 * for multiplication by zero, and it will produce wrong results
1092 * for Inf and NaN.
1093 */
1094 unsigned mantissa = lp_mantissa(bld->type);
1095 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1096 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1097 a = LLVMBuildAdd(builder, a, factor, "");
1098 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1099 return a;
1100 #endif
1101 }
1102 else {
1103 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1104 return LLVMBuildShl(builder, a, factor, "");
1105 }
1106 }
1107
1108 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1109 return lp_build_mul(bld, a, factor);
1110 }
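
/*
 * Example (an illustrative note, not part of the original file): for an
 * integer bld->type, lp_build_mul_imm(bld, a, 8) takes the power-of-two path
 * above and emits a shift left by 3, whereas lp_build_mul_imm(bld, a, 3)
 * falls through to a regular lp_build_mul() against the constant 3.
 */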
1111
1112
1113 /**
1114 * Generate a / b
1115 */
1116 LLVMValueRef
1117 lp_build_div(struct lp_build_context *bld,
1118 LLVMValueRef a,
1119 LLVMValueRef b)
1120 {
1121 LLVMBuilderRef builder = bld->gallivm->builder;
1122 const struct lp_type type = bld->type;
1123
1124 assert(lp_check_value(type, a));
1125 assert(lp_check_value(type, b));
1126
1127 if(a == bld->zero)
1128 return bld->zero;
1129 if(a == bld->one && type.floating)
1130 return lp_build_rcp(bld, b);
1131 if(b == bld->zero)
1132 return bld->undef;
1133 if(b == bld->one)
1134 return a;
1135 if(a == bld->undef || b == bld->undef)
1136 return bld->undef;
1137
1138 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1139 if (type.floating)
1140 return LLVMConstFDiv(a, b);
1141 else if (type.sign)
1142 return LLVMConstSDiv(a, b);
1143 else
1144 return LLVMConstUDiv(a, b);
1145 }
1146
1147 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1148 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1149 type.floating)
1150 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1151
1152 if (type.floating)
1153 return LLVMBuildFDiv(builder, a, b, "");
1154 else if (type.sign)
1155 return LLVMBuildSDiv(builder, a, b, "");
1156 else
1157 return LLVMBuildUDiv(builder, a, b, "");
1158 }
1159
1160
1161 /**
1162 * Linear interpolation helper.
1163 *
1164 * @param flags LP_BLD_LERP_x flags. LP_BLD_LERP_WIDE_NORMALIZED means we are
1165 * interpolating normalized values, encoded in integers twice as wide.
1166 *
1167 * @sa http://www.stereopsis.com/doubleblend.html
1168 */
1169 static inline LLVMValueRef
1170 lp_build_lerp_simple(struct lp_build_context *bld,
1171 LLVMValueRef x,
1172 LLVMValueRef v0,
1173 LLVMValueRef v1,
1174 unsigned flags)
1175 {
1176 unsigned half_width = bld->type.width/2;
1177 LLVMBuilderRef builder = bld->gallivm->builder;
1178 LLVMValueRef delta;
1179 LLVMValueRef res;
1180
1181 assert(lp_check_value(bld->type, x));
1182 assert(lp_check_value(bld->type, v0));
1183 assert(lp_check_value(bld->type, v1));
1184
1185 delta = lp_build_sub(bld, v1, v0);
1186
1187 if (bld->type.floating) {
1188 assert(flags == 0);
1189 return lp_build_mad(bld, x, delta, v0);
1190 }
1191
1192 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1193 if (!bld->type.sign) {
1194 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1195 /*
1196 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the most significant
1197 * bit to the least significant bit (e.g. for n = 8, x = 255 becomes 256),
1198 * so that later we can just divide by 2**n instead of 2**n - 1.
1199 */
1200
1201 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1202 }
1203
1204 /* (x * delta) >> n */
1205 res = lp_build_mul(bld, x, delta);
1206 res = lp_build_shr_imm(bld, res, half_width);
1207 } else {
1208 /*
1209 * The rescaling trick above doesn't work for signed numbers, so
1210 * use the 2**n - 1 division approximation in lp_build_mul_norm
1211 * instead.
1212 */
1213 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1214 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1215 }
1216 } else {
1217 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1218 res = lp_build_mul(bld, x, delta);
1219 }
1220
1221 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1222 /*
1223 * At this point both res and v0 only use the lower half of the bits,
1224 * the rest is zero. Instead of add / mask, do add with half wide type.
1225 */
1226 struct lp_type narrow_type;
1227 struct lp_build_context narrow_bld;
1228
1229 memset(&narrow_type, 0, sizeof narrow_type);
1230 narrow_type.sign = bld->type.sign;
1231 narrow_type.width = bld->type.width/2;
1232 narrow_type.length = bld->type.length*2;
1233
1234 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1235 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1236 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1237 res = lp_build_add(&narrow_bld, v0, res);
1238 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1239 } else {
1240 res = lp_build_add(bld, v0, res);
1241
1242 if (bld->type.fixed) {
1243 /*
1244 * We need to mask out the high order bits when lerping 8-bit
1245 * normalized colors stored in 16 bits
1246 */
1247 /* XXX: This step is necessary for lerping 8-bit colors stored in
1248 * 16 bits, but it will be wrong for true fixed point use cases.
1249 * Basically we need a more powerful lp_type, capable of further
1250 * distinguishing the values interpretation from the value storage.
1251 */
1252 LLVMValueRef low_bits;
1253 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1254 res = LLVMBuildAnd(builder, res, low_bits, "");
1255 }
1256 }
1257
1258 return res;
1259 }
1260
1261
1262 /**
1263 * Linear interpolation.
1264 */
1265 LLVMValueRef
1266 lp_build_lerp(struct lp_build_context *bld,
1267 LLVMValueRef x,
1268 LLVMValueRef v0,
1269 LLVMValueRef v1,
1270 unsigned flags)
1271 {
1272 const struct lp_type type = bld->type;
1273 LLVMValueRef res;
1274
1275 assert(lp_check_value(type, x));
1276 assert(lp_check_value(type, v0));
1277 assert(lp_check_value(type, v1));
1278
1279 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1280
1281 if (type.norm) {
1282 struct lp_type wide_type;
1283 struct lp_build_context wide_bld;
1284 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1285
1286 assert(type.length >= 2);
1287
1288 /*
1289 * Create a wider integer type, enough to hold the
1290 * intermediate result of the multiplication.
1291 */
1292 memset(&wide_type, 0, sizeof wide_type);
1293 wide_type.sign = type.sign;
1294 wide_type.width = type.width*2;
1295 wide_type.length = type.length/2;
1296
1297 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1298
1299 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1300 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1301 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1302
1303 /*
1304 * Lerp both halves.
1305 */
1306
1307 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1308
1309 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1310 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1311
1312 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1313 } else {
1314 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1315 }
1316
1317 return res;
1318 }
1319
1320
1321 /**
1322 * Bilinear interpolation.
1323 *
1324 * Value indices are in v_{yx}.
1325 */
1326 LLVMValueRef
1327 lp_build_lerp_2d(struct lp_build_context *bld,
1328 LLVMValueRef x,
1329 LLVMValueRef y,
1330 LLVMValueRef v00,
1331 LLVMValueRef v01,
1332 LLVMValueRef v10,
1333 LLVMValueRef v11,
1334 unsigned flags)
1335 {
1336 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1337 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1338 return lp_build_lerp(bld, y, v0, v1, flags);
1339 }
1340
1341
1342 LLVMValueRef
1343 lp_build_lerp_3d(struct lp_build_context *bld,
1344 LLVMValueRef x,
1345 LLVMValueRef y,
1346 LLVMValueRef z,
1347 LLVMValueRef v000,
1348 LLVMValueRef v001,
1349 LLVMValueRef v010,
1350 LLVMValueRef v011,
1351 LLVMValueRef v100,
1352 LLVMValueRef v101,
1353 LLVMValueRef v110,
1354 LLVMValueRef v111,
1355 unsigned flags)
1356 {
1357 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1358 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1359 return lp_build_lerp(bld, z, v0, v1, flags);
1360 }
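
/*
 * Example (an illustrative sketch, not part of the original file): bilinear
 * filtering of four texel value vectors t00..t11 with weights s and t, all
 * hypothetical values assumed to be provided by the caller in bld->type:
 *
 *    LLVMValueRef texel = lp_build_lerp_2d(bld, s, t,
 *                                          t00, t01, t10, t11, 0);
 *
 * For normalized integer types lp_build_lerp() widens the operands, lerps
 * both halves with LP_BLD_LERP_WIDE_NORMALIZED and packs the result back,
 * as implemented above.
 */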
1361
1362
1363 /**
1364 * Generate min(a, b)
1365 * Do checks for special cases but not for NaNs.
1366 */
1367 LLVMValueRef
1368 lp_build_min(struct lp_build_context *bld,
1369 LLVMValueRef a,
1370 LLVMValueRef b)
1371 {
1372 assert(lp_check_value(bld->type, a));
1373 assert(lp_check_value(bld->type, b));
1374
1375 if(a == bld->undef || b == bld->undef)
1376 return bld->undef;
1377
1378 if(a == b)
1379 return a;
1380
1381 if (bld->type.norm) {
1382 if (!bld->type.sign) {
1383 if (a == bld->zero || b == bld->zero) {
1384 return bld->zero;
1385 }
1386 }
1387 if(a == bld->one)
1388 return b;
1389 if(b == bld->one)
1390 return a;
1391 }
1392
1393 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1394 }
1395
1396
1397 /**
1398 * Generate min(a, b)
1399 * NaNs are handled according to the behavior specified by the
1400 * nan_behavior argument.
1401 */
1402 LLVMValueRef
1403 lp_build_min_ext(struct lp_build_context *bld,
1404 LLVMValueRef a,
1405 LLVMValueRef b,
1406 enum gallivm_nan_behavior nan_behavior)
1407 {
1408 assert(lp_check_value(bld->type, a));
1409 assert(lp_check_value(bld->type, b));
1410
1411 if(a == bld->undef || b == bld->undef)
1412 return bld->undef;
1413
1414 if(a == b)
1415 return a;
1416
1417 if (bld->type.norm) {
1418 if (!bld->type.sign) {
1419 if (a == bld->zero || b == bld->zero) {
1420 return bld->zero;
1421 }
1422 }
1423 if(a == bld->one)
1424 return b;
1425 if(b == bld->one)
1426 return a;
1427 }
1428
1429 return lp_build_min_simple(bld, a, b, nan_behavior);
1430 }
1431
1432 /**
1433 * Generate max(a, b)
1434 * Do checks for special cases, but NaN behavior is undefined.
1435 */
1436 LLVMValueRef
1437 lp_build_max(struct lp_build_context *bld,
1438 LLVMValueRef a,
1439 LLVMValueRef b)
1440 {
1441 assert(lp_check_value(bld->type, a));
1442 assert(lp_check_value(bld->type, b));
1443
1444 if(a == bld->undef || b == bld->undef)
1445 return bld->undef;
1446
1447 if(a == b)
1448 return a;
1449
1450 if(bld->type.norm) {
1451 if(a == bld->one || b == bld->one)
1452 return bld->one;
1453 if (!bld->type.sign) {
1454 if (a == bld->zero) {
1455 return b;
1456 }
1457 if (b == bld->zero) {
1458 return a;
1459 }
1460 }
1461 }
1462
1463 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1464 }
1465
1466
1467 /**
1468 * Generate max(a, b)
1469 * Checks for special cases.
1470 * NaNs are handled according to the behavior specified by the
1471 * nan_behavior argument.
1472 */
1473 LLVMValueRef
1474 lp_build_max_ext(struct lp_build_context *bld,
1475 LLVMValueRef a,
1476 LLVMValueRef b,
1477 enum gallivm_nan_behavior nan_behavior)
1478 {
1479 assert(lp_check_value(bld->type, a));
1480 assert(lp_check_value(bld->type, b));
1481
1482 if(a == bld->undef || b == bld->undef)
1483 return bld->undef;
1484
1485 if(a == b)
1486 return a;
1487
1488 if(bld->type.norm) {
1489 if(a == bld->one || b == bld->one)
1490 return bld->one;
1491 if (!bld->type.sign) {
1492 if (a == bld->zero) {
1493 return b;
1494 }
1495 if (b == bld->zero) {
1496 return a;
1497 }
1498 }
1499 }
1500
1501 return lp_build_max_simple(bld, a, b, nan_behavior);
1502 }
1503
1504 /**
1505 * Generate clamp(a, min, max)
1506 * NaN behavior (for any of a, min, max) is undefined.
1507 * Do checks for special cases.
1508 */
1509 LLVMValueRef
1510 lp_build_clamp(struct lp_build_context *bld,
1511 LLVMValueRef a,
1512 LLVMValueRef min,
1513 LLVMValueRef max)
1514 {
1515 assert(lp_check_value(bld->type, a));
1516 assert(lp_check_value(bld->type, min));
1517 assert(lp_check_value(bld->type, max));
1518
1519 a = lp_build_min(bld, a, max);
1520 a = lp_build_max(bld, a, min);
1521 return a;
1522 }
1523
1524
1525 /**
1526 * Generate clamp(a, 0, 1)
1527 * A NaN will get converted to zero.
1528 */
1529 LLVMValueRef
1530 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1531 LLVMValueRef a)
1532 {
1533 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1534 a = lp_build_min(bld, a, bld->one);
1535 return a;
1536 }
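
/*
 * Example (an illustrative note, not part of the original file): a
 * D3D10-style saturate of a float value `a' (assumed to be provided by the
 * caller), which must map NaN to zero, can use
 *
 *    a = lp_build_clamp_zero_one_nanzero(bld, a);
 *
 * whereas lp_build_clamp(bld, a, bld->zero, bld->one) is cheaper but leaves
 * the NaN behavior undefined.
 */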
1537
1538
1539 /**
1540 * Generate abs(a)
1541 */
1542 LLVMValueRef
1543 lp_build_abs(struct lp_build_context *bld,
1544 LLVMValueRef a)
1545 {
1546 LLVMBuilderRef builder = bld->gallivm->builder;
1547 const struct lp_type type = bld->type;
1548 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1549
1550 assert(lp_check_value(type, a));
1551
1552 if(!type.sign)
1553 return a;
1554
1555 if(type.floating) {
1556 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1557 /* Workaround llvm.org/PR27332 */
1558 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1559 unsigned long long absMask = ~(1ULL << (type.width - 1));
1560 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1561 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1562 a = LLVMBuildAnd(builder, a, mask, "");
1563 a = LLVMBuildBitCast(builder, a, vec_type, "");
1564 return a;
1565 } else {
1566 char intrinsic[32];
1567 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1568 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1569 }
1570 }
1571
1572 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1573 switch(type.width) {
1574 case 8:
1575 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1576 case 16:
1577 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1578 case 32:
1579 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1580 }
1581 }
1582 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1583 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1584 (type.width == 8 || type.width == 16 || type.width == 32)) {
1585 debug_printf("%s: inefficient code, should split vectors manually\n",
1586 __FUNCTION__);
1587 }
1588
1589 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1590 }
1591
1592
1593 LLVMValueRef
1594 lp_build_negate(struct lp_build_context *bld,
1595 LLVMValueRef a)
1596 {
1597 LLVMBuilderRef builder = bld->gallivm->builder;
1598
1599 assert(lp_check_value(bld->type, a));
1600
1601 if (bld->type.floating)
1602 a = LLVMBuildFNeg(builder, a, "");
1603 else
1604 a = LLVMBuildNeg(builder, a, "");
1605
1606 return a;
1607 }
1608
1609
1610 /** Return -1, 0 or +1 depending on the sign of a */
1611 LLVMValueRef
1612 lp_build_sgn(struct lp_build_context *bld,
1613 LLVMValueRef a)
1614 {
1615 LLVMBuilderRef builder = bld->gallivm->builder;
1616 const struct lp_type type = bld->type;
1617 LLVMValueRef cond;
1618 LLVMValueRef res;
1619
1620 assert(lp_check_value(type, a));
1621
1622 /* Handle non-zero case */
1623 if(!type.sign) {
1624 /* if not zero then sign must be positive */
1625 res = bld->one;
1626 }
1627 else if(type.floating) {
1628 LLVMTypeRef vec_type;
1629 LLVMTypeRef int_type;
1630 LLVMValueRef mask;
1631 LLVMValueRef sign;
1632 LLVMValueRef one;
1633 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1634
1635 int_type = lp_build_int_vec_type(bld->gallivm, type);
1636 vec_type = lp_build_vec_type(bld->gallivm, type);
1637 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1638
1639 /* Take the sign bit and or it into the constant 1.0 */
1640 sign = LLVMBuildBitCast(builder, a, int_type, "");
1641 sign = LLVMBuildAnd(builder, sign, mask, "");
1642 one = LLVMConstBitCast(bld->one, int_type);
1643 res = LLVMBuildOr(builder, sign, one, "");
1644 res = LLVMBuildBitCast(builder, res, vec_type, "");
1645 }
1646 else
1647 {
1648 /* signed int/norm/fixed point */
1649 /* could use psign with sse3 and appropriate vectors here */
1650 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1651 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1652 res = lp_build_select(bld, cond, bld->one, minus_one);
1653 }
1654
1655 /* Handle zero */
1656 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1657 res = lp_build_select(bld, cond, bld->zero, res);
1658
1659 return res;
1660 }
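
/*
 * Worked example for the float path above (an illustrative note, not part of
 * the original file): for a = -3.5f the sign bit is 0x80000000 and bld->one
 * is 0x3f800000 (1.0f), so the OR yields 0xbf800000, i.e. -1.0f; the final
 * select then maps a == 0.0 to 0.0.
 */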
1661
1662
1663 /**
1664 * Set the sign of float vector 'a' according to 'sign'.
1665 * If sign==0, return abs(a).
1666 * If sign==1, return -abs(a);
1667 * Other values for sign produce undefined results.
1668 */
1669 LLVMValueRef
1670 lp_build_set_sign(struct lp_build_context *bld,
1671 LLVMValueRef a, LLVMValueRef sign)
1672 {
1673 LLVMBuilderRef builder = bld->gallivm->builder;
1674 const struct lp_type type = bld->type;
1675 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1676 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1677 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1678 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1679 ~((unsigned long long) 1 << (type.width - 1)));
1680 LLVMValueRef val, res;
1681
1682 assert(type.floating);
1683 assert(lp_check_value(type, a));
1684
1685 /* val = reinterpret_cast<int>(a) */
1686 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1687 /* val = val & mask */
1688 val = LLVMBuildAnd(builder, val, mask, "");
1689 /* sign = sign << shift */
1690 sign = LLVMBuildShl(builder, sign, shift, "");
1691 /* res = val | sign */
1692 res = LLVMBuildOr(builder, val, sign, "");
1693 /* res = reinterpret_cast<float>(res) */
1694 res = LLVMBuildBitCast(builder, res, vec_type, "");
1695
1696 return res;
1697 }
1698
1699
1700 /**
1701 * Convert vector of (or scalar) int to vector of (or scalar) float.
1702 */
1703 LLVMValueRef
1704 lp_build_int_to_float(struct lp_build_context *bld,
1705 LLVMValueRef a)
1706 {
1707 LLVMBuilderRef builder = bld->gallivm->builder;
1708 const struct lp_type type = bld->type;
1709 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1710
1711 assert(type.floating);
1712
1713 return LLVMBuildSIToFP(builder, a, vec_type, "");
1714 }
1715
1716 static boolean
1717 arch_rounding_available(const struct lp_type type)
1718 {
1719 if ((util_cpu_caps.has_sse4_1 &&
1720 (type.length == 1 || type.width*type.length == 128)) ||
1721 (util_cpu_caps.has_avx && type.width*type.length == 256))
1722 return TRUE;
1723 else if ((util_cpu_caps.has_altivec &&
1724 (type.width == 32 && type.length == 4)))
1725 return TRUE;
1726
1727 return FALSE;
1728 }
1729
1730 enum lp_build_round_mode
1731 {
1732 LP_BUILD_ROUND_NEAREST = 0,
1733 LP_BUILD_ROUND_FLOOR = 1,
1734 LP_BUILD_ROUND_CEIL = 2,
1735 LP_BUILD_ROUND_TRUNCATE = 3
1736 };
1737
1738 static inline LLVMValueRef
1739 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1740 LLVMValueRef a)
1741 {
1742 LLVMBuilderRef builder = bld->gallivm->builder;
1743 const struct lp_type type = bld->type;
1744 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1745 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1746 const char *intrinsic;
1747 LLVMValueRef res;
1748
1749 assert(type.floating);
1750 /* using the double precision conversions is a bit more complicated */
1751 assert(type.width == 32);
1752
1753 assert(lp_check_value(type, a));
1754 assert(util_cpu_caps.has_sse2);
1755
1756 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1757 if (type.length == 1) {
1758 LLVMTypeRef vec_type;
1759 LLVMValueRef undef;
1760 LLVMValueRef arg;
1761 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1762
1763 vec_type = LLVMVectorType(bld->elem_type, 4);
1764
1765 intrinsic = "llvm.x86.sse.cvtss2si";
1766
1767 undef = LLVMGetUndef(vec_type);
1768
1769 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1770
1771 res = lp_build_intrinsic_unary(builder, intrinsic,
1772 ret_type, arg);
1773 }
1774 else {
1775 if (type.width * type.length == 128) {
1776 intrinsic = "llvm.x86.sse2.cvtps2dq";
1777 }
1778 else {
1779 assert(type.width*type.length == 256);
1780 assert(util_cpu_caps.has_avx);
1781
1782 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1783 }
1784 res = lp_build_intrinsic_unary(builder, intrinsic,
1785 ret_type, a);
1786 }
1787
1788 return res;
1789 }
1790
1791
1792 /*
1793 */
1794 static inline LLVMValueRef
1795 lp_build_round_altivec(struct lp_build_context *bld,
1796 LLVMValueRef a,
1797 enum lp_build_round_mode mode)
1798 {
1799 LLVMBuilderRef builder = bld->gallivm->builder;
1800 const struct lp_type type = bld->type;
1801 const char *intrinsic = NULL;
1802
1803 assert(type.floating);
1804
1805 assert(lp_check_value(type, a));
1806 assert(util_cpu_caps.has_altivec);
1807
1808 (void)type;
1809
1810 switch (mode) {
1811 case LP_BUILD_ROUND_NEAREST:
1812 intrinsic = "llvm.ppc.altivec.vrfin";
1813 break;
1814 case LP_BUILD_ROUND_FLOOR:
1815 intrinsic = "llvm.ppc.altivec.vrfim";
1816 break;
1817 case LP_BUILD_ROUND_CEIL:
1818 intrinsic = "llvm.ppc.altivec.vrfip";
1819 break;
1820 case LP_BUILD_ROUND_TRUNCATE:
1821 intrinsic = "llvm.ppc.altivec.vrfiz";
1822 break;
1823 }
1824
1825 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1826 }
1827
1828 static inline LLVMValueRef
1829 lp_build_round_arch(struct lp_build_context *bld,
1830 LLVMValueRef a,
1831 enum lp_build_round_mode mode)
1832 {
1833 if (util_cpu_caps.has_sse4_1) {
1834 LLVMBuilderRef builder = bld->gallivm->builder;
1835 const struct lp_type type = bld->type;
1836 const char *intrinsic_root;
1837 char intrinsic[32];
1838
1839 assert(type.floating);
1840 assert(lp_check_value(type, a));
1841 (void)type;
1842
1843 switch (mode) {
1844 case LP_BUILD_ROUND_NEAREST:
1845 intrinsic_root = "llvm.nearbyint";
1846 break;
1847 case LP_BUILD_ROUND_FLOOR:
1848 intrinsic_root = "llvm.floor";
1849 break;
1850 case LP_BUILD_ROUND_CEIL:
1851 intrinsic_root = "llvm.ceil";
1852 break;
1853 case LP_BUILD_ROUND_TRUNCATE:
1854 intrinsic_root = "llvm.trunc";
1855 break;
1856 }
1857
1858 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
1859 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1860 }
1861 else /* (util_cpu_caps.has_altivec) */
1862 return lp_build_round_altivec(bld, a, mode);
1863 }
1864
1865 /**
1866 * Return the integer part of a float (vector) value (== round toward zero).
1867 * The returned value is a float (vector).
1868 * Ex: trunc(-1.5) = -1.0
1869 */
1870 LLVMValueRef
1871 lp_build_trunc(struct lp_build_context *bld,
1872 LLVMValueRef a)
1873 {
1874 LLVMBuilderRef builder = bld->gallivm->builder;
1875 const struct lp_type type = bld->type;
1876
1877 assert(type.floating);
1878 assert(lp_check_value(type, a));
1879
1880 if (arch_rounding_available(type)) {
1881 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1882 }
1883 else {
1884 const struct lp_type type = bld->type;
1885 struct lp_type inttype;
1886 struct lp_build_context intbld;
1887 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1888 LLVMValueRef trunc, res, anosign, mask;
1889 LLVMTypeRef int_vec_type = bld->int_vec_type;
1890 LLVMTypeRef vec_type = bld->vec_type;
1891
1892 assert(type.width == 32); /* might want to handle doubles at some point */
1893
1894 inttype = type;
1895 inttype.floating = 0;
1896 lp_build_context_init(&intbld, bld->gallivm, inttype);
1897
1898 /* round by truncation */
1899 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1900 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1901
1902 /* mask out sign bit */
1903 anosign = lp_build_abs(bld, a);
1904 /*
1905 * mask out all values if anosign > 2^24
1906     * This should work both for large ints (all rounding is a no-op for them
1907     * because such floats are always exact) as well as special cases like
1908     * NaNs, Infs (taking advantage of the fact they use the max exponent).
1909     * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1910 */
1911 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1912 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1913 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1914 return lp_build_select(bld, mask, a, res);
1915 }
1916 }
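
/*
 * Illustrative scalar sketch of the fallback path above, assuming 32-bit
 * IEEE-754 floats (kept out of the build): any float with |a| > 2^24 has no
 * fractional bits left in its mantissa, so it is already integral and can be
 * passed through unchanged, which also conveniently covers NaNs and Infs
 * (their max exponent makes them compare greater in the integer compare above).
 */
#if 0
#include <math.h>

static float
trunc_scalar_sketch(float a)
{
   if (!(fabsf(a) <= 16777216.0f))   /* 2^24, also catches NaN/Inf */
      return a;
   return (float)(int)a;             /* round by truncation */
}
#endif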
1917
1918
1919 /**
1920 * Return float (vector) rounded to nearest integer (vector). The returned
1921 * value is a float (vector).
1922 * Ex: round(0.9) = 1.0
1923 * Ex: round(-1.5) = -2.0
1924 */
1925 LLVMValueRef
1926 lp_build_round(struct lp_build_context *bld,
1927 LLVMValueRef a)
1928 {
1929 LLVMBuilderRef builder = bld->gallivm->builder;
1930 const struct lp_type type = bld->type;
1931
1932 assert(type.floating);
1933 assert(lp_check_value(type, a));
1934
1935 if (arch_rounding_available(type)) {
1936 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1937 }
1938 else {
1939 const struct lp_type type = bld->type;
1940 struct lp_type inttype;
1941 struct lp_build_context intbld;
1942 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1943 LLVMValueRef res, anosign, mask;
1944 LLVMTypeRef int_vec_type = bld->int_vec_type;
1945 LLVMTypeRef vec_type = bld->vec_type;
1946
1947 assert(type.width == 32); /* might want to handle doubles at some point */
1948
1949 inttype = type;
1950 inttype.floating = 0;
1951 lp_build_context_init(&intbld, bld->gallivm, inttype);
1952
1953 res = lp_build_iround(bld, a);
1954 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1955
1956 /* mask out sign bit */
1957 anosign = lp_build_abs(bld, a);
1958 /*
1959 * mask out all values if anosign > 2^24
1960     * This should work both for large ints (all rounding is a no-op for them
1961     * because such floats are always exact) as well as special cases like
1962     * NaNs, Infs (taking advantage of the fact they use the max exponent).
1963     * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1964 */
1965 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1966 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1967 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1968 return lp_build_select(bld, mask, a, res);
1969 }
1970 }
1971
1972
1973 /**
1974 * Return floor of float (vector), result is a float (vector)
1975 * Ex: floor(1.1) = 1.0
1976 * Ex: floor(-1.1) = -2.0
1977 */
1978 LLVMValueRef
1979 lp_build_floor(struct lp_build_context *bld,
1980 LLVMValueRef a)
1981 {
1982 LLVMBuilderRef builder = bld->gallivm->builder;
1983 const struct lp_type type = bld->type;
1984
1985 assert(type.floating);
1986 assert(lp_check_value(type, a));
1987
1988 if (arch_rounding_available(type)) {
1989 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1990 }
1991 else {
1992 const struct lp_type type = bld->type;
1993 struct lp_type inttype;
1994 struct lp_build_context intbld;
1995 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
1996 LLVMValueRef trunc, res, anosign, mask;
1997 LLVMTypeRef int_vec_type = bld->int_vec_type;
1998 LLVMTypeRef vec_type = bld->vec_type;
1999
2000 if (type.width != 32) {
2001 char intrinsic[32];
2002 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2003 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2004 }
2005
2006 assert(type.width == 32); /* might want to handle doubles at some point */
2007
2008 inttype = type;
2009 inttype.floating = 0;
2010 lp_build_context_init(&intbld, bld->gallivm, inttype);
2011
2012 /* round by truncation */
2013 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2014 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2015
2016 if (type.sign) {
2017 LLVMValueRef tmp;
2018
2019 /*
2020 * fix values if rounding is wrong (for non-special cases)
2021 * - this is the case if trunc > a
2022 */
2023 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2024 /* tmp = trunc > a ? 1.0 : 0.0 */
2025 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2026 tmp = lp_build_and(&intbld, mask, tmp);
2027 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2028 res = lp_build_sub(bld, res, tmp);
2029 }
2030
2031 /* mask out sign bit */
2032 anosign = lp_build_abs(bld, a);
2033 /*
2034 * mask out all values if anosign > 2^24
2035     * This should work both for large ints (all rounding is a no-op for them
2036     * because such floats are always exact) as well as special cases like
2037     * NaNs, Infs (taking advantage of the fact they use the max exponent).
2038     * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2039 */
2040 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2041 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2042 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2043 return lp_build_select(bld, mask, a, res);
2044 }
2045 }
2046
2047
2048 /**
2049 * Return ceiling of float (vector), returning float (vector).
2050 * Ex: ceil( 1.1) = 2.0
2051 * Ex: ceil(-1.1) = -1.0
2052 */
2053 LLVMValueRef
2054 lp_build_ceil(struct lp_build_context *bld,
2055 LLVMValueRef a)
2056 {
2057 LLVMBuilderRef builder = bld->gallivm->builder;
2058 const struct lp_type type = bld->type;
2059
2060 assert(type.floating);
2061 assert(lp_check_value(type, a));
2062
2063 if (arch_rounding_available(type)) {
2064 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2065 }
2066 else {
2067 const struct lp_type type = bld->type;
2068 struct lp_type inttype;
2069 struct lp_build_context intbld;
2070 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2071 LLVMValueRef trunc, res, anosign, mask, tmp;
2072 LLVMTypeRef int_vec_type = bld->int_vec_type;
2073 LLVMTypeRef vec_type = bld->vec_type;
2074
2075 if (type.width != 32) {
2076 char intrinsic[32];
2077 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2078 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2079 }
2080
2081 assert(type.width == 32); /* might want to handle doubles at some point */
2082
2083 inttype = type;
2084 inttype.floating = 0;
2085 lp_build_context_init(&intbld, bld->gallivm, inttype);
2086
2087 /* round by truncation */
2088 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2089 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2090
2091 /*
2092 * fix values if rounding is wrong (for non-special cases)
2093 * - this is the case if trunc < a
2094 */
2095 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2096 /* tmp = trunc < a ? 1.0 : 0.0 */
2097 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2098 tmp = lp_build_and(&intbld, mask, tmp);
2099 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2100 res = lp_build_add(bld, trunc, tmp);
2101
2102 /* mask out sign bit */
2103 anosign = lp_build_abs(bld, a);
2104 /*
2105 * mask out all values if anosign > 2^24
2106     * This should work both for large ints (all rounding is a no-op for them
2107     * because such floats are always exact) as well as special cases like
2108     * NaNs, Infs (taking advantage of the fact they use the max exponent).
2109     * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2110 */
2111 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2112 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2113 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2114 return lp_build_select(bld, mask, a, res);
2115 }
2116 }
2117
2118
2119 /**
2120 * Return fractional part of 'a' computed as a - floor(a)
2121 * Typically used in texture coord arithmetic.
2122 */
2123 LLVMValueRef
2124 lp_build_fract(struct lp_build_context *bld,
2125 LLVMValueRef a)
2126 {
2127 assert(bld->type.floating);
2128 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2129 }
2130
2131
2132 /**
2133 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2134 * against 0.99999(9). (Will also return that value for NaNs.)
2135 */
2136 static inline LLVMValueRef
2137 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2138 {
2139 LLVMValueRef max;
2140
2141 /* this is the largest number smaller than 1.0 representable as float */
2142 max = lp_build_const_vec(bld->gallivm, bld->type,
2143 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2144 return lp_build_min_ext(bld, fract, max,
2145 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2146 }
2147
2148
2149 /**
2150 * Same as lp_build_fract, but guarantees that the result is always smaller
2151 * than one. Will also return the smaller-than-one value for infs, NaNs.
2152 */
2153 LLVMValueRef
2154 lp_build_fract_safe(struct lp_build_context *bld,
2155 LLVMValueRef a)
2156 {
2157 return clamp_fract(bld, lp_build_fract(bld, a));
2158 }
2159
2160
2161 /**
2162 * Return the integer part of a float (vector) value (== round toward zero).
2163 * The returned value is an integer (vector).
2164 * Ex: itrunc(-1.5) = -1
2165 */
2166 LLVMValueRef
2167 lp_build_itrunc(struct lp_build_context *bld,
2168 LLVMValueRef a)
2169 {
2170 LLVMBuilderRef builder = bld->gallivm->builder;
2171 const struct lp_type type = bld->type;
2172 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2173
2174 assert(type.floating);
2175 assert(lp_check_value(type, a));
2176
2177 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2178 }
2179
2180
2181 /**
2182 * Return float (vector) rounded to nearest integer (vector). The returned
2183 * value is an integer (vector).
2184 * Ex: iround(0.9) = 1
2185 * Ex: iround(-1.5) = -2
2186 */
2187 LLVMValueRef
2188 lp_build_iround(struct lp_build_context *bld,
2189 LLVMValueRef a)
2190 {
2191 LLVMBuilderRef builder = bld->gallivm->builder;
2192 const struct lp_type type = bld->type;
2193 LLVMTypeRef int_vec_type = bld->int_vec_type;
2194 LLVMValueRef res;
2195
2196 assert(type.floating);
2197
2198 assert(lp_check_value(type, a));
2199
2200 if ((util_cpu_caps.has_sse2 &&
2201 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2202 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2203 return lp_build_iround_nearest_sse2(bld, a);
2204 }
2205 if (arch_rounding_available(type)) {
2206 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2207 }
2208 else {
2209 LLVMValueRef half;
2210
2211 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2212
2213 if (type.sign) {
2214 LLVMTypeRef vec_type = bld->vec_type;
2215 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2216 (unsigned long long)1 << (type.width - 1));
2217 LLVMValueRef sign;
2218
2219 /* get sign bit */
2220 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2221 sign = LLVMBuildAnd(builder, sign, mask, "");
2222
2223 /* sign * 0.5 */
2224 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2225 half = LLVMBuildOr(builder, sign, half, "");
2226 half = LLVMBuildBitCast(builder, half, vec_type, "");
2227 }
2228
2229 res = LLVMBuildFAdd(builder, a, half, "");
2230 }
2231
2232 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2233
2234 return res;
2235 }
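
/*
 * Illustrative scalar equivalent of the generic path above (kept out of the
 * build): add 0.5 with the sign of 'a' and truncate.  Note the SSE2/arch
 * paths round halfway cases to nearest even instead, so e.g. 0.5 may convert
 * to 0 there but to 1 here.
 */
#if 0
#include <math.h>

static int
iround_scalar_sketch(float a)
{
   return (int)(a + copysignf(0.5f, a));
}
#endif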
2236
2237
2238 /**
2239 * Return floor of float (vector), result is an int (vector)
2240  * Ex: ifloor(1.1) = 1
2241  * Ex: ifloor(-1.1) = -2
2242 */
2243 LLVMValueRef
2244 lp_build_ifloor(struct lp_build_context *bld,
2245 LLVMValueRef a)
2246 {
2247 LLVMBuilderRef builder = bld->gallivm->builder;
2248 const struct lp_type type = bld->type;
2249 LLVMTypeRef int_vec_type = bld->int_vec_type;
2250 LLVMValueRef res;
2251
2252 assert(type.floating);
2253 assert(lp_check_value(type, a));
2254
2255 res = a;
2256 if (type.sign) {
2257 if (arch_rounding_available(type)) {
2258 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2259 }
2260 else {
2261 struct lp_type inttype;
2262 struct lp_build_context intbld;
2263 LLVMValueRef trunc, itrunc, mask;
2264
2265 assert(type.floating);
2266 assert(lp_check_value(type, a));
2267
2268 inttype = type;
2269 inttype.floating = 0;
2270 lp_build_context_init(&intbld, bld->gallivm, inttype);
2271
2272 /* round by truncation */
2273 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2274 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2275
2276 /*
2277 * fix values if rounding is wrong (for non-special cases)
2278 * - this is the case if trunc > a
2279 * The results of doing this with NaNs, very large values etc.
2280 * are undefined but this seems to be the case anyway.
2281 */
2282 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2283 /* cheapie minus one with mask since the mask is minus one / zero */
2284 return lp_build_add(&intbld, itrunc, mask);
2285 }
2286 }
2287
2288    /* round toward zero (truncate) */
2289 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2290
2291 return res;
2292 }
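
/*
 * Worked example for the non-arch signed path above (illustrative): for
 * a = -1.3, itrunc = -1 and trunc = -1.0 > a, so the comparison mask is
 * all ones, i.e. the integer -1, and itrunc + mask = -1 + (-1) = -2,
 * which is the expected ifloor(-1.3).
 */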
2293
2294
2295 /**
2296 * Return ceiling of float (vector), returning int (vector).
2297 * Ex: iceil( 1.1) = 2
2298 * Ex: iceil(-1.1) = -1
2299 */
2300 LLVMValueRef
2301 lp_build_iceil(struct lp_build_context *bld,
2302 LLVMValueRef a)
2303 {
2304 LLVMBuilderRef builder = bld->gallivm->builder;
2305 const struct lp_type type = bld->type;
2306 LLVMTypeRef int_vec_type = bld->int_vec_type;
2307 LLVMValueRef res;
2308
2309 assert(type.floating);
2310 assert(lp_check_value(type, a));
2311
2312 if (arch_rounding_available(type)) {
2313 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2314 }
2315 else {
2316 struct lp_type inttype;
2317 struct lp_build_context intbld;
2318 LLVMValueRef trunc, itrunc, mask;
2319
2320 assert(type.floating);
2321 assert(lp_check_value(type, a));
2322
2323 inttype = type;
2324 inttype.floating = 0;
2325 lp_build_context_init(&intbld, bld->gallivm, inttype);
2326
2327 /* round by truncation */
2328 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2329 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2330
2331 /*
2332 * fix values if rounding is wrong (for non-special cases)
2333 * - this is the case if trunc < a
2334 * The results of doing this with NaNs, very large values etc.
2335 * are undefined but this seems to be the case anyway.
2336 */
2337 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2338 /* cheapie plus one with mask since the mask is minus one / zero */
2339 return lp_build_sub(&intbld, itrunc, mask);
2340 }
2341
2342    /* round toward zero (truncate) */
2343 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2344
2345 return res;
2346 }
2347
2348
2349 /**
2350 * Combined ifloor() & fract().
2351 *
2352 * Preferred to calling the functions separately, as it will ensure that the
2353 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2354 */
2355 void
2356 lp_build_ifloor_fract(struct lp_build_context *bld,
2357 LLVMValueRef a,
2358 LLVMValueRef *out_ipart,
2359 LLVMValueRef *out_fpart)
2360 {
2361 LLVMBuilderRef builder = bld->gallivm->builder;
2362 const struct lp_type type = bld->type;
2363 LLVMValueRef ipart;
2364
2365 assert(type.floating);
2366 assert(lp_check_value(type, a));
2367
2368 if (arch_rounding_available(type)) {
2369 /*
2370 * floor() is easier.
2371 */
2372
2373 ipart = lp_build_floor(bld, a);
2374 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2375 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2376 }
2377 else {
2378 /*
2379 * ifloor() is easier.
2380 */
2381
2382 *out_ipart = lp_build_ifloor(bld, a);
2383 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2384 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2385 }
2386 }
2387
2388
2389 /**
2390 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2391 * always smaller than one.
2392 */
2393 void
2394 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2395 LLVMValueRef a,
2396 LLVMValueRef *out_ipart,
2397 LLVMValueRef *out_fpart)
2398 {
2399 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2400 *out_fpart = clamp_fract(bld, *out_fpart);
2401 }
2402
2403
2404 LLVMValueRef
2405 lp_build_sqrt(struct lp_build_context *bld,
2406 LLVMValueRef a)
2407 {
2408 LLVMBuilderRef builder = bld->gallivm->builder;
2409 const struct lp_type type = bld->type;
2410 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2411 char intrinsic[32];
2412
2413 assert(lp_check_value(type, a));
2414
2415 assert(type.floating);
2416 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2417
2418 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2419 }
2420
2421
2422 /**
2423  * Do one Newton-Raphson step to improve the precision of the reciprocal:
2424 *
2425 * x_{i+1} = x_i * (2 - a * x_i)
2426 *
2427  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2428  * +/-Inf, giving NaN instead.  Certain applications rely on the conformant
2429  * behavior, such as Google Earth, which does RCP(RSQRT(0.0)) when drawing
2430  * the Earth's halo.  It would be necessary to clamp the argument to prevent this.
2431 *
2432 * See also:
2433 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2434 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2435 */
2436 static inline LLVMValueRef
2437 lp_build_rcp_refine(struct lp_build_context *bld,
2438 LLVMValueRef a,
2439 LLVMValueRef rcp_a)
2440 {
2441 LLVMBuilderRef builder = bld->gallivm->builder;
2442 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2443 LLVMValueRef res;
2444
2445 res = LLVMBuildFMul(builder, a, rcp_a, "");
2446 res = LLVMBuildFSub(builder, two, res, "");
2447 res = LLVMBuildFMul(builder, rcp_a, res, "");
2448
2449 return res;
2450 }
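
/*
 * Illustrative worked example for the step above (kept out of the build):
 * if x = (1 - e) / a, then x * (2 - a * x) = (1 - e^2) / a, i.e. the
 * relative error squares with every iteration.
 */
#if 0
static float
rcp_refine_scalar_sketch(float a, float x)
{
   return x * (2.0f - a * x);
}

/*
 * Example: a = 3.0, x0 = 0.3 (exact value is 0.33333...):
 *   x1 = 0.3  * (2 - 3 * 0.3)  = 0.3  * 1.1  = 0.33    (error ~3.3e-3)
 *   x2 = 0.33 * (2 - 3 * 0.33) = 0.33 * 1.01 = 0.3333  (error ~3.3e-5)
 */
#endif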
2451
2452
2453 LLVMValueRef
2454 lp_build_rcp(struct lp_build_context *bld,
2455 LLVMValueRef a)
2456 {
2457 LLVMBuilderRef builder = bld->gallivm->builder;
2458 const struct lp_type type = bld->type;
2459
2460 assert(lp_check_value(type, a));
2461
2462 if(a == bld->zero)
2463 return bld->undef;
2464 if(a == bld->one)
2465 return bld->one;
2466 if(a == bld->undef)
2467 return bld->undef;
2468
2469 assert(type.floating);
2470
2471 if(LLVMIsConstant(a))
2472 return LLVMConstFDiv(bld->one, a);
2473
2474 /*
2475 * We don't use RCPPS because:
2476    * - it only has 10 bits of precision
2477    * - it doesn't even get the reciprocal of 1.0 exactly
2478    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2479    * - for recent processors the benefit over DIVPS is marginal and
2480    *   case dependent
2481    *
2482    * We could still use it on certain processors if benchmarks show that the
2483    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2484    * particular uses that require fewer workarounds.
2485 */
2486
2487 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2488 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2489 const unsigned num_iterations = 0;
2490 LLVMValueRef res;
2491 unsigned i;
2492 const char *intrinsic = NULL;
2493
2494 if (type.length == 4) {
2495 intrinsic = "llvm.x86.sse.rcp.ps";
2496 }
2497 else {
2498 intrinsic = "llvm.x86.avx.rcp.ps.256";
2499 }
2500
2501 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2502
2503 for (i = 0; i < num_iterations; ++i) {
2504 res = lp_build_rcp_refine(bld, a, res);
2505 }
2506
2507 return res;
2508 }
2509
2510 return LLVMBuildFDiv(builder, bld->one, a, "");
2511 }
2512
2513
2514 /**
2515 * Do one Newton-Raphson step to improve rsqrt precision:
2516 *
2517 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2518 *
2519 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2520 */
2521 static inline LLVMValueRef
2522 lp_build_rsqrt_refine(struct lp_build_context *bld,
2523 LLVMValueRef a,
2524 LLVMValueRef rsqrt_a)
2525 {
2526 LLVMBuilderRef builder = bld->gallivm->builder;
2527 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2528 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2529 LLVMValueRef res;
2530
2531 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2532 res = LLVMBuildFMul(builder, a, res, "");
2533 res = LLVMBuildFSub(builder, three, res, "");
2534 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2535 res = LLVMBuildFMul(builder, half, res, "");
2536
2537 return res;
2538 }
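
/*
 * Illustrative scalar form of the step above (kept out of the build):
 * starting from the roughly 12-bit estimate RSQRTPS provides, a single step
 * is typically enough to get close to full single precision, since the
 * relative error roughly squares with each iteration.
 */
#if 0
static float
rsqrt_refine_scalar_sketch(float a, float x)
{
   return 0.5f * x * (3.0f - a * x * x);
}
#endif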
2539
2540
2541 /**
2542 * Generate 1/sqrt(a).
2543 * Result is undefined for values < 0, infinity for +0.
2544 */
2545 LLVMValueRef
2546 lp_build_rsqrt(struct lp_build_context *bld,
2547 LLVMValueRef a)
2548 {
2549 const struct lp_type type = bld->type;
2550
2551 assert(lp_check_value(type, a));
2552
2553 assert(type.floating);
2554
2555 /*
2556 * This should be faster but all denormals will end up as infinity.
2557 */
2558 if (0 && lp_build_fast_rsqrt_available(type)) {
2559 const unsigned num_iterations = 1;
2560 LLVMValueRef res;
2561 unsigned i;
2562
2563 /* rsqrt(1.0) != 1.0 here */
2564 res = lp_build_fast_rsqrt(bld, a);
2565
2566 if (num_iterations) {
2567 /*
2568 * Newton-Raphson will result in NaN instead of infinity for zero,
2569 * and NaN instead of zero for infinity.
2570 * Also, need to ensure rsqrt(1.0) == 1.0.
2571 * All numbers smaller than FLT_MIN will result in +infinity
2572 * (rsqrtps treats all denormals as zero).
2573 */
2574 LLVMValueRef cmp;
2575 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2576 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2577
2578 for (i = 0; i < num_iterations; ++i) {
2579 res = lp_build_rsqrt_refine(bld, a, res);
2580 }
2581 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2582 res = lp_build_select(bld, cmp, inf, res);
2583 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2584 res = lp_build_select(bld, cmp, bld->zero, res);
2585 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2586 res = lp_build_select(bld, cmp, bld->one, res);
2587 }
2588
2589 return res;
2590 }
2591
2592 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2593 }
2594
2595 /**
2596  * Returns whether a fast (but inaccurate) rsqrt instruction is available.
2597  * (Callers may want to avoid rsqrt_fast when it is unavailable: e.g. x^0.5
2598  * can be computed as rsqrt_fast(x) * x, but without the instruction that
2599  * would expand to sqrt/div/mul, so it is clearly better to just call sqrt
2600  * and skip both the div and the mul.)
2601 */
2602 boolean
2603 lp_build_fast_rsqrt_available(struct lp_type type)
2604 {
2605 assert(type.floating);
2606
2607 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2608 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2609 return true;
2610 }
2611 return false;
2612 }
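
/*
 * Hypothetical caller-side sketch of the strategy described above (kept out
 * of the build): compute x^0.5 as x * rsqrt_fast(x) only when the fast
 * instruction exists, otherwise go straight to sqrt.
 */
#if 0
static LLVMValueRef
sqrt_via_rsqrt_sketch(struct lp_build_context *bld, LLVMValueRef x)
{
   if (lp_build_fast_rsqrt_available(bld->type)) {
      /* sqrt(x) = x * rsqrt(x): one rsqrt plus one mul. */
      return lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
   }
   /* Without the instruction, rsqrt itself falls back to sqrt + div,
    * so calling sqrt directly skips both the div and the mul. */
   return lp_build_sqrt(bld, x);
}
#endif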
2613
2614
2615 /**
2616 * Generate 1/sqrt(a).
2617 * Result is undefined for values < 0, infinity for +0.
2618 * Precision is limited, only ~10 bits guaranteed
2619 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2620 */
2621 LLVMValueRef
2622 lp_build_fast_rsqrt(struct lp_build_context *bld,
2623 LLVMValueRef a)
2624 {
2625 LLVMBuilderRef builder = bld->gallivm->builder;
2626 const struct lp_type type = bld->type;
2627
2628 assert(lp_check_value(type, a));
2629
2630 if (lp_build_fast_rsqrt_available(type)) {
2631 const char *intrinsic = NULL;
2632
2633 if (type.length == 4) {
2634 intrinsic = "llvm.x86.sse.rsqrt.ps";
2635 }
2636 else {
2637 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2638 }
2639 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2640 }
2641 else {
2642 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2643 }
2644 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2645 }
2646
2647
2648 /**
2649 * Generate sin(a) or cos(a) using polynomial approximation.
2650  * TODO: it might be worth recognizing sin and cos of the same source
2651  * (i.e. the d3d10 sincos opcode).  Doing both at the same time would be
2652  * far cheaper than calculating (nearly) everything twice.
2653  * It is not clear that this is common enough to be worth bothering,
2654  * though the scs opcode could also benefit from calculating both.
2655 */
2656 static LLVMValueRef
2657 lp_build_sin_or_cos(struct lp_build_context *bld,
2658 LLVMValueRef a,
2659 boolean cos)
2660 {
2661 struct gallivm_state *gallivm = bld->gallivm;
2662 LLVMBuilderRef b = gallivm->builder;
2663 struct lp_type int_type = lp_int_type(bld->type);
2664
2665 /*
2666 * take the absolute value,
2667 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2668 */
2669
2670 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2671 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2672
2673 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2674 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2675
2676 /*
2677 * scale by 4/Pi
2678 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2679 */
2680
2681 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2682 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2683
2684 /*
2685 * store the integer part of y in mm0
2686 * emm2 = _mm_cvttps_epi32(y);
2687 */
2688
2689 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2690
2691 /*
2692 * j=(j+1) & (~1) (see the cephes sources)
2693 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2694 */
2695
2696 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2697 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2698 /*
2699 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2700 */
2701 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2702 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2703
2704 /*
2705 * y = _mm_cvtepi32_ps(emm2);
2706 */
2707 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2708
2709 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2710 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2711 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2712 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2713
2714 /*
2715 * Argument used for poly selection and sign bit determination
2716 * is different for sin vs. cos.
2717 */
2718 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2719 emm2_and;
2720
2721 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2722 LLVMBuildNot(b, emm2_2, ""), ""),
2723 const_29, "sign_bit") :
2724 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2725 LLVMBuildShl(b, emm2_add,
2726 const_29, ""), ""),
2727 sign_mask, "sign_bit");
2728
2729 /*
2730    * get the polynomial selection mask:
2731    * there is one polynomial for 0 <= x <= Pi/4
2732    * and another one for Pi/4 < x <= Pi/2.
2733 * Both branches will be computed.
2734 *
2735 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2736 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2737 */
2738
2739 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2740 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2741 int_type, PIPE_FUNC_EQUAL,
2742 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2743
2744 /*
2745 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2746 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2747 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2748 */
2749 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2750 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2751 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2752
2753 /*
2754 * The magic pass: "Extended precision modular arithmetic"
2755 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2756 */
2757 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2758 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2759 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2760
2761 /*
2762    * Evaluate the first polynomial (0 <= x <= Pi/4)
2763 *
2764 * z = _mm_mul_ps(x,x);
2765 */
2766 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2767
2768 /*
2769 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2770 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2771 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2772 */
2773 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2774 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2775 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2776
2777 /*
2778 * y = *(v4sf*)_ps_coscof_p0;
2779 * y = _mm_mul_ps(y, z);
2780 */
2781 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2782 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2783 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2784 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2785
2786
2787 /*
2788 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2789 * y = _mm_sub_ps(y, tmp);
2790 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2791 */
2792 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2793 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2794 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2795 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2796 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2797
2798 /*
2799 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2800 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2801 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2802 */
2803 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2804 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2805 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2806
2807 /*
2808    * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2809 *
2810 * y2 = *(v4sf*)_ps_sincof_p0;
2811 * y2 = _mm_mul_ps(y2, z);
2812 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2813 * y2 = _mm_mul_ps(y2, z);
2814 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2815 * y2 = _mm_mul_ps(y2, z);
2816 * y2 = _mm_mul_ps(y2, x);
2817 * y2 = _mm_add_ps(y2, x);
2818 */
2819
2820 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2821 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2822 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2823 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2824
2825 /*
2826    * select the correct result from the two polynomials
2827 * xmm3 = poly_mask;
2828 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2829 * y = _mm_andnot_ps(xmm3, y);
2830 * y = _mm_or_ps(y,y2);
2831 */
2832 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2833 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2834 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2835 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2836 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2837 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2838
2839 /*
2840 * update the sign
2841 * y = _mm_xor_ps(y, sign_bit);
2842 */
2843 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2844 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2845
2846 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
2847
2848 /* clamp output to be within [-1, 1] */
2849 y_result = lp_build_clamp(bld, y_result,
2850 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
2851 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
2852 /* If a is -inf, inf or NaN then return NaN */
2853 y_result = lp_build_select(bld, isfinite, y_result,
2854 lp_build_const_vec(bld->gallivm, bld->type, NAN));
2855 return y_result;
2856 }
2857
2858
2859 /**
2860 * Generate sin(a)
2861 */
2862 LLVMValueRef
2863 lp_build_sin(struct lp_build_context *bld,
2864 LLVMValueRef a)
2865 {
2866 return lp_build_sin_or_cos(bld, a, FALSE);
2867 }
2868
2869
2870 /**
2871 * Generate cos(a)
2872 */
2873 LLVMValueRef
2874 lp_build_cos(struct lp_build_context *bld,
2875 LLVMValueRef a)
2876 {
2877 return lp_build_sin_or_cos(bld, a, TRUE);
2878 }
2879
2880
2881 /**
2882 * Generate pow(x, y)
2883 */
2884 LLVMValueRef
2885 lp_build_pow(struct lp_build_context *bld,
2886 LLVMValueRef x,
2887 LLVMValueRef y)
2888 {
2889 /* TODO: optimize the constant case */
2890 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2891 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2892 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2893 __FUNCTION__);
2894 }
2895
2896 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2897 }
2898
2899
2900 /**
2901 * Generate exp(x)
2902 */
2903 LLVMValueRef
2904 lp_build_exp(struct lp_build_context *bld,
2905 LLVMValueRef x)
2906 {
2907 /* log2(e) = 1/log(2) */
2908 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2909 1.4426950408889634);
2910
2911 assert(lp_check_value(bld->type, x));
2912
2913 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2914 }
2915
2916
2917 /**
2918 * Generate log(x)
2919 * Behavior is undefined with infs, 0s and nans
2920 */
2921 LLVMValueRef
2922 lp_build_log(struct lp_build_context *bld,
2923 LLVMValueRef x)
2924 {
2925 /* log(2) */
2926 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2927 0.69314718055994529);
2928
2929 assert(lp_check_value(bld->type, x));
2930
2931 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2932 }
2933
2934 /**
2935 * Generate log(x) that handles edge cases (infs, 0s and nans)
2936 */
2937 LLVMValueRef
2938 lp_build_log_safe(struct lp_build_context *bld,
2939 LLVMValueRef x)
2940 {
2941 /* log(2) */
2942 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2943 0.69314718055994529);
2944
2945 assert(lp_check_value(bld->type, x));
2946
2947 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
2948 }
2949
2950
2951 /**
2952 * Generate polynomial.
2953 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2954 */
2955 LLVMValueRef
2956 lp_build_polynomial(struct lp_build_context *bld,
2957 LLVMValueRef x,
2958 const double *coeffs,
2959 unsigned num_coeffs)
2960 {
2961 const struct lp_type type = bld->type;
2962 LLVMValueRef even = NULL, odd = NULL;
2963 LLVMValueRef x2;
2964 unsigned i;
2965
2966 assert(lp_check_value(bld->type, x));
2967
2968 /* TODO: optimize the constant case */
2969 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2970 LLVMIsConstant(x)) {
2971 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2972 __FUNCTION__);
2973 }
2974
2975 /*
2976    * Calculate odd and even terms separately to decrease data dependency
2977 * Ex:
2978 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2979 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2980 */
2981 x2 = lp_build_mul(bld, x, x);
2982
2983 for (i = num_coeffs; i--; ) {
2984 LLVMValueRef coeff;
2985
2986 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2987
2988 if (i % 2 == 0) {
2989 if (even)
2990 even = lp_build_mad(bld, x2, even, coeff);
2991 else
2992 even = coeff;
2993 } else {
2994 if (odd)
2995 odd = lp_build_mad(bld, x2, odd, coeff);
2996 else
2997 odd = coeff;
2998 }
2999 }
3000
3001 if (odd)
3002 return lp_build_mad(bld, odd, x, even);
3003 else if (even)
3004 return even;
3005 else
3006 return bld->undef;
3007 }
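
/*
 * Scalar sketch of the odd/even split above (kept out of the build): both
 * halves are plain Horner evaluations in x^2 that can proceed in parallel,
 * and a single final mad merges them.
 */
#if 0
static double
polynomial_scalar_sketch(double x, const double *coeffs, unsigned num_coeffs)
{
   double x2 = x * x;
   double even = 0.0, odd = 0.0;
   int i;

   for (i = (int)num_coeffs - 1; i >= 0; i--) {
      if (i % 2 == 0)
         even = even * x2 + coeffs[i];   /* c[0] + c[2]*x^2 + c[4]*x^4 + ... */
      else
         odd = odd * x2 + coeffs[i];     /* c[1] + c[3]*x^2 + c[5]*x^4 + ... */
   }
   return odd * x + even;
}
#endif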
3008
3009
3010 /**
3011 * Minimax polynomial fit of 2**x, in range [0, 1[
3012 */
3013 const double lp_build_exp2_polynomial[] = {
3014 #if EXP_POLY_DEGREE == 5
3015 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3016 0.693153073200168932794,
3017 0.240153617044375388211,
3018 0.0558263180532956664775,
3019 0.00898934009049466391101,
3020 0.00187757667519147912699
3021 #elif EXP_POLY_DEGREE == 4
3022 1.00000259337069434683,
3023 0.693003834469974940458,
3024 0.24144275689150793076,
3025 0.0520114606103070150235,
3026 0.0135341679161270268764
3027 #elif EXP_POLY_DEGREE == 3
3028 0.999925218562710312959,
3029 0.695833540494823811697,
3030 0.226067155427249155588,
3031 0.0780245226406372992967
3032 #elif EXP_POLY_DEGREE == 2
3033 1.00172476321474503578,
3034 0.657636275736077639316,
3035 0.33718943461968720704
3036 #else
3037 #error
3038 #endif
3039 };
3040
3041
3042 LLVMValueRef
3043 lp_build_exp2(struct lp_build_context *bld,
3044 LLVMValueRef x)
3045 {
3046 LLVMBuilderRef builder = bld->gallivm->builder;
3047 const struct lp_type type = bld->type;
3048 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3049 LLVMValueRef ipart = NULL;
3050 LLVMValueRef fpart = NULL;
3051 LLVMValueRef expipart = NULL;
3052 LLVMValueRef expfpart = NULL;
3053 LLVMValueRef res = NULL;
3054
3055 assert(lp_check_value(bld->type, x));
3056
3057 /* TODO: optimize the constant case */
3058 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3059 LLVMIsConstant(x)) {
3060 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3061 __FUNCTION__);
3062 }
3063
3064 assert(type.floating && type.width == 32);
3065
3066    /* We want to preserve NaN and make sure that for exp2, if x > 128
3067     * the result is INF and if it's smaller than -126.9 the result is 0 */
3068 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3069 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3070 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3071 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3072
3073 /* ipart = floor(x) */
3074 /* fpart = x - ipart */
3075 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3076
3077 /* expipart = (float) (1 << ipart) */
3078 expipart = LLVMBuildAdd(builder, ipart,
3079 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3080 expipart = LLVMBuildShl(builder, expipart,
3081 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3082 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3083
3084 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3085 ARRAY_SIZE(lp_build_exp2_polynomial));
3086
3087 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3088
3089 return res;
3090 }
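
/*
 * Scalar sketch of the construction above, assuming 32-bit IEEE-754 floats
 * and an input already clamped to [-126.99999, 128] (kept out of the build):
 * 2^ipart is built directly in the exponent field and multiplied by the
 * polynomial approximation of 2^fpart.
 */
#if 0
#include <math.h>
#include <stdint.h>

static float
exp2_scalar_sketch(float x)
{
   int ipart = (int)floorf(x);
   float fpart = x - (float)ipart;              /* in [0, 1) */
   union { float f; int32_t i; } expipart;
   float expfpart = 0.0f;
   unsigned i;

   /* 2^ipart: place (ipart + 127) in the exponent field */
   expipart.i = (ipart + 127) << 23;

   /* 2^fpart: minimax polynomial, evaluated with Horner here */
   for (i = ARRAY_SIZE(lp_build_exp2_polynomial); i--; )
      expfpart = expfpart * fpart + (float)lp_build_exp2_polynomial[i];

   return expipart.f * expfpart;
}
#endif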
3091
3092
3093
3094 /**
3095  * Extract the exponent of an IEEE-754 floating point value.
3096 *
3097 * Optionally apply an integer bias.
3098 *
3099 * Result is an integer value with
3100 *
3101 * ifloor(log2(x)) + bias
3102 */
3103 LLVMValueRef
3104 lp_build_extract_exponent(struct lp_build_context *bld,
3105 LLVMValueRef x,
3106 int bias)
3107 {
3108 LLVMBuilderRef builder = bld->gallivm->builder;
3109 const struct lp_type type = bld->type;
3110 unsigned mantissa = lp_mantissa(type);
3111 LLVMValueRef res;
3112
3113 assert(type.floating);
3114
3115 assert(lp_check_value(bld->type, x));
3116
3117 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3118
3119 res = LLVMBuildLShr(builder, x,
3120 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3121 res = LLVMBuildAnd(builder, res,
3122 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3123 res = LLVMBuildSub(builder, res,
3124 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3125
3126 return res;
3127 }
3128
3129
3130 /**
3131  * Extract the mantissa of an IEEE-754 floating point value.
3132  *
3133  * Result is a floating point value with
3134  *
3135  *   x / 2**floor(log2(x))
3136 */
3137 LLVMValueRef
3138 lp_build_extract_mantissa(struct lp_build_context *bld,
3139 LLVMValueRef x)
3140 {
3141 LLVMBuilderRef builder = bld->gallivm->builder;
3142 const struct lp_type type = bld->type;
3143 unsigned mantissa = lp_mantissa(type);
3144 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3145 (1ULL << mantissa) - 1);
3146 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3147 LLVMValueRef res;
3148
3149 assert(lp_check_value(bld->type, x));
3150
3151 assert(type.floating);
3152
3153 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3154
3155 /* res = x / 2**ipart */
3156 res = LLVMBuildAnd(builder, x, mantmask, "");
3157 res = LLVMBuildOr(builder, res, one, "");
3158 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3159
3160 return res;
3161 }
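
/*
 * Illustrative scalar versions of the two helpers above for 32-bit floats
 * (kept out of the build).  For example x = 12.0f (0x41400000) has biased
 * exponent 130, so the extracted exponent is 3 (= ifloor(log2(12))) and the
 * extracted mantissa is 1.5f (= 12 / 2^3).
 */
#if 0
#include <stdint.h>

static int
extract_exponent_sketch(float x, int bias)
{
   union { float f; uint32_t u; } v = { x };
   return (int)((v.u >> 23) & 0xff) - 127 + bias;
}

static float
extract_mantissa_sketch(float x)
{
   union { float f; uint32_t u; } v = { x };
   v.u = (v.u & 0x007fffff) | 0x3f800000;   /* force the exponent to 0 */
   return v.f;
}
#endif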
3162
3163
3164
3165 /**
3166  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[.
3167  * These coefficients can be generated with
3168 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3169 */
3170 const double lp_build_log2_polynomial[] = {
3171 #if LOG_POLY_DEGREE == 5
3172 2.88539008148777786488L,
3173 0.961796878841293367824L,
3174 0.577058946784739859012L,
3175 0.412914355135828735411L,
3176 0.308591899232910175289L,
3177 0.352376952300281371868L,
3178 #elif LOG_POLY_DEGREE == 4
3179 2.88539009343309178325L,
3180 0.961791550404184197881L,
3181 0.577440339438736392009L,
3182 0.403343858251329912514L,
3183 0.406718052498846252698L,
3184 #elif LOG_POLY_DEGREE == 3
3185 2.88538959748872753838L,
3186 0.961932915889597772928L,
3187 0.571118517972136195241L,
3188 0.493997535084709500285L,
3189 #else
3190 #error
3191 #endif
3192 };
3193
3194 /**
3195 * See http://www.devmaster.net/forums/showthread.php?p=43580
3196 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3197 * http://www.nezumi.demon.co.uk/consult/logx.htm
3198 *
3199 * If handle_edge_cases is true the function will perform computations
3200 * to match the required D3D10+ behavior for each of the edge cases.
3201 * That means that if input is:
3202  * - less than zero (down to and including -inf), then NaN will be returned
3203 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3204 * - +infinity, then +infinity will be returned
3205 * - NaN, then NaN will be returned
3206 *
3207 * Those checks are fairly expensive so if you don't need them make sure
3208 * handle_edge_cases is false.
3209 */
3210 void
3211 lp_build_log2_approx(struct lp_build_context *bld,
3212 LLVMValueRef x,
3213 LLVMValueRef *p_exp,
3214 LLVMValueRef *p_floor_log2,
3215 LLVMValueRef *p_log2,
3216 boolean handle_edge_cases)
3217 {
3218 LLVMBuilderRef builder = bld->gallivm->builder;
3219 const struct lp_type type = bld->type;
3220 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3221 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3222
3223 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3224 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3225 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3226
3227 LLVMValueRef i = NULL;
3228 LLVMValueRef y = NULL;
3229 LLVMValueRef z = NULL;
3230 LLVMValueRef exp = NULL;
3231 LLVMValueRef mant = NULL;
3232 LLVMValueRef logexp = NULL;
3233 LLVMValueRef p_z = NULL;
3234 LLVMValueRef res = NULL;
3235
3236 assert(lp_check_value(bld->type, x));
3237
3238 if(p_exp || p_floor_log2 || p_log2) {
3239 /* TODO: optimize the constant case */
3240 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3241 LLVMIsConstant(x)) {
3242 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3243 __FUNCTION__);
3244 }
3245
3246 assert(type.floating && type.width == 32);
3247
3248 /*
3249 * We don't explicitly handle denormalized numbers. They will yield a
3250       * result in the neighbourhood of -127, which appears to be
3251       * adequate.
3252 */
3253
3254 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3255
3256 /* exp = (float) exponent(x) */
3257 exp = LLVMBuildAnd(builder, i, expmask, "");
3258 }
3259
3260 if(p_floor_log2 || p_log2) {
3261 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3262 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3263 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3264 }
3265
3266 if (p_log2) {
3267 /* mant = 1 + (float) mantissa(x) */
3268 mant = LLVMBuildAnd(builder, i, mantmask, "");
3269 mant = LLVMBuildOr(builder, mant, one, "");
3270 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3271
3272 /* y = (mant - 1) / (mant + 1) */
3273 y = lp_build_div(bld,
3274 lp_build_sub(bld, mant, bld->one),
3275 lp_build_add(bld, mant, bld->one)
3276 );
3277
3278 /* z = y^2 */
3279 z = lp_build_mul(bld, y, y);
3280
3281 /* compute P(z) */
3282 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3283 ARRAY_SIZE(lp_build_log2_polynomial));
3284
3285 /* y * P(z) + logexp */
3286 res = lp_build_mad(bld, y, p_z, logexp);
3287
3288 if (type.floating && handle_edge_cases) {
3289 LLVMValueRef negmask, infmask, zmask;
3290 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3291 lp_build_const_vec(bld->gallivm, type, 0.0f));
3292 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3293 lp_build_const_vec(bld->gallivm, type, 0.0f));
3294 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3295 lp_build_const_vec(bld->gallivm, type, INFINITY));
3296
3297          /* If x is equal to inf make sure we return inf */
3298 res = lp_build_select(bld, infmask,
3299 lp_build_const_vec(bld->gallivm, type, INFINITY),
3300 res);
3301          /* If x is equal to 0, return -inf */
3302 res = lp_build_select(bld, zmask,
3303 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3304 res);
3305 /* If x is nan or less than 0, return nan */
3306 res = lp_build_select(bld, negmask,
3307 lp_build_const_vec(bld->gallivm, type, NAN),
3308 res);
3309 }
3310 }
3311
3312 if (p_exp) {
3313 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3314 *p_exp = exp;
3315 }
3316
3317 if (p_floor_log2)
3318 *p_floor_log2 = logexp;
3319
3320 if (p_log2)
3321 *p_log2 = res;
3322 }
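
/*
 * Scalar sketch of the polynomial path above, assuming a normal, positive
 * 32-bit float and ignoring the edge-case handling (kept out of the build):
 * split off the exponent, map the mantissa m in [1, 2) to
 * y = (m - 1) / (m + 1), and approximate log2(m) as y * P(y^2).
 */
#if 0
#include <stdint.h>

static float
log2_scalar_sketch(float x)
{
   union { float f; uint32_t u; } v = { x };
   float logexp = (float)((int)((v.u >> 23) & 0xff) - 127);
   float mant, y, z, p = 0.0f;
   unsigned i;

   v.u = (v.u & 0x007fffff) | 0x3f800000;   /* mant in [1, 2) */
   mant = v.f;

   y = (mant - 1.0f) / (mant + 1.0f);
   z = y * y;
   for (i = ARRAY_SIZE(lp_build_log2_polynomial); i--; )
      p = p * z + (float)lp_build_log2_polynomial[i];

   return y * p + logexp;                   /* y * P(z) + logexp */
}
#endif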
3323
3324
3325 /*
3326 * log2 implementation which doesn't have special code to
3327 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3328 * the results for those cases are undefined.
3329 */
3330 LLVMValueRef
3331 lp_build_log2(struct lp_build_context *bld,
3332 LLVMValueRef x)
3333 {
3334 LLVMValueRef res;
3335 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3336 return res;
3337 }
3338
3339 /*
3340 * Version of log2 which handles all edge cases.
3341 * Look at documentation of lp_build_log2_approx for
3342 * description of the behavior for each of the edge cases.
3343 */
3344 LLVMValueRef
3345 lp_build_log2_safe(struct lp_build_context *bld,
3346 LLVMValueRef x)
3347 {
3348 LLVMValueRef res;
3349 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3350 return res;
3351 }
3352
3353
3354 /**
3355 * Faster (and less accurate) log2.
3356 *
3357 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3358 *
3359 * Piece-wise linear approximation, with exact results when x is a
3360 * power of two.
3361 *
3362 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3363 */
3364 LLVMValueRef
3365 lp_build_fast_log2(struct lp_build_context *bld,
3366 LLVMValueRef x)
3367 {
3368 LLVMBuilderRef builder = bld->gallivm->builder;
3369 LLVMValueRef ipart;
3370 LLVMValueRef fpart;
3371
3372 assert(lp_check_value(bld->type, x));
3373
3374 assert(bld->type.floating);
3375
3376 /* ipart = floor(log2(x)) - 1 */
3377 ipart = lp_build_extract_exponent(bld, x, -1);
3378 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3379
3380 /* fpart = x / 2**ipart */
3381 fpart = lp_build_extract_mantissa(bld, x);
3382
3383 /* ipart + fpart */
3384 return LLVMBuildFAdd(builder, ipart, fpart, "");
3385 }
3386
3387
3388 /**
3389 * Fast implementation of iround(log2(x)).
3390 *
3391 * Not an approximation -- it should give accurate results all the time.
3392 */
3393 LLVMValueRef
3394 lp_build_ilog2(struct lp_build_context *bld,
3395 LLVMValueRef x)
3396 {
3397 LLVMBuilderRef builder = bld->gallivm->builder;
3398 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3399 LLVMValueRef ipart;
3400
3401 assert(bld->type.floating);
3402
3403 assert(lp_check_value(bld->type, x));
3404
3405    /* x * 2^(0.5), i.e. add 0.5 to log2(x) */
3406 x = LLVMBuildFMul(builder, x, sqrt2, "");
3407
3408 /* ipart = floor(log2(x) + 0.5) */
3409 ipart = lp_build_extract_exponent(bld, x, 0);
3410
3411 return ipart;
3412 }
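
/*
 * Worked example (illustrative): for x = 5.0, log2(5) ~= 2.32, so
 * iround(log2(x)) should be 2.  Scaling by sqrt(2) gives 7.07, and
 * extracting its exponent yields floor(log2(7.07)) = 2, i.e. the multiply
 * turns the exponent's floor into a round-to-nearest of log2(x).
 */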
3413
3414 LLVMValueRef
3415 lp_build_mod(struct lp_build_context *bld,
3416 LLVMValueRef x,
3417 LLVMValueRef y)
3418 {
3419 LLVMBuilderRef builder = bld->gallivm->builder;
3420 LLVMValueRef res;
3421 const struct lp_type type = bld->type;
3422
3423 assert(lp_check_value(type, x));
3424 assert(lp_check_value(type, y));
3425
3426 if (type.floating)
3427 res = LLVMBuildFRem(builder, x, y, "");
3428 else if (type.sign)
3429 res = LLVMBuildSRem(builder, x, y, "");
3430 else
3431 res = LLVMBuildURem(builder, x, y, "");
3432 return res;
3433 }
3434
3435
3436 /*
3437 * For floating inputs it creates and returns a mask
3438 * which is all 1's for channels which are NaN.
3439 * Channels inside x which are not NaN will be 0.
3440 */
3441 LLVMValueRef
3442 lp_build_isnan(struct lp_build_context *bld,
3443 LLVMValueRef x)
3444 {
3445 LLVMValueRef mask;
3446 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3447
3448 assert(bld->type.floating);
3449 assert(lp_check_value(bld->type, x));
3450
3451 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3452 "isnotnan");
3453 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3454 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3455 return mask;
3456 }
3457
3458 /* Returns all 1's for floating point numbers that are finite,
3459  * and returns all zeros for -inf,
3460  * inf and NaNs. */
3461 LLVMValueRef
3462 lp_build_isfinite(struct lp_build_context *bld,
3463 LLVMValueRef x)
3464 {
3465 LLVMBuilderRef builder = bld->gallivm->builder;
3466 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3467 struct lp_type int_type = lp_int_type(bld->type);
3468 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3469 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3470 0x7f800000);
3471
3472 if (!bld->type.floating) {
3473 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3474 }
3475 assert(bld->type.floating);
3476 assert(lp_check_value(bld->type, x));
3477 assert(bld->type.width == 32);
3478
3479 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3480 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3481 intx, infornan32);
3482 }
3483
3484 /*
3485 * Returns true if the number is nan or inf and false otherwise.
3486 * The input has to be a floating point vector.
3487 */
3488 LLVMValueRef
3489 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3490 const struct lp_type type,
3491 LLVMValueRef x)
3492 {
3493 LLVMBuilderRef builder = gallivm->builder;
3494 struct lp_type int_type = lp_int_type(type);
3495 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3496 0x7f800000);
3497 LLVMValueRef ret;
3498
3499 assert(type.floating);
3500
3501 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3502 ret = LLVMBuildAnd(builder, ret, const0, "");
3503 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3504 ret, const0);
3505
3506 return ret;
3507 }
3508
3509
3510 LLVMValueRef
3511 lp_build_fpstate_get(struct gallivm_state *gallivm)
3512 {
3513 if (util_cpu_caps.has_sse) {
3514 LLVMBuilderRef builder = gallivm->builder;
3515 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3516 gallivm,
3517 LLVMInt32TypeInContext(gallivm->context),
3518 "mxcsr_ptr");
3519 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3520 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3521 lp_build_intrinsic(builder,
3522 "llvm.x86.sse.stmxcsr",
3523 LLVMVoidTypeInContext(gallivm->context),
3524 &mxcsr_ptr8, 1, 0);
3525 return mxcsr_ptr;
3526 }
3527 return 0;
3528 }
3529
3530 void
3531 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3532 boolean zero)
3533 {
3534 if (util_cpu_caps.has_sse) {
3535 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3536 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3537
3538 LLVMBuilderRef builder = gallivm->builder;
3539 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3540 LLVMValueRef mxcsr =
3541 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3542
3543 if (util_cpu_caps.has_daz) {
3544          /* Enable denormals-are-zero mode */
3545 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3546 }
3547 if (zero) {
3548 mxcsr = LLVMBuildOr(builder, mxcsr,
3549 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3550 } else {
3551 mxcsr = LLVMBuildAnd(builder, mxcsr,
3552 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3553 }
3554
3555 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3556 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3557 }
3558 }
3559
3560 void
3561 lp_build_fpstate_set(struct gallivm_state *gallivm,
3562 LLVMValueRef mxcsr_ptr)
3563 {
3564 if (util_cpu_caps.has_sse) {
3565 LLVMBuilderRef builder = gallivm->builder;
3566 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3567 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3568 lp_build_intrinsic(builder,
3569 "llvm.x86.sse.ldmxcsr",
3570 LLVMVoidTypeInContext(gallivm->context),
3571 &mxcsr_ptr, 1, 0);
3572 }
3573 }
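
/*
 * Hypothetical usage sketch of the three helpers above (kept out of the
 * build): save the caller's FP state, flush denormals while the generated
 * arithmetic runs, then restore it.
 */
#if 0
static void
fpstate_usage_sketch(struct gallivm_state *gallivm)
{
   LLVMValueRef saved = lp_build_fpstate_get(gallivm);
   lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
   /* ... emit the code that benefits from FTZ/DAZ here ... */
   lp_build_fpstate_set(gallivm, saved);
}
#endif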