gallivm: use fallback code for mul_hi with llvm >= 7.0
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for special case values of a or b = 1 or 0 are done.
86 * NaN's are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle nan's for floating point numbers. If one of the
213 * inputs is nan the other should be returned (required by both D3D10+
214 * and OpenCL).
215 * The sse intrinsics return the second operand in case of nan by
216 * default, so we need special code to handle those.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
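/*
 * Usage sketch (illustrative only; "coords", "scale" and "offset" are
 * hypothetical values a caller would already have):
 *
 *    LLVMValueRef r = lp_build_fmuladd(builder, coords, scale, offset);
 *
 * computes r = coords * scale + offset. For a 4 x float vector type this
 * emits llvm.fmuladd.v4f32, which LLVM lowers to a fused fma where the
 * target supports it and to a separate mul + add otherwise.
 */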
299
300
301 /**
302 * Generate max(a, b)
303 * No checks for special case values of a or b = 1 or 0 are done.
304 * NaN's are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if (a == bld->zero)
545 return b;
546 if (b == bld->zero)
547 return a;
548 if (a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if (type.norm) {
552 const char *intrinsic = NULL;
553
554 if (!type.sign && (a == bld->one || b == bld->one))
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (HAVE_LLVM >= 0x0900) {
559 char intrin[32];
560 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
561 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
562 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
563 }
564 if (type.width * type.length == 128) {
565 if (util_cpu_caps.has_sse2) {
566 if (type.width == 8)
567 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
568 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
569 if (type.width == 16)
570 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
571 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
572 } else if (util_cpu_caps.has_altivec) {
573 if (type.width == 8)
574 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
575 if (type.width == 16)
576 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
577 }
578 }
579 if (type.width * type.length == 256) {
580 if (util_cpu_caps.has_avx2) {
581 if (type.width == 8)
582 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
583 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
584 if (type.width == 16)
585 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
586 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
587 }
588 }
589 }
590
591 if (intrinsic)
592 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
593 }
594
595 if(type.norm && !type.floating && !type.fixed) {
596 if (type.sign) {
597 uint64_t sign = (uint64_t)1 << (type.width - 1);
598 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
599 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
600 /* a_clamp_max is the maximum a for positive b,
601 a_clamp_min is the minimum a for negative b. */
602 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
603 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
604 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
605 }
606 }
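/*
 * For illustration, with 16-bit signed normalized values and a = 28672,
 * b = 12288: max_val - b = 32767 - 12288 = 20479, so the clamp above
 * replaces a with 20479 and the add below yields 20479 + 12288 = 32767,
 * i.e. the sum saturates at the maximum instead of wrapping around.
 */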
607
608 if(LLVMIsConstant(a) && LLVMIsConstant(b))
609 if (type.floating)
610 res = LLVMConstFAdd(a, b);
611 else
612 res = LLVMConstAdd(a, b);
613 else
614 if (type.floating)
615 res = LLVMBuildFAdd(builder, a, b, "");
616 else
617 res = LLVMBuildAdd(builder, a, b, "");
618
619 /* clamp to ceiling of 1.0 */
620 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
621 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
622
623 if (type.norm && !type.floating && !type.fixed) {
624 if (!type.sign) {
625 /*
626 * newer llvm versions no longer support the intrinsics, but recognize
627 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
628 * code, it is important we match the pattern llvm uses (and pray llvm
629 * doesn't change it - and hope they decide on the same pattern for
630 * all backends supporting it...).
631 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
632 * interfere with llvm's ability to recognize the pattern but seems
633 * a bit brittle.
634 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
635 */
636 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
637 res = lp_build_select(bld, overflowed,
638 LLVMConstAllOnes(bld->int_vec_type), res);
639 }
640 }
641
642 /* XXX clamp to floor of -1 or 0??? */
643
644 return res;
645 }
646
647
648 /** Return the scalar sum of the elements of a.
649 * Should avoid this operation whenever possible.
650 */
651 LLVMValueRef
652 lp_build_horizontal_add(struct lp_build_context *bld,
653 LLVMValueRef a)
654 {
655 LLVMBuilderRef builder = bld->gallivm->builder;
656 const struct lp_type type = bld->type;
657 LLVMValueRef index, res;
658 unsigned i, length;
659 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
660 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
661 LLVMValueRef vecres, elem2;
662
663 assert(lp_check_value(type, a));
664
665 if (type.length == 1) {
666 return a;
667 }
668
669 assert(!bld->type.norm);
670
671 /*
672 * For byte vectors this can be done much better with psadbw.
673 * Using repeated shuffle/adds here. Note that with multiple vectors
674 * this can be done more efficiently as outlined in the intel
675 * optimization manual.
676 * Note: could cause data rearrangement if used with smaller element
677 * sizes.
678 */
679
680 vecres = a;
681 length = type.length / 2;
682 while (length > 1) {
683 LLVMValueRef vec1, vec2;
684 for (i = 0; i < length; i++) {
685 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
686 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
687 }
688 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
689 LLVMConstVector(shuffles1, length), "");
690 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
691 LLVMConstVector(shuffles2, length), "");
692 if (type.floating) {
693 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
694 }
695 else {
696 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
697 }
698 length = length >> 1;
699 }
700
701 /* always have vector of size 2 here */
702 assert(length == 1);
703
704 index = lp_build_const_int32(bld->gallivm, 0);
705 res = LLVMBuildExtractElement(builder, vecres, index, "");
706 index = lp_build_const_int32(bld->gallivm, 1);
707 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
708
709 if (type.floating)
710 res = LLVMBuildFAdd(builder, res, elem2, "");
711 else
712 res = LLVMBuildAdd(builder, res, elem2, "");
713
714 return res;
715 }
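/*
 * Example: for a 4-wide float vector {x0, x1, x2, x3} the loop above runs
 * once, adding {x0, x1} + {x2, x3} = {x0+x2, x1+x3}, and the final
 * extract/add step returns (x0+x2) + (x1+x3).
 */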
716
717 /**
718 * Return the horizontal sums of 4 float vectors as a float4 vector.
719 * This uses the technique as outlined in Intel Optimization Manual.
720 */
721 static LLVMValueRef
722 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
723 LLVMValueRef src[4])
724 {
725 struct gallivm_state *gallivm = bld->gallivm;
726 LLVMBuilderRef builder = gallivm->builder;
727 LLVMValueRef shuffles[4];
728 LLVMValueRef tmp[4];
729 LLVMValueRef sumtmp[2], shuftmp[2];
730
731 /* lower half of regs */
732 shuffles[0] = lp_build_const_int32(gallivm, 0);
733 shuffles[1] = lp_build_const_int32(gallivm, 1);
734 shuffles[2] = lp_build_const_int32(gallivm, 4);
735 shuffles[3] = lp_build_const_int32(gallivm, 5);
736 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
737 LLVMConstVector(shuffles, 4), "");
738 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
739 LLVMConstVector(shuffles, 4), "");
740
741 /* upper half of regs */
742 shuffles[0] = lp_build_const_int32(gallivm, 2);
743 shuffles[1] = lp_build_const_int32(gallivm, 3);
744 shuffles[2] = lp_build_const_int32(gallivm, 6);
745 shuffles[3] = lp_build_const_int32(gallivm, 7);
746 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
747 LLVMConstVector(shuffles, 4), "");
748 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
749 LLVMConstVector(shuffles, 4), "");
750
751 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
752 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
753
754 shuffles[0] = lp_build_const_int32(gallivm, 0);
755 shuffles[1] = lp_build_const_int32(gallivm, 2);
756 shuffles[2] = lp_build_const_int32(gallivm, 4);
757 shuffles[3] = lp_build_const_int32(gallivm, 6);
758 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
759 LLVMConstVector(shuffles, 4), "");
760
761 shuffles[0] = lp_build_const_int32(gallivm, 1);
762 shuffles[1] = lp_build_const_int32(gallivm, 3);
763 shuffles[2] = lp_build_const_int32(gallivm, 5);
764 shuffles[3] = lp_build_const_int32(gallivm, 7);
765 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
766 LLVMConstVector(shuffles, 4), "");
767
768 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
769 }
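/*
 * To illustrate the shuffles above, with src = {a, b, c, d} (each a 4-wide
 * float vector):
 *   tmp[0] = {a0,a1,b0,b1}   tmp[1] = {a2,a3,b2,b3}
 *   tmp[2] = {c0,c1,d0,d1}   tmp[3] = {c2,c3,d2,d3}
 *   sumtmp[0] = {a0+a2, a1+a3, b0+b2, b1+b3}
 *   sumtmp[1] = {c0+c2, c1+c3, d0+d2, d1+d3}
 *   shuftmp[0] = {a0+a2, b0+b2, c0+c2, d0+d2}
 *   shuftmp[1] = {a1+a3, b1+b3, c1+c3, d1+d3}
 * so the final add returns {sum(a), sum(b), sum(c), sum(d)}.
 */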
770
771
772 /*
773 * partially horizontally add 2-4 float vectors with length nx4,
774 * i.e. only four adjacent values in each vector will be added,
775 * assuming values are really grouped in 4 which also determines
776 * output order.
777 *
778 * Return a vector of the same length as the initial vectors,
779 * with the excess elements (if any) being undefined.
780 * The element order is independent of number of input vectors.
781 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
782 * the output order thus will be
783 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
784 */
785 LLVMValueRef
786 lp_build_hadd_partial4(struct lp_build_context *bld,
787 LLVMValueRef vectors[],
788 unsigned num_vecs)
789 {
790 struct gallivm_state *gallivm = bld->gallivm;
791 LLVMBuilderRef builder = gallivm->builder;
792 LLVMValueRef ret_vec;
793 LLVMValueRef tmp[4];
794 const char *intrinsic = NULL;
795
796 assert(num_vecs >= 2 && num_vecs <= 4);
797 assert(bld->type.floating);
798
799 /* only use this with at least 2 vectors, as it is sort of expensive
800 * (depending on cpu) and we always need two horizontal adds anyway,
801 * so a shuffle/add approach might be better.
802 */
803
804 tmp[0] = vectors[0];
805 tmp[1] = vectors[1];
806
807 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
808 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
809
810 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
811 bld->type.length == 4) {
812 intrinsic = "llvm.x86.sse3.hadd.ps";
813 }
814 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
815 bld->type.length == 8) {
816 intrinsic = "llvm.x86.avx.hadd.ps.256";
817 }
818 if (intrinsic) {
819 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
820 lp_build_vec_type(gallivm, bld->type),
821 tmp[0], tmp[1]);
822 if (num_vecs > 2) {
823 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
824 lp_build_vec_type(gallivm, bld->type),
825 tmp[2], tmp[3]);
826 }
827 else {
828 tmp[1] = tmp[0];
829 }
830 return lp_build_intrinsic_binary(builder, intrinsic,
831 lp_build_vec_type(gallivm, bld->type),
832 tmp[0], tmp[1]);
833 }
834
835 if (bld->type.length == 4) {
836 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
837 }
838 else {
839 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
840 unsigned j;
841 unsigned num_iter = bld->type.length / 4;
842 struct lp_type parttype = bld->type;
843 parttype.length = 4;
844 for (j = 0; j < num_iter; j++) {
845 LLVMValueRef partsrc[4];
846 unsigned i;
847 for (i = 0; i < 4; i++) {
848 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
849 }
850 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
851 }
852 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
853 }
854 return ret_vec;
855 }
856
857 /**
858 * Generate a - b
859 */
860 LLVMValueRef
861 lp_build_sub(struct lp_build_context *bld,
862 LLVMValueRef a,
863 LLVMValueRef b)
864 {
865 LLVMBuilderRef builder = bld->gallivm->builder;
866 const struct lp_type type = bld->type;
867 LLVMValueRef res;
868
869 assert(lp_check_value(type, a));
870 assert(lp_check_value(type, b));
871
872 if (b == bld->zero)
873 return a;
874 if (a == bld->undef || b == bld->undef)
875 return bld->undef;
876 if (a == b)
877 return bld->zero;
878
879 if (type.norm) {
880 const char *intrinsic = NULL;
881
882 if (!type.sign && b == bld->one)
883 return bld->zero;
884
885 if (!type.floating && !type.fixed) {
886 if (HAVE_LLVM >= 0x0900) {
887 char intrin[32];
888 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
889 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
890 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
891 }
892 if (type.width * type.length == 128) {
893 if (util_cpu_caps.has_sse2) {
894 if (type.width == 8)
895 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
896 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
897 if (type.width == 16)
898 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
899 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
900 } else if (util_cpu_caps.has_altivec) {
901 if (type.width == 8)
902 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
903 if (type.width == 16)
904 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
905 }
906 }
907 if (type.width * type.length == 256) {
908 if (util_cpu_caps.has_avx2) {
909 if (type.width == 8)
910 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
911 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
912 if (type.width == 16)
913 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
914 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
915 }
916 }
917 }
918
919 if (intrinsic)
920 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
921 }
922
923 if(type.norm && !type.floating && !type.fixed) {
924 if (type.sign) {
925 uint64_t sign = (uint64_t)1 << (type.width - 1);
926 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
927 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
928 /* a_clamp_max is the maximum a for negative b,
929 a_clamp_min is the minimum a for positive b. */
930 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
931 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
932 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
933 } else {
934 /*
935 * This must match llvm pattern for saturated unsigned sub.
936 * (lp_build_max_simple actually does the job with its current
937 * definition but do it explicitly here.)
938 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
939 * interfere with llvm's ability to recognize the pattern but seems
940 * a bit brittle.
941 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
942 */
943 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
944 a = lp_build_select(bld, no_ov, a, b);
945 }
946 }
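/*
 * For illustration of the unsigned case: with 8-bit unsigned normalized
 * values a = 10, b = 200 the select above replaces a with b, so the
 * subtraction below computes 200 - 200 = 0, i.e. the result saturates at
 * zero instead of wrapping around.
 */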
947
948 if(LLVMIsConstant(a) && LLVMIsConstant(b))
949 if (type.floating)
950 res = LLVMConstFSub(a, b);
951 else
952 res = LLVMConstSub(a, b);
953 else
954 if (type.floating)
955 res = LLVMBuildFSub(builder, a, b, "");
956 else
957 res = LLVMBuildSub(builder, a, b, "");
958
959 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
960 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
961
962 return res;
963 }
964
965
966
967 /**
968 * Normalized multiplication.
969 *
970 * There are several approaches (using 8-bit normalized multiplication as
971 * an example):
972 *
973 * - alpha plus one
974 *
975 * makes the following approximation to the division (Sree)
976 *
977 * a*b/255 ~= (a*(b + 1)) >> 8
978 *
979 * which is the fastest method that satisfies the following OpenGL criteria of
980 *
981 * 0*0 = 0 and 255*255 = 255
982 *
983 * - geometric series
984 *
985 * takes the geometric series approximation to the division
986 *
987 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
988 *
989 * in this case just the first two terms to fit in 16bit arithmetic
990 *
991 * t/255 ~= (t + (t >> 8)) >> 8
992 *
993 * note that just by itself it doesn't satisfy the OpenGL criteria, as
994 * it yields 254 for 255*255, so the special case b = 255 must be
995 * accounted for or roundoff must be used.
996 *
997 * - geometric series plus rounding
998 *
999 * when using the geometric series division, instead of truncating the
1000 * result use rounding in the approximation (Jim Blinn)
1001 *
1002 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
1003 *
1004 * which achieves exact results.
1005 *
1006 *
1007 *
1008 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
1009 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
1010 * @sa Michael Herf, The "double blend trick", May 2000,
1011 * http://www.stereopsis.com/doubleblend.html
1012 */
1013 LLVMValueRef
1014 lp_build_mul_norm(struct gallivm_state *gallivm,
1015 struct lp_type wide_type,
1016 LLVMValueRef a, LLVMValueRef b)
1017 {
1018 LLVMBuilderRef builder = gallivm->builder;
1019 struct lp_build_context bld;
1020 unsigned n;
1021 LLVMValueRef half;
1022 LLVMValueRef ab;
1023
1024 assert(!wide_type.floating);
1025 assert(lp_check_value(wide_type, a));
1026 assert(lp_check_value(wide_type, b));
1027
1028 lp_build_context_init(&bld, gallivm, wide_type);
1029
1030 n = wide_type.width / 2;
1031 if (wide_type.sign) {
1032 --n;
1033 }
1034
1035 /*
1036 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1037 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1038 */
1039
1040 /*
1041 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1042 */
1043
1044 ab = LLVMBuildMul(builder, a, b, "");
1045 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1046
1047 /*
1048 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1049 */
1050
1051 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1052 if (wide_type.sign) {
1053 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1054 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1055 half = lp_build_select(&bld, sign, minus_half, half);
1056 }
1057 ab = LLVMBuildAdd(builder, ab, half, "");
1058
1059 /* Final division */
1060 ab = lp_build_shr_imm(&bld, ab, n);
1061
1062 return ab;
1063 }
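/*
 * Worked example for a 16-bit wide type holding 8-bit unsigned normalized
 * values (n = 8), with a = b = 255 (i.e. 1.0):
 *   ab  = 255 * 255        = 65025
 *   ab += ab >> 8          = 65025 + 254 = 65279
 *   ab += half (0x80)      = 65407
 *   ab >>= 8               = 255
 * so 1.0 * 1.0 yields exactly 1.0, as required.
 */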
1064
1065 /**
1066 * Generate a * b
1067 */
1068 LLVMValueRef
1069 lp_build_mul(struct lp_build_context *bld,
1070 LLVMValueRef a,
1071 LLVMValueRef b)
1072 {
1073 LLVMBuilderRef builder = bld->gallivm->builder;
1074 const struct lp_type type = bld->type;
1075 LLVMValueRef shift;
1076 LLVMValueRef res;
1077
1078 assert(lp_check_value(type, a));
1079 assert(lp_check_value(type, b));
1080
1081 if(a == bld->zero)
1082 return bld->zero;
1083 if(a == bld->one)
1084 return b;
1085 if(b == bld->zero)
1086 return bld->zero;
1087 if(b == bld->one)
1088 return a;
1089 if(a == bld->undef || b == bld->undef)
1090 return bld->undef;
1091
1092 if (!type.floating && !type.fixed && type.norm) {
1093 struct lp_type wide_type = lp_wider_type(type);
1094 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1095
1096 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1097 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1098
1099 /* PMULLW, PSRLW, PADDW */
1100 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1101 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1102
1103 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1104
1105 return ab;
1106 }
1107
1108 if(type.fixed)
1109 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1110 else
1111 shift = NULL;
1112
1113 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1114 if (type.floating)
1115 res = LLVMConstFMul(a, b);
1116 else
1117 res = LLVMConstMul(a, b);
1118 if(shift) {
1119 if(type.sign)
1120 res = LLVMConstAShr(res, shift);
1121 else
1122 res = LLVMConstLShr(res, shift);
1123 }
1124 }
1125 else {
1126 if (type.floating)
1127 res = LLVMBuildFMul(builder, a, b, "");
1128 else
1129 res = LLVMBuildMul(builder, a, b, "");
1130 if(shift) {
1131 if(type.sign)
1132 res = LLVMBuildAShr(builder, res, shift, "");
1133 else
1134 res = LLVMBuildLShr(builder, res, shift, "");
1135 }
1136 }
1137
1138 return res;
1139 }
1140
1141 /*
1142 * Widening mul, valid for 32x32 bit -> 64bit only.
1143 * Result is low 32bits, high bits returned in res_hi.
1144 *
1145 * Emits code that is meant to be compiled for the host CPU.
1146 */
1147 LLVMValueRef
1148 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1149 LLVMValueRef a,
1150 LLVMValueRef b,
1151 LLVMValueRef *res_hi)
1152 {
1153 struct gallivm_state *gallivm = bld->gallivm;
1154 LLVMBuilderRef builder = gallivm->builder;
1155
1156 assert(bld->type.width == 32);
1157 assert(bld->type.floating == 0);
1158 assert(bld->type.fixed == 0);
1159 assert(bld->type.norm == 0);
1160
1161 /*
1162 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1163 * for x86 simd is atrocious (even if the high bits weren't required),
1164 * trying to handle real 64bit inputs (which of course can't happen due
1165 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1166 * apparently llvm does not recognize this widening mul). This includes 6
1167 * (instead of 2) pmuludq plus extra adds and shifts
1168 * The same story applies to signed mul, albeit fixing this requires sse41.
1169 * https://llvm.org/bugs/show_bug.cgi?id=30845
1170 * So, whip up our own code, albeit only for length 4 and 8 (which
1171 * should be good enough)...
1172 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1173 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1174 * for signed), which the fallback code does not, without this llvm
1175 * will likely still produce atrocious code.
1176 */
1177 if (HAVE_LLVM < 0x0700 &&
1178 (bld->type.length == 4 || bld->type.length == 8) &&
1179 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1180 util_cpu_caps.has_sse4_1)) {
1181 const char *intrinsic = NULL;
1182 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1183 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1184 struct lp_type type_wide = lp_wider_type(bld->type);
1185 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1186 unsigned i;
1187 for (i = 0; i < bld->type.length; i += 2) {
1188 shuf[i] = lp_build_const_int32(gallivm, i+1);
1189 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1190 }
1191 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1192 aeven = a;
1193 beven = b;
1194 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1195 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1196
1197 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1198 if (bld->type.sign) {
1199 intrinsic = "llvm.x86.avx2.pmul.dq";
1200 } else {
1201 intrinsic = "llvm.x86.avx2.pmulu.dq";
1202 }
1203 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1204 wider_type, aeven, beven);
1205 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1206 wider_type, aodd, bodd);
1207 }
1208 else {
1209 /* for consistent naming look elsewhere... */
1210 if (bld->type.sign) {
1211 intrinsic = "llvm.x86.sse41.pmuldq";
1212 } else {
1213 intrinsic = "llvm.x86.sse2.pmulu.dq";
1214 }
1215 /*
1216 * XXX If we only have AVX but not AVX2 this is a pain.
1217 * lp_build_intrinsic_binary_anylength() can't handle it
1218 * (due to src and dst type not being identical).
1219 */
1220 if (bld->type.length == 8) {
1221 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1222 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1223 LLVMValueRef muleven2[2], mulodd2[2];
1224 struct lp_type type_wide_half = type_wide;
1225 LLVMTypeRef wtype_half;
1226 type_wide_half.length = 2;
1227 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1228 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1229 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1230 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1231 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1232 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1233 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1234 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1235 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1236 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1237 wtype_half, aevenlo, bevenlo);
1238 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1239 wtype_half, aoddlo, boddlo);
1240 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1241 wtype_half, aevenhi, bevenhi);
1242 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1243 wtype_half, aoddhi, boddhi);
1244 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1245 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1246
1247 }
1248 else {
1249 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1250 wider_type, aeven, beven);
1251 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1252 wider_type, aodd, bodd);
1253 }
1254 }
1255 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1256 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1257
1258 for (i = 0; i < bld->type.length; i += 2) {
1259 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1260 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1261 }
1262 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1263 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1264
1265 for (i = 0; i < bld->type.length; i += 2) {
1266 shuf[i] = lp_build_const_int32(gallivm, i);
1267 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1268 }
1269 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1270 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1271 }
1272 else {
1273 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1274 }
1275 }
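/*
 * Lane layout sketch for the length-4 case above, assuming x86's
 * little-endian lane order: with a = {a0,a1,a2,a3}, aodd is shuffled to
 * {a1,undef,a3,undef}. pmuludq/pmuldq multiply lanes 0 and 2 of their
 * inputs into 64-bit results, so muleven = {a0*b0, a2*b2} and
 * mulodd = {a1*b1, a3*b3} (as 2 x i64). After the bitcast to 4 x i32 the
 * two shuffles gather the high and low 32-bit halves respectively, giving
 * *res_hi = {hi(a0*b0), hi(a1*b1), hi(a2*b2), hi(a3*b3)} and the returned
 * value {lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3)}.
 */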
1276
1277
1278 /*
1279 * Widening mul, valid for 32x32 bit -> 64bit only.
1280 * Result is low 32bits, high bits returned in res_hi.
1281 *
1282 * Emits generic code.
1283 */
1284 LLVMValueRef
1285 lp_build_mul_32_lohi(struct lp_build_context *bld,
1286 LLVMValueRef a,
1287 LLVMValueRef b,
1288 LLVMValueRef *res_hi)
1289 {
1290 struct gallivm_state *gallivm = bld->gallivm;
1291 LLVMBuilderRef builder = gallivm->builder;
1292 LLVMValueRef tmp, shift, res_lo;
1293 struct lp_type type_tmp;
1294 LLVMTypeRef wide_type, narrow_type;
1295
1296 type_tmp = bld->type;
1297 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1298 type_tmp.width *= 2;
1299 wide_type = lp_build_vec_type(gallivm, type_tmp);
1300 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1301
1302 if (bld->type.sign) {
1303 a = LLVMBuildSExt(builder, a, wide_type, "");
1304 b = LLVMBuildSExt(builder, b, wide_type, "");
1305 } else {
1306 a = LLVMBuildZExt(builder, a, wide_type, "");
1307 b = LLVMBuildZExt(builder, b, wide_type, "");
1308 }
1309 tmp = LLVMBuildMul(builder, a, b, "");
1310
1311 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1312
1313 /* Since we truncate anyway, LShr and AShr are equivalent. */
1314 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1315 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1316
1317 return res_lo;
1318 }
1319
1320
1321 /* a * b + c */
1322 LLVMValueRef
1323 lp_build_mad(struct lp_build_context *bld,
1324 LLVMValueRef a,
1325 LLVMValueRef b,
1326 LLVMValueRef c)
1327 {
1328 const struct lp_type type = bld->type;
1329 if (type.floating) {
1330 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1331 } else {
1332 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1333 }
1334 }
1335
1336
1337 /**
1338 * Small vector x scale multiplication optimization.
1339 */
1340 LLVMValueRef
1341 lp_build_mul_imm(struct lp_build_context *bld,
1342 LLVMValueRef a,
1343 int b)
1344 {
1345 LLVMBuilderRef builder = bld->gallivm->builder;
1346 LLVMValueRef factor;
1347
1348 assert(lp_check_value(bld->type, a));
1349
1350 if(b == 0)
1351 return bld->zero;
1352
1353 if(b == 1)
1354 return a;
1355
1356 if(b == -1)
1357 return lp_build_negate(bld, a);
1358
1359 if(b == 2 && bld->type.floating)
1360 return lp_build_add(bld, a, a);
1361
1362 if(util_is_power_of_two_or_zero(b)) {
1363 unsigned shift = ffs(b) - 1;
1364
1365 if(bld->type.floating) {
1366 #if 0
1367 /*
1368 * Power of two multiplication by directly manipulating the exponent.
1369 *
1370 * XXX: This might not always be faster; it will introduce a small error
1371 * for multiplication by zero, and it will produce wrong results
1372 * for Inf and NaN.
1373 */
1374 unsigned mantissa = lp_mantissa(bld->type);
1375 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1376 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1377 a = LLVMBuildAdd(builder, a, factor, "");
1378 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1379 return a;
1380 #endif
1381 }
1382 else {
1383 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1384 return LLVMBuildShl(builder, a, factor, "");
1385 }
1386 }
1387
1388 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1389 return lp_build_mul(bld, a, factor);
1390 }
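/*
 * E.g. b = 8 gives shift = ffs(8) - 1 = 3, so the integer path above turns
 * the multiply into a single shift left by 3; factors that are not powers
 * of two fall through to a regular lp_build_mul with a splatted constant.
 */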
1391
1392
1393 /**
1394 * Generate a / b
1395 */
1396 LLVMValueRef
1397 lp_build_div(struct lp_build_context *bld,
1398 LLVMValueRef a,
1399 LLVMValueRef b)
1400 {
1401 LLVMBuilderRef builder = bld->gallivm->builder;
1402 const struct lp_type type = bld->type;
1403
1404 assert(lp_check_value(type, a));
1405 assert(lp_check_value(type, b));
1406
1407 if(a == bld->zero)
1408 return bld->zero;
1409 if(a == bld->one && type.floating)
1410 return lp_build_rcp(bld, b);
1411 if(b == bld->zero)
1412 return bld->undef;
1413 if(b == bld->one)
1414 return a;
1415 if(a == bld->undef || b == bld->undef)
1416 return bld->undef;
1417
1418 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1419 if (type.floating)
1420 return LLVMConstFDiv(a, b);
1421 else if (type.sign)
1422 return LLVMConstSDiv(a, b);
1423 else
1424 return LLVMConstUDiv(a, b);
1425 }
1426
1427 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1428 if(FALSE &&
1429 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1430 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1431 type.floating)
1432 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1433
1434 if (type.floating)
1435 return LLVMBuildFDiv(builder, a, b, "");
1436 else if (type.sign)
1437 return LLVMBuildSDiv(builder, a, b, "");
1438 else
1439 return LLVMBuildUDiv(builder, a, b, "");
1440 }
1441
1442
1443 /**
1444 * Linear interpolation helper.
1445 *
1446 * @param flags  LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1447 * interpolating normalized values, encoded in integers twice as wide.
1448 *
1449 * @sa http://www.stereopsis.com/doubleblend.html
1450 */
1451 static inline LLVMValueRef
1452 lp_build_lerp_simple(struct lp_build_context *bld,
1453 LLVMValueRef x,
1454 LLVMValueRef v0,
1455 LLVMValueRef v1,
1456 unsigned flags)
1457 {
1458 unsigned half_width = bld->type.width/2;
1459 LLVMBuilderRef builder = bld->gallivm->builder;
1460 LLVMValueRef delta;
1461 LLVMValueRef res;
1462
1463 assert(lp_check_value(bld->type, x));
1464 assert(lp_check_value(bld->type, v0));
1465 assert(lp_check_value(bld->type, v1));
1466
1467 delta = lp_build_sub(bld, v1, v0);
1468
1469 if (bld->type.floating) {
1470 assert(flags == 0);
1471 return lp_build_mad(bld, x, delta, v0);
1472 }
1473
1474 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1475 if (!bld->type.sign) {
1476 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1477 /*
1478 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1479 * most significant bit to the least significant bit, so that
1480 * later we can just divide by 2**n instead of 2**n - 1.
1481 */
1482
1483 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1484 }
1485
1486 /* (x * delta) >> n */
1487 res = lp_build_mul(bld, x, delta);
1488 res = lp_build_shr_imm(bld, res, half_width);
1489 } else {
1490 /*
1491 * The rescaling trick above doesn't work for signed numbers, so
1492 * use the 2**n - 1 division approximation in lp_build_mul_norm
1493 * instead.
1494 */
1495 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1496 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1497 }
1498 } else {
1499 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1500 res = lp_build_mul(bld, x, delta);
1501 }
1502
1503 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1504 /*
1505 * At this point both res and v0 only use the lower half of the bits,
1506 * the rest is zero. Instead of add / mask, do add with half wide type.
1507 */
1508 struct lp_type narrow_type;
1509 struct lp_build_context narrow_bld;
1510
1511 memset(&narrow_type, 0, sizeof narrow_type);
1512 narrow_type.sign = bld->type.sign;
1513 narrow_type.width = bld->type.width/2;
1514 narrow_type.length = bld->type.length*2;
1515
1516 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1517 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1518 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1519 res = lp_build_add(&narrow_bld, v0, res);
1520 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1521 } else {
1522 res = lp_build_add(bld, v0, res);
1523
1524 if (bld->type.fixed) {
1525 /*
1526 * We need to mask out the high order bits when lerping 8bit
1527 * normalized colors stored on 16bits
1528 */
1529 /* XXX: This step is necessary for lerping 8bit colors stored on
1530 * 16bits, but it will be wrong for true fixed point use cases.
1531 * Basically we need a more powerful lp_type, capable of further
1532 * distinguishing the values interpretation from the value storage.
1533 */
1534 LLVMValueRef low_bits;
1535 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1536 res = LLVMBuildAnd(builder, res, low_bits, "");
1537 }
1538 }
1539
1540 return res;
1541 }
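/*
 * Rescaling example for the unsigned LP_BLD_LERP_WIDE_NORMALIZED path
 * above, with 8-bit weights held in 16-bit lanes (half_width = 8):
 * x = 255 becomes 255 + (255 >> 7) = 256, so res = (256 * delta) >> 8 =
 * delta and the final add yields exactly v1; x = 0 stays 0 and yields
 * exactly v0.
 */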
1542
1543
1544 /**
1545 * Linear interpolation.
1546 */
1547 LLVMValueRef
1548 lp_build_lerp(struct lp_build_context *bld,
1549 LLVMValueRef x,
1550 LLVMValueRef v0,
1551 LLVMValueRef v1,
1552 unsigned flags)
1553 {
1554 const struct lp_type type = bld->type;
1555 LLVMValueRef res;
1556
1557 assert(lp_check_value(type, x));
1558 assert(lp_check_value(type, v0));
1559 assert(lp_check_value(type, v1));
1560
1561 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1562
1563 if (type.norm) {
1564 struct lp_type wide_type;
1565 struct lp_build_context wide_bld;
1566 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1567
1568 assert(type.length >= 2);
1569
1570 /*
1571 * Create a wider integer type, enough to hold the
1572 * intermediate result of the multiplication.
1573 */
1574 memset(&wide_type, 0, sizeof wide_type);
1575 wide_type.sign = type.sign;
1576 wide_type.width = type.width*2;
1577 wide_type.length = type.length/2;
1578
1579 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1580
1581 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1582 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1583 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1584
1585 /*
1586 * Lerp both halves.
1587 */
1588
1589 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1590
1591 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1592 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1593
1594 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1595 } else {
1596 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1597 }
1598
1599 return res;
1600 }
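/*
 * E.g. a 16 x i8 normalized lerp is performed as two 8 x i16 lerps on the
 * unpacked halves (so the 8-bit x 8-bit intermediate products fit), and
 * the two halves are then packed back into a single 16 x i8 vector.
 */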
1601
1602
1603 /**
1604 * Bilinear interpolation.
1605 *
1606 * Value indices are in v_{yx}.
1607 */
1608 LLVMValueRef
1609 lp_build_lerp_2d(struct lp_build_context *bld,
1610 LLVMValueRef x,
1611 LLVMValueRef y,
1612 LLVMValueRef v00,
1613 LLVMValueRef v01,
1614 LLVMValueRef v10,
1615 LLVMValueRef v11,
1616 unsigned flags)
1617 {
1618 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1619 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1620 return lp_build_lerp(bld, y, v0, v1, flags);
1621 }
1622
1623
1624 LLVMValueRef
1625 lp_build_lerp_3d(struct lp_build_context *bld,
1626 LLVMValueRef x,
1627 LLVMValueRef y,
1628 LLVMValueRef z,
1629 LLVMValueRef v000,
1630 LLVMValueRef v001,
1631 LLVMValueRef v010,
1632 LLVMValueRef v011,
1633 LLVMValueRef v100,
1634 LLVMValueRef v101,
1635 LLVMValueRef v110,
1636 LLVMValueRef v111,
1637 unsigned flags)
1638 {
1639 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1640 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1641 return lp_build_lerp(bld, z, v0, v1, flags);
1642 }
1643
1644
1645 /**
1646 * Generate min(a, b)
1647 * Do checks for special cases but not for nans.
1648 */
1649 LLVMValueRef
1650 lp_build_min(struct lp_build_context *bld,
1651 LLVMValueRef a,
1652 LLVMValueRef b)
1653 {
1654 assert(lp_check_value(bld->type, a));
1655 assert(lp_check_value(bld->type, b));
1656
1657 if(a == bld->undef || b == bld->undef)
1658 return bld->undef;
1659
1660 if(a == b)
1661 return a;
1662
1663 if (bld->type.norm) {
1664 if (!bld->type.sign) {
1665 if (a == bld->zero || b == bld->zero) {
1666 return bld->zero;
1667 }
1668 }
1669 if(a == bld->one)
1670 return b;
1671 if(b == bld->one)
1672 return a;
1673 }
1674
1675 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1676 }
1677
1678
1679 /**
1680 * Generate min(a, b)
1681 * NaN's are handled according to the behavior specified by the
1682 * nan_behavior argument.
1683 */
1684 LLVMValueRef
1685 lp_build_min_ext(struct lp_build_context *bld,
1686 LLVMValueRef a,
1687 LLVMValueRef b,
1688 enum gallivm_nan_behavior nan_behavior)
1689 {
1690 assert(lp_check_value(bld->type, a));
1691 assert(lp_check_value(bld->type, b));
1692
1693 if(a == bld->undef || b == bld->undef)
1694 return bld->undef;
1695
1696 if(a == b)
1697 return a;
1698
1699 if (bld->type.norm) {
1700 if (!bld->type.sign) {
1701 if (a == bld->zero || b == bld->zero) {
1702 return bld->zero;
1703 }
1704 }
1705 if(a == bld->one)
1706 return b;
1707 if(b == bld->one)
1708 return a;
1709 }
1710
1711 return lp_build_min_simple(bld, a, b, nan_behavior);
1712 }
1713
1714 /**
1715 * Generate max(a, b)
1716 * Do checks for special cases, but NaN behavior is undefined.
1717 */
1718 LLVMValueRef
1719 lp_build_max(struct lp_build_context *bld,
1720 LLVMValueRef a,
1721 LLVMValueRef b)
1722 {
1723 assert(lp_check_value(bld->type, a));
1724 assert(lp_check_value(bld->type, b));
1725
1726 if(a == bld->undef || b == bld->undef)
1727 return bld->undef;
1728
1729 if(a == b)
1730 return a;
1731
1732 if(bld->type.norm) {
1733 if(a == bld->one || b == bld->one)
1734 return bld->one;
1735 if (!bld->type.sign) {
1736 if (a == bld->zero) {
1737 return b;
1738 }
1739 if (b == bld->zero) {
1740 return a;
1741 }
1742 }
1743 }
1744
1745 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1746 }
1747
1748
1749 /**
1750 * Generate max(a, b)
1751 * Checks for special cases.
1752 * NaN's are handled according to the behavior specified by the
1753 * nan_behavior argument.
1754 */
1755 LLVMValueRef
1756 lp_build_max_ext(struct lp_build_context *bld,
1757 LLVMValueRef a,
1758 LLVMValueRef b,
1759 enum gallivm_nan_behavior nan_behavior)
1760 {
1761 assert(lp_check_value(bld->type, a));
1762 assert(lp_check_value(bld->type, b));
1763
1764 if(a == bld->undef || b == bld->undef)
1765 return bld->undef;
1766
1767 if(a == b)
1768 return a;
1769
1770 if(bld->type.norm) {
1771 if(a == bld->one || b == bld->one)
1772 return bld->one;
1773 if (!bld->type.sign) {
1774 if (a == bld->zero) {
1775 return b;
1776 }
1777 if (b == bld->zero) {
1778 return a;
1779 }
1780 }
1781 }
1782
1783 return lp_build_max_simple(bld, a, b, nan_behavior);
1784 }
1785
1786 /**
1787 * Generate clamp(a, min, max)
1788 * NaN behavior (for any of a, min, max) is undefined.
1789 * Do checks for special cases.
1790 */
1791 LLVMValueRef
1792 lp_build_clamp(struct lp_build_context *bld,
1793 LLVMValueRef a,
1794 LLVMValueRef min,
1795 LLVMValueRef max)
1796 {
1797 assert(lp_check_value(bld->type, a));
1798 assert(lp_check_value(bld->type, min));
1799 assert(lp_check_value(bld->type, max));
1800
1801 a = lp_build_min(bld, a, max);
1802 a = lp_build_max(bld, a, min);
1803 return a;
1804 }
1805
1806
1807 /**
1808 * Generate clamp(a, 0, 1)
1809 * A NaN will get converted to zero.
1810 */
1811 LLVMValueRef
1812 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1813 LLVMValueRef a)
1814 {
1815 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1816 a = lp_build_min(bld, a, bld->one);
1817 return a;
1818 }
1819
1820
1821 /**
1822 * Generate abs(a)
1823 */
1824 LLVMValueRef
1825 lp_build_abs(struct lp_build_context *bld,
1826 LLVMValueRef a)
1827 {
1828 LLVMBuilderRef builder = bld->gallivm->builder;
1829 const struct lp_type type = bld->type;
1830 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1831
1832 assert(lp_check_value(type, a));
1833
1834 if(!type.sign)
1835 return a;
1836
1837 if(type.floating) {
1838 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1839 /* Workaround llvm.org/PR27332 */
1840 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1841 unsigned long long absMask = ~(1ULL << (type.width - 1));
1842 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1843 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1844 a = LLVMBuildAnd(builder, a, mask, "");
1845 a = LLVMBuildBitCast(builder, a, vec_type, "");
1846 return a;
1847 } else {
1848 char intrinsic[32];
1849 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1850 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1851 }
1852 }
1853
1854 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1855 switch(type.width) {
1856 case 8:
1857 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1858 case 16:
1859 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1860 case 32:
1861 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1862 }
1863 }
1864 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1865 switch(type.width) {
1866 case 8:
1867 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1868 case 16:
1869 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1870 case 32:
1871 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1872 }
1873 }
1874
1875 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1876 a, LLVMBuildNeg(builder, a, ""));
1877 }
1878
1879
1880 LLVMValueRef
1881 lp_build_negate(struct lp_build_context *bld,
1882 LLVMValueRef a)
1883 {
1884 LLVMBuilderRef builder = bld->gallivm->builder;
1885
1886 assert(lp_check_value(bld->type, a));
1887
1888 if (bld->type.floating)
1889 a = LLVMBuildFNeg(builder, a, "");
1890 else
1891 a = LLVMBuildNeg(builder, a, "");
1892
1893 return a;
1894 }
1895
1896
1897 /** Return -1, 0 or +1 depending on the sign of a */
1898 LLVMValueRef
1899 lp_build_sgn(struct lp_build_context *bld,
1900 LLVMValueRef a)
1901 {
1902 LLVMBuilderRef builder = bld->gallivm->builder;
1903 const struct lp_type type = bld->type;
1904 LLVMValueRef cond;
1905 LLVMValueRef res;
1906
1907 assert(lp_check_value(type, a));
1908
1909 /* Handle non-zero case */
1910 if(!type.sign) {
1911 /* if not zero then sign must be positive */
1912 res = bld->one;
1913 }
1914 else if(type.floating) {
1915 LLVMTypeRef vec_type;
1916 LLVMTypeRef int_type;
1917 LLVMValueRef mask;
1918 LLVMValueRef sign;
1919 LLVMValueRef one;
1920 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1921
1922 int_type = lp_build_int_vec_type(bld->gallivm, type);
1923 vec_type = lp_build_vec_type(bld->gallivm, type);
1924 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1925
1926 /* Take the sign bit and add it to 1 constant */
1927 sign = LLVMBuildBitCast(builder, a, int_type, "");
1928 sign = LLVMBuildAnd(builder, sign, mask, "");
1929 one = LLVMConstBitCast(bld->one, int_type);
1930 res = LLVMBuildOr(builder, sign, one, "");
1931 res = LLVMBuildBitCast(builder, res, vec_type, "");
1932 }
1933 else
1934 {
1935 /* signed int/norm/fixed point */
1936 /* could use psign with sse3 and appropriate vectors here */
1937 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1938 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1939 res = lp_build_select(bld, cond, bld->one, minus_one);
1940 }
1941
1942 /* Handle zero */
1943 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1944 res = lp_build_select(bld, cond, bld->zero, res);
1945
1946 return res;
1947 }
1948
1949
1950 /**
1951 * Set the sign of float vector 'a' according to 'sign'.
1952 * If sign==0, return abs(a).
1953 * If sign==1, return -abs(a);
1954 * Other values for sign produce undefined results.
1955 */
1956 LLVMValueRef
1957 lp_build_set_sign(struct lp_build_context *bld,
1958 LLVMValueRef a, LLVMValueRef sign)
1959 {
1960 LLVMBuilderRef builder = bld->gallivm->builder;
1961 const struct lp_type type = bld->type;
1962 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1963 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1964 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1965 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1966 ~((unsigned long long) 1 << (type.width - 1)));
1967 LLVMValueRef val, res;
1968
1969 assert(type.floating);
1970 assert(lp_check_value(type, a));
1971
1972 /* val = reinterpret_cast<int>(a) */
1973 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1974 /* val = val & mask */
1975 val = LLVMBuildAnd(builder, val, mask, "");
1976 /* sign = sign << shift */
1977 sign = LLVMBuildShl(builder, sign, shift, "");
1978 /* res = val | sign */
1979 res = LLVMBuildOr(builder, val, sign, "");
1980 /* res = reinterpret_cast<float>(res) */
1981 res = LLVMBuildBitCast(builder, res, vec_type, "");
1982
1983 return res;
1984 }
1985
1986
1987 /**
1988 * Convert vector of (or scalar) int to vector of (or scalar) float.
1989 */
1990 LLVMValueRef
1991 lp_build_int_to_float(struct lp_build_context *bld,
1992 LLVMValueRef a)
1993 {
1994 LLVMBuilderRef builder = bld->gallivm->builder;
1995 const struct lp_type type = bld->type;
1996 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1997
1998 assert(type.floating);
1999
2000 return LLVMBuildSIToFP(builder, a, vec_type, "");
2001 }
2002
2003 static boolean
2004 arch_rounding_available(const struct lp_type type)
2005 {
2006 if ((util_cpu_caps.has_sse4_1 &&
2007 (type.length == 1 || type.width*type.length == 128)) ||
2008 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
2009 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
2010 return TRUE;
2011 else if ((util_cpu_caps.has_altivec &&
2012 (type.width == 32 && type.length == 4)))
2013 return TRUE;
2014 else if (util_cpu_caps.has_neon)
2015 return TRUE;
2016
2017 return FALSE;
2018 }
2019
2020 enum lp_build_round_mode
2021 {
2022 LP_BUILD_ROUND_NEAREST = 0,
2023 LP_BUILD_ROUND_FLOOR = 1,
2024 LP_BUILD_ROUND_CEIL = 2,
2025 LP_BUILD_ROUND_TRUNCATE = 3
2026 };
2027
2028 static inline LLVMValueRef
2029 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2030 LLVMValueRef a)
2031 {
2032 LLVMBuilderRef builder = bld->gallivm->builder;
2033 const struct lp_type type = bld->type;
2034 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2035 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2036 const char *intrinsic;
2037 LLVMValueRef res;
2038
2039 assert(type.floating);
2040 /* using the double precision conversions is a bit more complicated */
2041 assert(type.width == 32);
2042
2043 assert(lp_check_value(type, a));
2044 assert(util_cpu_caps.has_sse2);
2045
2046 /* This is relying on MXCSR rounding mode, which should always be nearest. */
2047 if (type.length == 1) {
2048 LLVMTypeRef vec_type;
2049 LLVMValueRef undef;
2050 LLVMValueRef arg;
2051 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2052
2053 vec_type = LLVMVectorType(bld->elem_type, 4);
2054
2055 intrinsic = "llvm.x86.sse.cvtss2si";
2056
2057 undef = LLVMGetUndef(vec_type);
2058
2059 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2060
2061 res = lp_build_intrinsic_unary(builder, intrinsic,
2062 ret_type, arg);
2063 }
2064 else {
2065 if (type.width* type.length == 128) {
2066 intrinsic = "llvm.x86.sse2.cvtps2dq";
2067 }
2068 else {
2069 assert(type.width*type.length == 256);
2070 assert(util_cpu_caps.has_avx);
2071
2072 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2073 }
2074 res = lp_build_intrinsic_unary(builder, intrinsic,
2075 ret_type, a);
2076 }
2077
2078 return res;
2079 }
2080
2081
2082 /* Round to an integral float (vector) using AltiVec vrfi* intrinsics,
2083  * according to the given rounding mode. */
2084 static inline LLVMValueRef
2085 lp_build_round_altivec(struct lp_build_context *bld,
2086 LLVMValueRef a,
2087 enum lp_build_round_mode mode)
2088 {
2089 LLVMBuilderRef builder = bld->gallivm->builder;
2090 const struct lp_type type = bld->type;
2091 const char *intrinsic = NULL;
2092
2093 assert(type.floating);
2094
2095 assert(lp_check_value(type, a));
2096 assert(util_cpu_caps.has_altivec);
2097
2098 (void)type;
2099
2100 switch (mode) {
2101 case LP_BUILD_ROUND_NEAREST:
2102 intrinsic = "llvm.ppc.altivec.vrfin";
2103 break;
2104 case LP_BUILD_ROUND_FLOOR:
2105 intrinsic = "llvm.ppc.altivec.vrfim";
2106 break;
2107 case LP_BUILD_ROUND_CEIL:
2108 intrinsic = "llvm.ppc.altivec.vrfip";
2109 break;
2110 case LP_BUILD_ROUND_TRUNCATE:
2111 intrinsic = "llvm.ppc.altivec.vrfiz";
2112 break;
2113 }
2114
2115 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2116 }
2117
2118 static inline LLVMValueRef
2119 lp_build_round_arch(struct lp_build_context *bld,
2120 LLVMValueRef a,
2121 enum lp_build_round_mode mode)
2122 {
2123 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2124 LLVMBuilderRef builder = bld->gallivm->builder;
2125 const struct lp_type type = bld->type;
2126 const char *intrinsic_root;
2127 char intrinsic[32];
2128
2129 assert(type.floating);
2130 assert(lp_check_value(type, a));
2131 (void)type;
2132
2133 switch (mode) {
2134 case LP_BUILD_ROUND_NEAREST:
2135 intrinsic_root = "llvm.nearbyint";
2136 break;
2137 case LP_BUILD_ROUND_FLOOR:
2138 intrinsic_root = "llvm.floor";
2139 break;
2140 case LP_BUILD_ROUND_CEIL:
2141 intrinsic_root = "llvm.ceil";
2142 break;
2143 case LP_BUILD_ROUND_TRUNCATE:
2144 intrinsic_root = "llvm.trunc";
2145 break;
2146 }
2147
2148 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2149 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2150 }
2151 else /* (util_cpu_caps.has_altivec) */
2152 return lp_build_round_altivec(bld, a, mode);
2153 }
2154
2155 /**
2156 * Return the integer part of a float (vector) value (== round toward zero).
2157 * The returned value is a float (vector).
2158 * Ex: trunc(-1.5) = -1.0
2159 */
2160 LLVMValueRef
2161 lp_build_trunc(struct lp_build_context *bld,
2162 LLVMValueRef a)
2163 {
2164 LLVMBuilderRef builder = bld->gallivm->builder;
2165 const struct lp_type type = bld->type;
2166
2167 assert(type.floating);
2168 assert(lp_check_value(type, a));
2169
2170 if (arch_rounding_available(type)) {
2171 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2172 }
2173 else {
2174 const struct lp_type type = bld->type;
2175 struct lp_type inttype;
2176 struct lp_build_context intbld;
2177 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2178 LLVMValueRef trunc, res, anosign, mask;
2179 LLVMTypeRef int_vec_type = bld->int_vec_type;
2180 LLVMTypeRef vec_type = bld->vec_type;
2181
2182 assert(type.width == 32); /* might want to handle doubles at some point */
2183
2184 inttype = type;
2185 inttype.floating = 0;
2186 lp_build_context_init(&intbld, bld->gallivm, inttype);
2187
2188 /* round by truncation */
2189 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2190 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2191
2192 /* mask out sign bit */
2193 anosign = lp_build_abs(bld, a);
2194 /*
2195 * mask out all values if anosign > 2^24
2196 * This should work both for large ints (all rounding is no-op for them
2197 * because such floats are always exact) as well as special cases like
2198 * NaNs, Infs (taking advantage of the fact they use max exponent).
2199      * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2200 */
2201 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2202 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2203 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2204 return lp_build_select(bld, mask, a, res);
2205 }
2206 }
2207
2208
2209 /**
2210 * Return float (vector) rounded to nearest integer (vector). The returned
2211 * value is a float (vector).
2212 * Ex: round(0.9) = 1.0
2213 * Ex: round(-1.5) = -2.0
2214 */
2215 LLVMValueRef
2216 lp_build_round(struct lp_build_context *bld,
2217 LLVMValueRef a)
2218 {
2219 LLVMBuilderRef builder = bld->gallivm->builder;
2220 const struct lp_type type = bld->type;
2221
2222 assert(type.floating);
2223 assert(lp_check_value(type, a));
2224
2225 if (arch_rounding_available(type)) {
2226 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2227 }
2228 else {
2229 const struct lp_type type = bld->type;
2230 struct lp_type inttype;
2231 struct lp_build_context intbld;
2232 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2233 LLVMValueRef res, anosign, mask;
2234 LLVMTypeRef int_vec_type = bld->int_vec_type;
2235 LLVMTypeRef vec_type = bld->vec_type;
2236
2237 assert(type.width == 32); /* might want to handle doubles at some point */
2238
2239 inttype = type;
2240 inttype.floating = 0;
2241 lp_build_context_init(&intbld, bld->gallivm, inttype);
2242
2243 res = lp_build_iround(bld, a);
2244 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2245
2246 /* mask out sign bit */
2247 anosign = lp_build_abs(bld, a);
2248 /*
2249 * mask out all values if anosign > 2^24
2250 * This should work both for large ints (all rounding is no-op for them
2251 * because such floats are always exact) as well as special cases like
2252 * NaNs, Infs (taking advantage of the fact they use max exponent).
2253      * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2254 */
2255 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2256 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2257 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2258 return lp_build_select(bld, mask, a, res);
2259 }
2260 }
2261
2262
2263 /**
2264 * Return floor of float (vector), result is a float (vector)
2265 * Ex: floor(1.1) = 1.0
2266 * Ex: floor(-1.1) = -2.0
2267 */
2268 LLVMValueRef
2269 lp_build_floor(struct lp_build_context *bld,
2270 LLVMValueRef a)
2271 {
2272 LLVMBuilderRef builder = bld->gallivm->builder;
2273 const struct lp_type type = bld->type;
2274
2275 assert(type.floating);
2276 assert(lp_check_value(type, a));
2277
2278 if (arch_rounding_available(type)) {
2279 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2280 }
2281 else {
2282 const struct lp_type type = bld->type;
2283 struct lp_type inttype;
2284 struct lp_build_context intbld;
2285 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2286 LLVMValueRef trunc, res, anosign, mask;
2287 LLVMTypeRef int_vec_type = bld->int_vec_type;
2288 LLVMTypeRef vec_type = bld->vec_type;
2289
2290 if (type.width != 32) {
2291 char intrinsic[32];
2292 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2293 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2294 }
2295
2296 assert(type.width == 32); /* might want to handle doubles at some point */
2297
2298 inttype = type;
2299 inttype.floating = 0;
2300 lp_build_context_init(&intbld, bld->gallivm, inttype);
2301
2302 /* round by truncation */
2303 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2304 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2305
2306 if (type.sign) {
2307 LLVMValueRef tmp;
2308
2309 /*
2310 * fix values if rounding is wrong (for non-special cases)
2311 * - this is the case if trunc > a
2312 */
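         /* e.g. a = -1.3: res = -1.0 > -1.3, so 1.0 is subtracted, giving
          * floor(-1.3) = -2.0; non-negative inputs are left unchanged. */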
2313 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2314 /* tmp = trunc > a ? 1.0 : 0.0 */
2315 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2316 tmp = lp_build_and(&intbld, mask, tmp);
2317 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2318 res = lp_build_sub(bld, res, tmp);
2319 }
2320
2321 /* mask out sign bit */
2322 anosign = lp_build_abs(bld, a);
2323 /*
2324 * mask out all values if anosign > 2^24
2325 * This should work both for large ints (all rounding is no-op for them
2326 * because such floats are always exact) as well as special cases like
2327 * NaNs, Infs (taking advantage of the fact they use max exponent).
2328      * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2329 */
2330 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2331 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2332 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2333 return lp_build_select(bld, mask, a, res);
2334 }
2335 }
2336
2337
2338 /**
2339 * Return ceiling of float (vector), returning float (vector).
2340 * Ex: ceil( 1.1) = 2.0
2341 * Ex: ceil(-1.1) = -1.0
2342 */
2343 LLVMValueRef
2344 lp_build_ceil(struct lp_build_context *bld,
2345 LLVMValueRef a)
2346 {
2347 LLVMBuilderRef builder = bld->gallivm->builder;
2348 const struct lp_type type = bld->type;
2349
2350 assert(type.floating);
2351 assert(lp_check_value(type, a));
2352
2353 if (arch_rounding_available(type)) {
2354 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2355 }
2356 else {
2357 const struct lp_type type = bld->type;
2358 struct lp_type inttype;
2359 struct lp_build_context intbld;
2360 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2361 LLVMValueRef trunc, res, anosign, mask, tmp;
2362 LLVMTypeRef int_vec_type = bld->int_vec_type;
2363 LLVMTypeRef vec_type = bld->vec_type;
2364
2365 if (type.width != 32) {
2366 char intrinsic[32];
2367 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2368 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2369 }
2370
2371 assert(type.width == 32); /* might want to handle doubles at some point */
2372
2373 inttype = type;
2374 inttype.floating = 0;
2375 lp_build_context_init(&intbld, bld->gallivm, inttype);
2376
2377 /* round by truncation */
2378 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2379 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2380
2381 /*
2382 * fix values if rounding is wrong (for non-special cases)
2383 * - this is the case if trunc < a
2384 */
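      /* e.g. a = 1.3: trunc = 1.0 < 1.3, so 1.0 is added, giving ceil(1.3) = 2.0 */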
2385 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2386 /* tmp = trunc < a ? 1.0 : 0.0 */
2387 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2388 tmp = lp_build_and(&intbld, mask, tmp);
2389 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2390 res = lp_build_add(bld, trunc, tmp);
2391
2392 /* mask out sign bit */
2393 anosign = lp_build_abs(bld, a);
2394 /*
2395 * mask out all values if anosign > 2^24
2396 * This should work both for large ints (all rounding is no-op for them
2397 * because such floats are always exact) as well as special cases like
2398 * NaNs, Infs (taking advantage of the fact they use max exponent).
2399      * (2^24 is arbitrary, anything between 2^24 and 2^31 should work.)
2400 */
2401 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2402 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2403 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2404 return lp_build_select(bld, mask, a, res);
2405 }
2406 }
2407
2408
2409 /**
2410 * Return fractional part of 'a' computed as a - floor(a)
2411 * Typically used in texture coord arithmetic.
2412 */
2413 LLVMValueRef
2414 lp_build_fract(struct lp_build_context *bld,
2415 LLVMValueRef a)
2416 {
2417 assert(bld->type.floating);
2418 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2419 }
2420
2421
2422 /**
2423 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2424 * against 0.99999(9). (Will also return that value for NaNs.)
2425 */
2426 static inline LLVMValueRef
2427 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2428 {
2429 LLVMValueRef max;
2430
2431 /* this is the largest number smaller than 1.0 representable as float */
2432 max = lp_build_const_vec(bld->gallivm, bld->type,
2433 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
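   /* e.g. for 32-bit floats this is 1.0 - 2^-24 = 0.99999994 */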
2434 return lp_build_min_ext(bld, fract, max,
2435 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2436 }
2437
2438
2439 /**
2440 * Same as lp_build_fract, but guarantees that the result is always smaller
2441 * than one. Will also return the smaller-than-one value for infs, NaNs.
2442 */
2443 LLVMValueRef
2444 lp_build_fract_safe(struct lp_build_context *bld,
2445 LLVMValueRef a)
2446 {
2447 return clamp_fract(bld, lp_build_fract(bld, a));
2448 }
2449
2450
2451 /**
2452 * Return the integer part of a float (vector) value (== round toward zero).
2453 * The returned value is an integer (vector).
2454 * Ex: itrunc(-1.5) = -1
2455 */
2456 LLVMValueRef
2457 lp_build_itrunc(struct lp_build_context *bld,
2458 LLVMValueRef a)
2459 {
2460 LLVMBuilderRef builder = bld->gallivm->builder;
2461 const struct lp_type type = bld->type;
2462 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2463
2464 assert(type.floating);
2465 assert(lp_check_value(type, a));
2466
2467 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2468 }
2469
2470
2471 /**
2472 * Return float (vector) rounded to nearest integer (vector). The returned
2473 * value is an integer (vector).
2474 * Ex: iround(0.9) = 1
2475 * Ex: iround(-1.5) = -2
2476 */
2477 LLVMValueRef
2478 lp_build_iround(struct lp_build_context *bld,
2479 LLVMValueRef a)
2480 {
2481 LLVMBuilderRef builder = bld->gallivm->builder;
2482 const struct lp_type type = bld->type;
2483 LLVMTypeRef int_vec_type = bld->int_vec_type;
2484 LLVMValueRef res;
2485
2486 assert(type.floating);
2487
2488 assert(lp_check_value(type, a));
2489
2490 if ((util_cpu_caps.has_sse2 &&
2491 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2492 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2493 return lp_build_iround_nearest_sse2(bld, a);
2494 }
2495 if (arch_rounding_available(type)) {
2496 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2497 }
2498 else {
2499 LLVMValueRef half;
2500
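      /*
       * The bias is the largest float below 0.5: adding exactly 0.5 to a value
       * just under 0.5 (e.g. 0.49999997) would round up to 1.0 and give an
       * off-by-one result after the truncation below.
       */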
2501 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2502
2503 if (type.sign) {
2504 LLVMTypeRef vec_type = bld->vec_type;
2505 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2506 (unsigned long long)1 << (type.width - 1));
2507 LLVMValueRef sign;
2508
2509 /* get sign bit */
2510 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2511 sign = LLVMBuildAnd(builder, sign, mask, "");
2512
2513 /* sign * 0.5 */
2514 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2515 half = LLVMBuildOr(builder, sign, half, "");
2516 half = LLVMBuildBitCast(builder, half, vec_type, "");
2517 }
2518
2519 res = LLVMBuildFAdd(builder, a, half, "");
2520 }
2521
2522 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2523
2524 return res;
2525 }
2526
2527
2528 /**
2529 * Return floor of float (vector), result is an int (vector)
2530  * Ex: ifloor(1.1) = 1
2531  * Ex: ifloor(-1.1) = -2
2532 */
2533 LLVMValueRef
2534 lp_build_ifloor(struct lp_build_context *bld,
2535 LLVMValueRef a)
2536 {
2537 LLVMBuilderRef builder = bld->gallivm->builder;
2538 const struct lp_type type = bld->type;
2539 LLVMTypeRef int_vec_type = bld->int_vec_type;
2540 LLVMValueRef res;
2541
2542 assert(type.floating);
2543 assert(lp_check_value(type, a));
2544
2545 res = a;
2546 if (type.sign) {
2547 if (arch_rounding_available(type)) {
2548 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2549 }
2550 else {
2551 struct lp_type inttype;
2552 struct lp_build_context intbld;
2553 LLVMValueRef trunc, itrunc, mask;
2554
2555 assert(type.floating);
2556 assert(lp_check_value(type, a));
2557
2558 inttype = type;
2559 inttype.floating = 0;
2560 lp_build_context_init(&intbld, bld->gallivm, inttype);
2561
2562 /* round by truncation */
2563 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2564 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2565
2566 /*
2567 * fix values if rounding is wrong (for non-special cases)
2568 * - this is the case if trunc > a
2569 * The results of doing this with NaNs, very large values etc.
2570       * are undefined, but they are undefined for such inputs anyway.
2571 */
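         /* e.g. a = -1.3: itrunc = -1, mask = ~0 (i.e. -1), so the add gives -2 */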
2572 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2573 /* cheapie minus one with mask since the mask is minus one / zero */
2574 return lp_build_add(&intbld, itrunc, mask);
2575 }
2576 }
2577
2578    /* convert to integer (toward zero) */
2579 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2580
2581 return res;
2582 }
2583
2584
2585 /**
2586 * Return ceiling of float (vector), returning int (vector).
2587 * Ex: iceil( 1.1) = 2
2588 * Ex: iceil(-1.1) = -1
2589 */
2590 LLVMValueRef
2591 lp_build_iceil(struct lp_build_context *bld,
2592 LLVMValueRef a)
2593 {
2594 LLVMBuilderRef builder = bld->gallivm->builder;
2595 const struct lp_type type = bld->type;
2596 LLVMTypeRef int_vec_type = bld->int_vec_type;
2597 LLVMValueRef res;
2598
2599 assert(type.floating);
2600 assert(lp_check_value(type, a));
2601
2602 if (arch_rounding_available(type)) {
2603 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2604 }
2605 else {
2606 struct lp_type inttype;
2607 struct lp_build_context intbld;
2608 LLVMValueRef trunc, itrunc, mask;
2609
2610 assert(type.floating);
2611 assert(lp_check_value(type, a));
2612
2613 inttype = type;
2614 inttype.floating = 0;
2615 lp_build_context_init(&intbld, bld->gallivm, inttype);
2616
2617 /* round by truncation */
2618 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2619 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2620
2621 /*
2622 * fix values if rounding is wrong (for non-special cases)
2623 * - this is the case if trunc < a
2624 * The results of doing this with NaNs, very large values etc.
2625       * are undefined, but they are undefined for such inputs anyway.
2626 */
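      /* e.g. a = 1.3: itrunc = 1, mask = ~0 (i.e. -1), so 1 - (-1) = 2 */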
2627 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2628 /* cheapie plus one with mask since the mask is minus one / zero */
2629 return lp_build_sub(&intbld, itrunc, mask);
2630 }
2631
2632    /* convert to integer (toward zero) */
2633 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2634
2635 return res;
2636 }
2637
2638
2639 /**
2640 * Combined ifloor() & fract().
2641 *
2642 * Preferred to calling the functions separately, as it will ensure that the
2643 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2644 */
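/*
 * Usage sketch (names are illustrative only), e.g. for splitting a texture
 * coordinate into a texel index and a filter weight:
 *
 *    LLVMValueRef itexel, weight;
 *    lp_build_ifloor_fract(bld, coord, &itexel, &weight);
 *
 * itexel is an int vector, weight a float vector in [0, 1] (use the _safe
 * variant below if it must stay strictly below one).
 */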
2645 void
2646 lp_build_ifloor_fract(struct lp_build_context *bld,
2647 LLVMValueRef a,
2648 LLVMValueRef *out_ipart,
2649 LLVMValueRef *out_fpart)
2650 {
2651 LLVMBuilderRef builder = bld->gallivm->builder;
2652 const struct lp_type type = bld->type;
2653 LLVMValueRef ipart;
2654
2655 assert(type.floating);
2656 assert(lp_check_value(type, a));
2657
2658 if (arch_rounding_available(type)) {
2659 /*
2660 * floor() is easier.
2661 */
2662
2663 ipart = lp_build_floor(bld, a);
2664 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2665 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2666 }
2667 else {
2668 /*
2669 * ifloor() is easier.
2670 */
2671
2672 *out_ipart = lp_build_ifloor(bld, a);
2673 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2674 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2675 }
2676 }
2677
2678
2679 /**
2680 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2681 * always smaller than one.
2682 */
2683 void
2684 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2685 LLVMValueRef a,
2686 LLVMValueRef *out_ipart,
2687 LLVMValueRef *out_fpart)
2688 {
2689 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2690 *out_fpart = clamp_fract(bld, *out_fpart);
2691 }
2692
2693
2694 LLVMValueRef
2695 lp_build_sqrt(struct lp_build_context *bld,
2696 LLVMValueRef a)
2697 {
2698 LLVMBuilderRef builder = bld->gallivm->builder;
2699 const struct lp_type type = bld->type;
2700 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2701 char intrinsic[32];
2702
2703 assert(lp_check_value(type, a));
2704
2705 assert(type.floating);
2706 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2707
2708 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2709 }
2710
2711
2712 /**
2713  * Do one Newton-Raphson step to improve the reciprocal's precision:
2714 *
2715 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2716 *
2717 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2718 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2719 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2720 * halo. It would be necessary to clamp the argument to prevent this.
2721 *
2722 * See also:
2723 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2724 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2725 */
2726 static inline LLVMValueRef
2727 lp_build_rcp_refine(struct lp_build_context *bld,
2728 LLVMValueRef a,
2729 LLVMValueRef rcp_a)
2730 {
2731 LLVMBuilderRef builder = bld->gallivm->builder;
2732 LLVMValueRef neg_a;
2733 LLVMValueRef res;
2734
2735 neg_a = LLVMBuildFNeg(builder, a, "");
2736 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2737 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
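   /* i.e. res = rcp_a + rcp_a * (1 - a * rcp_a), expressed as two fused multiply-adds */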
2738
2739 return res;
2740 }
2741
2742
2743 LLVMValueRef
2744 lp_build_rcp(struct lp_build_context *bld,
2745 LLVMValueRef a)
2746 {
2747 LLVMBuilderRef builder = bld->gallivm->builder;
2748 const struct lp_type type = bld->type;
2749
2750 assert(lp_check_value(type, a));
2751
2752 if(a == bld->zero)
2753 return bld->undef;
2754 if(a == bld->one)
2755 return bld->one;
2756 if(a == bld->undef)
2757 return bld->undef;
2758
2759 assert(type.floating);
2760
2761 if(LLVMIsConstant(a))
2762 return LLVMConstFDiv(bld->one, a);
2763
2764 /*
2765 * We don't use RCPPS because:
2766     * - it only has 10 bits of precision
2767     * - it doesn't even get the reciprocal of 1.0 exactly
2768     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2769     * - for recent processors the benefit over DIVPS is marginal, and case
2770     *   dependent
2771     *
2772     * We could still use it on certain processors if benchmarks show that
2773     * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
2774     * particular uses that require fewer workarounds.
2775 */
2776
2777 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2778 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2779 const unsigned num_iterations = 0;
2780 LLVMValueRef res;
2781 unsigned i;
2782 const char *intrinsic = NULL;
2783
2784 if (type.length == 4) {
2785 intrinsic = "llvm.x86.sse.rcp.ps";
2786 }
2787 else {
2788 intrinsic = "llvm.x86.avx.rcp.ps.256";
2789 }
2790
2791 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2792
2793 for (i = 0; i < num_iterations; ++i) {
2794 res = lp_build_rcp_refine(bld, a, res);
2795 }
2796
2797 return res;
2798 }
2799
2800 return LLVMBuildFDiv(builder, bld->one, a, "");
2801 }
2802
2803
2804 /**
2805 * Do one Newton-Raphson step to improve rsqrt precision:
2806 *
2807 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2808 *
2809 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2810 */
2811 static inline LLVMValueRef
2812 lp_build_rsqrt_refine(struct lp_build_context *bld,
2813 LLVMValueRef a,
2814 LLVMValueRef rsqrt_a)
2815 {
2816 LLVMBuilderRef builder = bld->gallivm->builder;
2817 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2818 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2819 LLVMValueRef res;
2820
2821 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2822 res = LLVMBuildFMul(builder, a, res, "");
2823 res = LLVMBuildFSub(builder, three, res, "");
2824 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2825 res = LLVMBuildFMul(builder, half, res, "");
2826
2827 return res;
2828 }
2829
2830
2831 /**
2832 * Generate 1/sqrt(a).
2833 * Result is undefined for values < 0, infinity for +0.
2834 */
2835 LLVMValueRef
2836 lp_build_rsqrt(struct lp_build_context *bld,
2837 LLVMValueRef a)
2838 {
2839 const struct lp_type type = bld->type;
2840
2841 assert(lp_check_value(type, a));
2842
2843 assert(type.floating);
2844
2845 /*
2846 * This should be faster but all denormals will end up as infinity.
2847 */
2848 if (0 && lp_build_fast_rsqrt_available(type)) {
2849 const unsigned num_iterations = 1;
2850 LLVMValueRef res;
2851 unsigned i;
2852
2853 /* rsqrt(1.0) != 1.0 here */
2854 res = lp_build_fast_rsqrt(bld, a);
2855
2856 if (num_iterations) {
2857 /*
2858 * Newton-Raphson will result in NaN instead of infinity for zero,
2859 * and NaN instead of zero for infinity.
2860 * Also, need to ensure rsqrt(1.0) == 1.0.
2861 * All numbers smaller than FLT_MIN will result in +infinity
2862 * (rsqrtps treats all denormals as zero).
2863 */
2864 LLVMValueRef cmp;
2865 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2866 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2867
2868 for (i = 0; i < num_iterations; ++i) {
2869 res = lp_build_rsqrt_refine(bld, a, res);
2870 }
2871 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2872 res = lp_build_select(bld, cmp, inf, res);
2873 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2874 res = lp_build_select(bld, cmp, bld->zero, res);
2875 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2876 res = lp_build_select(bld, cmp, bld->one, res);
2877 }
2878
2879 return res;
2880 }
2881
2882 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2883 }
2884
2885 /**
2886  * Report whether a fast (inaccurate) rsqrt instruction is available.
2887  * Callers may want to avoid rsqrt_fast when it isn't: e.g. x^0.5 can be
2888  * computed as rsqrt_fast(x) * x, but without hardware support that turns
2889  * into sqrt/div/mul, so it is clearly better to just call sqrt and skip
2890  * both the div and the mul.
2891 */
2892 boolean
2893 lp_build_fast_rsqrt_available(struct lp_type type)
2894 {
2895 assert(type.floating);
2896
2897 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2898 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2899 return true;
2900 }
2901 return false;
2902 }
2903
2904
2905 /**
2906 * Generate 1/sqrt(a).
2907 * Result is undefined for values < 0, infinity for +0.
2908 * Precision is limited, only ~10 bits guaranteed
2909 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2910 */
2911 LLVMValueRef
2912 lp_build_fast_rsqrt(struct lp_build_context *bld,
2913 LLVMValueRef a)
2914 {
2915 LLVMBuilderRef builder = bld->gallivm->builder;
2916 const struct lp_type type = bld->type;
2917
2918 assert(lp_check_value(type, a));
2919
2920 if (lp_build_fast_rsqrt_available(type)) {
2921 const char *intrinsic = NULL;
2922
2923 if (type.length == 4) {
2924 intrinsic = "llvm.x86.sse.rsqrt.ps";
2925 }
2926       /* Take the sign bit and or it into the constant 1.0, giving +/-1.0 */
2927 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2928 }
2929 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2930 }
2931 else {
2932 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2933 }
2934 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2935 }
2936
2937
2938 /**
2939 * Generate sin(a) or cos(a) using polynomial approximation.
2940  * TODO: it might be worth recognizing sin and cos sharing the same source
2941  * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2942  * way cheaper than calculating (nearly) everything twice...
2943  * Not sure it's common enough to be worth bothering with, however; the
2944  * scs opcode could also benefit from calculating both.
2945 */
2946 static LLVMValueRef
2947 lp_build_sin_or_cos(struct lp_build_context *bld,
2948 LLVMValueRef a,
2949 boolean cos)
2950 {
2951 struct gallivm_state *gallivm = bld->gallivm;
2952 LLVMBuilderRef b = gallivm->builder;
2953 struct lp_type int_type = lp_int_type(bld->type);
2954
2955 /*
2956 * take the absolute value,
2957 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2958 */
2959
2960 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2961 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2962
2963 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2964 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2965
2966 /*
2967 * scale by 4/Pi
2968 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2969 */
2970
2971 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2972 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2973
2974 /*
2975 * store the integer part of y in mm0
2976 * emm2 = _mm_cvttps_epi32(y);
2977 */
2978
2979 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2980
2981 /*
2982 * j=(j+1) & (~1) (see the cephes sources)
2983 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2984 */
2985
2986 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2987 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2988 /*
2989 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2990 */
2991 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2992 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2993
2994 /*
2995 * y = _mm_cvtepi32_ps(emm2);
2996 */
2997 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2998
2999 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
3000 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
3001 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
3002 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
3003
3004 /*
3005 * Argument used for poly selection and sign bit determination
3006 * is different for sin vs. cos.
3007 */
3008 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
3009 emm2_and;
3010
3011 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
3012 LLVMBuildNot(b, emm2_2, ""), ""),
3013 const_29, "sign_bit") :
3014 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
3015 LLVMBuildShl(b, emm2_add,
3016 const_29, ""), ""),
3017 sign_mask, "sign_bit");
3018
3019 /*
3020     * get the polynomial selection mask
3021     * there is one polynomial for 0 <= x <= Pi/4
3022     * and another one for Pi/4 < x <= Pi/2
3023 * Both branches will be computed.
3024 *
3025 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3026 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3027 */
3028
3029 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3030 LLVMValueRef poly_mask = lp_build_compare(gallivm,
3031 int_type, PIPE_FUNC_EQUAL,
3032 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3033
3034 /*
3035 * _PS_CONST(minus_cephes_DP1, -0.78515625);
3036 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3037 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3038 */
3039 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3040 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3041 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3042
3043 /*
3044 * The magic pass: "Extended precision modular arithmetic"
3045 * x = ((x - y * DP1) - y * DP2) - y * DP3;
3046 */
3047 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3048 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3049 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3050
3051 /*
3052     * Evaluate the first polynomial (0 <= x <= Pi/4)
3053 *
3054 * z = _mm_mul_ps(x,x);
3055 */
3056 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3057
3058 /*
3059 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3060 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3061 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3062 */
3063 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3064 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3065 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3066
3067 /*
3068 * y = *(v4sf*)_ps_coscof_p0;
3069 * y = _mm_mul_ps(y, z);
3070 */
3071 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3072 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3073 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3074 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3075
3076
3077 /*
3078 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3079 * y = _mm_sub_ps(y, tmp);
3080 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3081 */
3082 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3083 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3084 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3085 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3086 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3087
3088 /*
3089 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3090 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3091 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3092 */
3093 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3094 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3095 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3096
3097 /*
3098     * Evaluate the second polynomial (Pi/4 <= x <= 0)
3099 *
3100 * y2 = *(v4sf*)_ps_sincof_p0;
3101 * y2 = _mm_mul_ps(y2, z);
3102 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3103 * y2 = _mm_mul_ps(y2, z);
3104 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3105 * y2 = _mm_mul_ps(y2, z);
3106 * y2 = _mm_mul_ps(y2, x);
3107 * y2 = _mm_add_ps(y2, x);
3108 */
3109
3110 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3111 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3112 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3113 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3114
3115 /*
3116     * select the correct result from the two polynomials
3117 * xmm3 = poly_mask;
3118 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3119 * y = _mm_andnot_ps(xmm3, y);
3120 * y = _mm_or_ps(y,y2);
3121 */
3122 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3123 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3124 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3125 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3126 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3127 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3128
3129 /*
3130 * update the sign
3131 * y = _mm_xor_ps(y, sign_bit);
3132 */
3133 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3134 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3135
3136 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3137
3138 /* clamp output to be within [-1, 1] */
3139 y_result = lp_build_clamp(bld, y_result,
3140 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3141 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3142 /* If a is -inf, inf or NaN then return NaN */
3143 y_result = lp_build_select(bld, isfinite, y_result,
3144 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3145 return y_result;
3146 }
3147
3148
3149 /**
3150 * Generate sin(a)
3151 */
3152 LLVMValueRef
3153 lp_build_sin(struct lp_build_context *bld,
3154 LLVMValueRef a)
3155 {
3156 return lp_build_sin_or_cos(bld, a, FALSE);
3157 }
3158
3159
3160 /**
3161 * Generate cos(a)
3162 */
3163 LLVMValueRef
3164 lp_build_cos(struct lp_build_context *bld,
3165 LLVMValueRef a)
3166 {
3167 return lp_build_sin_or_cos(bld, a, TRUE);
3168 }
3169
3170
3171 /**
3172 * Generate pow(x, y)
3173 */
3174 LLVMValueRef
3175 lp_build_pow(struct lp_build_context *bld,
3176 LLVMValueRef x,
3177 LLVMValueRef y)
3178 {
3179 /* TODO: optimize the constant case */
3180 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3181 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3182 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3183 __FUNCTION__);
3184 }
3185
3186 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3187 }
3188
3189
3190 /**
3191 * Generate exp(x)
3192 */
3193 LLVMValueRef
3194 lp_build_exp(struct lp_build_context *bld,
3195 LLVMValueRef x)
3196 {
3197 /* log2(e) = 1/log(2) */
3198 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3199 1.4426950408889634);
3200
3201 assert(lp_check_value(bld->type, x));
3202
3203 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3204 }
3205
3206
3207 /**
3208 * Generate log(x)
3209 * Behavior is undefined with infs, 0s and nans
3210 */
3211 LLVMValueRef
3212 lp_build_log(struct lp_build_context *bld,
3213 LLVMValueRef x)
3214 {
3215 /* log(2) */
3216 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3217 0.69314718055994529);
3218
3219 assert(lp_check_value(bld->type, x));
3220
3221 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3222 }
3223
3224 /**
3225 * Generate log(x) that handles edge cases (infs, 0s and nans)
3226 */
3227 LLVMValueRef
3228 lp_build_log_safe(struct lp_build_context *bld,
3229 LLVMValueRef x)
3230 {
3231 /* log(2) */
3232 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3233 0.69314718055994529);
3234
3235 assert(lp_check_value(bld->type, x));
3236
3237 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3238 }
3239
3240
3241 /**
3242 * Generate polynomial.
3243 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3244 */
3245 LLVMValueRef
3246 lp_build_polynomial(struct lp_build_context *bld,
3247 LLVMValueRef x,
3248 const double *coeffs,
3249 unsigned num_coeffs)
3250 {
3251 const struct lp_type type = bld->type;
3252 LLVMValueRef even = NULL, odd = NULL;
3253 LLVMValueRef x2;
3254 unsigned i;
3255
3256 assert(lp_check_value(bld->type, x));
3257
3258 /* TODO: optimize the constant case */
3259 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3260 LLVMIsConstant(x)) {
3261 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3262 __FUNCTION__);
3263 }
3264
3265 /*
3266     * Calculate odd and even terms separately to decrease data dependency
3267 * Ex:
3268 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3269 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3270 */
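   /*
    * e.g. with num_coeffs == 4 the loop below yields
    *    even = c[0] + x2 * c[2],  odd = c[1] + x2 * c[3]
    * and the result is odd * x + even.
    */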
3271 x2 = lp_build_mul(bld, x, x);
3272
3273 for (i = num_coeffs; i--; ) {
3274 LLVMValueRef coeff;
3275
3276 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3277
3278 if (i % 2 == 0) {
3279 if (even)
3280 even = lp_build_mad(bld, x2, even, coeff);
3281 else
3282 even = coeff;
3283 } else {
3284 if (odd)
3285 odd = lp_build_mad(bld, x2, odd, coeff);
3286 else
3287 odd = coeff;
3288 }
3289 }
3290
3291 if (odd)
3292 return lp_build_mad(bld, odd, x, even);
3293 else if (even)
3294 return even;
3295 else
3296 return bld->undef;
3297 }
3298
3299
3300 /**
3301 * Minimax polynomial fit of 2**x, in range [0, 1[
3302 */
3303 const double lp_build_exp2_polynomial[] = {
3304 #if EXP_POLY_DEGREE == 5
3305 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3306 0.693153073200168932794,
3307 0.240153617044375388211,
3308 0.0558263180532956664775,
3309 0.00898934009049466391101,
3310 0.00187757667519147912699
3311 #elif EXP_POLY_DEGREE == 4
3312 1.00000259337069434683,
3313 0.693003834469974940458,
3314 0.24144275689150793076,
3315 0.0520114606103070150235,
3316 0.0135341679161270268764
3317 #elif EXP_POLY_DEGREE == 3
3318 0.999925218562710312959,
3319 0.695833540494823811697,
3320 0.226067155427249155588,
3321 0.0780245226406372992967
3322 #elif EXP_POLY_DEGREE == 2
3323 1.00172476321474503578,
3324 0.657636275736077639316,
3325 0.33718943461968720704
3326 #else
3327 #error
3328 #endif
3329 };
3330
3331
3332 LLVMValueRef
3333 lp_build_exp2(struct lp_build_context *bld,
3334 LLVMValueRef x)
3335 {
3336 LLVMBuilderRef builder = bld->gallivm->builder;
3337 const struct lp_type type = bld->type;
3338 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3339 LLVMValueRef ipart = NULL;
3340 LLVMValueRef fpart = NULL;
3341 LLVMValueRef expipart = NULL;
3342 LLVMValueRef expfpart = NULL;
3343 LLVMValueRef res = NULL;
3344
3345 assert(lp_check_value(bld->type, x));
3346
3347 /* TODO: optimize the constant case */
3348 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3349 LLVMIsConstant(x)) {
3350 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3351 __FUNCTION__);
3352 }
3353
3354 assert(type.floating && type.width == 32);
3355
3356    /* We want to preserve NaN and make sure that for exp2, if x > 128
3357     * the result is INF, and if it's smaller than -126.9 the result is 0 */
3358 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3359 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3360 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3361 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3362
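   /*
    * 2^x = 2^ipart * 2^fpart: the integer part becomes the float exponent
    * directly, the fractional part (in [0, 1)) goes through the polynomial.
    * E.g. x = 3.5: ipart = 3, fpart = 0.5, expipart = 8.0, expfpart ~= 1.41421.
    */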
3363 /* ipart = floor(x) */
3364 /* fpart = x - ipart */
3365 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3366
3367 /* expipart = (float) (1 << ipart) */
3368 expipart = LLVMBuildAdd(builder, ipart,
3369 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3370 expipart = LLVMBuildShl(builder, expipart,
3371 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3372 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3373
3374 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3375 ARRAY_SIZE(lp_build_exp2_polynomial));
3376
3377 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3378
3379 return res;
3380 }
3381
3382
3383
3384 /**
3385  * Extract the exponent of an IEEE-754 floating point value.
3386 *
3387 * Optionally apply an integer bias.
3388 *
3389 * Result is an integer value with
3390 *
3391 * ifloor(log2(x)) + bias
3392 */
3393 LLVMValueRef
3394 lp_build_extract_exponent(struct lp_build_context *bld,
3395 LLVMValueRef x,
3396 int bias)
3397 {
3398 LLVMBuilderRef builder = bld->gallivm->builder;
3399 const struct lp_type type = bld->type;
3400 unsigned mantissa = lp_mantissa(type);
3401 LLVMValueRef res;
3402
3403 assert(type.floating);
3404
3405 assert(lp_check_value(bld->type, x));
3406
3407 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3408
3409 res = LLVMBuildLShr(builder, x,
3410 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3411 res = LLVMBuildAnd(builder, res,
3412 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3413 res = LLVMBuildSub(builder, res,
3414 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
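   /* e.g. x = 12.0 (biased exponent 130): res = 130 - (127 - bias) = 3 + bias */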
3415
3416 return res;
3417 }
3418
3419
3420 /**
3421  * Extract the mantissa of a floating point value.
3422  *
3423  * Result is a floating point value with
3424  *
3425  *   x / 2**floor(log2(x))
3426 */
3427 LLVMValueRef
3428 lp_build_extract_mantissa(struct lp_build_context *bld,
3429 LLVMValueRef x)
3430 {
3431 LLVMBuilderRef builder = bld->gallivm->builder;
3432 const struct lp_type type = bld->type;
3433 unsigned mantissa = lp_mantissa(type);
3434 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3435 (1ULL << mantissa) - 1);
3436 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3437 LLVMValueRef res;
3438
3439 assert(lp_check_value(bld->type, x));
3440
3441 assert(type.floating);
3442
3443 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3444
3445 /* res = x / 2**ipart */
3446 res = LLVMBuildAnd(builder, x, mantmask, "");
3447 res = LLVMBuildOr(builder, res, one, "");
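   /* e.g. x = 12.0: keep the mantissa bits (those of 1.5) and force in the
    * exponent of 1.0, giving 1.5 */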
3448 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3449
3450 return res;
3451 }
3452
3453
3454
3455 /**
3456  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3457  * These coefficients can be generated with
3458 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3459 */
3460 const double lp_build_log2_polynomial[] = {
3461 #if LOG_POLY_DEGREE == 5
3462 2.88539008148777786488L,
3463 0.961796878841293367824L,
3464 0.577058946784739859012L,
3465 0.412914355135828735411L,
3466 0.308591899232910175289L,
3467 0.352376952300281371868L,
3468 #elif LOG_POLY_DEGREE == 4
3469 2.88539009343309178325L,
3470 0.961791550404184197881L,
3471 0.577440339438736392009L,
3472 0.403343858251329912514L,
3473 0.406718052498846252698L,
3474 #elif LOG_POLY_DEGREE == 3
3475 2.88538959748872753838L,
3476 0.961932915889597772928L,
3477 0.571118517972136195241L,
3478 0.493997535084709500285L,
3479 #else
3480 #error
3481 #endif
3482 };
3483
3484 /**
3485 * See http://www.devmaster.net/forums/showthread.php?p=43580
3486 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3487 * http://www.nezumi.demon.co.uk/consult/logx.htm
3488 *
3489 * If handle_edge_cases is true the function will perform computations
3490 * to match the required D3D10+ behavior for each of the edge cases.
3491  * That means that if the input is:
3492  * - less than zero (down to and including -inf) then NaN will be returned
3493 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3494 * - +infinity, then +infinity will be returned
3495 * - NaN, then NaN will be returned
3496 *
3497 * Those checks are fairly expensive so if you don't need them make sure
3498 * handle_edge_cases is false.
3499 */
3500 void
3501 lp_build_log2_approx(struct lp_build_context *bld,
3502 LLVMValueRef x,
3503 LLVMValueRef *p_exp,
3504 LLVMValueRef *p_floor_log2,
3505 LLVMValueRef *p_log2,
3506 boolean handle_edge_cases)
3507 {
3508 LLVMBuilderRef builder = bld->gallivm->builder;
3509 const struct lp_type type = bld->type;
3510 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3511 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3512
3513 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3514 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3515 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3516
3517 LLVMValueRef i = NULL;
3518 LLVMValueRef y = NULL;
3519 LLVMValueRef z = NULL;
3520 LLVMValueRef exp = NULL;
3521 LLVMValueRef mant = NULL;
3522 LLVMValueRef logexp = NULL;
3523 LLVMValueRef p_z = NULL;
3524 LLVMValueRef res = NULL;
3525
3526 assert(lp_check_value(bld->type, x));
3527
3528 if(p_exp || p_floor_log2 || p_log2) {
3529 /* TODO: optimize the constant case */
3530 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3531 LLVMIsConstant(x)) {
3532 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3533 __FUNCTION__);
3534 }
3535
3536 assert(type.floating && type.width == 32);
3537
3538 /*
3539 * We don't explicitly handle denormalized numbers. They will yield a
3540        * result in the neighbourhood of -127, which appears to be
3541        * adequate.
3542 */
3543
3544 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3545
3546 /* exp = (float) exponent(x) */
3547 exp = LLVMBuildAnd(builder, i, expmask, "");
3548 }
3549
3550 if(p_floor_log2 || p_log2) {
3551 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3552 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3553 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3554 }
3555
3556 if (p_log2) {
3557 /* mant = 1 + (float) mantissa(x) */
3558 mant = LLVMBuildAnd(builder, i, mantmask, "");
3559 mant = LLVMBuildOr(builder, mant, one, "");
3560 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3561
3562 /* y = (mant - 1) / (mant + 1) */
3563 y = lp_build_div(bld,
3564 lp_build_sub(bld, mant, bld->one),
3565 lp_build_add(bld, mant, bld->one)
3566 );
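      /*
       * log2(mant) = 2/ln(2) * (y + y^3/3 + y^5/5 + ...) = y * P(y^2), which is
       * why lp_build_log2_polynomial's leading coefficient is 2/ln(2) ~= 2.88539.
       */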
3567
3568 /* z = y^2 */
3569 z = lp_build_mul(bld, y, y);
3570
3571 /* compute P(z) */
3572 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3573 ARRAY_SIZE(lp_build_log2_polynomial));
3574
3575 /* y * P(z) + logexp */
3576 res = lp_build_mad(bld, y, p_z, logexp);
3577
3578 if (type.floating && handle_edge_cases) {
3579 LLVMValueRef negmask, infmask, zmask;
3580 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3581 lp_build_const_vec(bld->gallivm, type, 0.0f));
3582 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3583 lp_build_const_vec(bld->gallivm, type, 0.0f));
3584 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3585 lp_build_const_vec(bld->gallivm, type, INFINITY));
3586
3587          /* If x is equal to inf make sure we return inf */
3588 res = lp_build_select(bld, infmask,
3589 lp_build_const_vec(bld->gallivm, type, INFINITY),
3590 res);
3591          /* If x is equal to 0, return -inf */
3592 res = lp_build_select(bld, zmask,
3593 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3594 res);
3595       /* If x is NaN or less than 0, return NaN */
3596 res = lp_build_select(bld, negmask,
3597 lp_build_const_vec(bld->gallivm, type, NAN),
3598 res);
3599 }
3600 }
3601
3602 if (p_exp) {
3603 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3604 *p_exp = exp;
3605 }
3606
3607 if (p_floor_log2)
3608 *p_floor_log2 = logexp;
3609
3610 if (p_log2)
3611 *p_log2 = res;
3612 }
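/*
 * Rough usage sketch (illustrative only; "bld" and "src" stand for a
 * float-vector build context and a source value set up elsewhere): a
 * LOG-style opcode that needs 2^floor(log2(x)), floor(log2(x)) and log2(x)
 * at once can request all three outputs in a single call and share the
 * common work:
 *
 *    LLVMValueRef pow2_part, floor_log2, log2_val;
 *    lp_build_log2_approx(&bld, src,
 *                         &pow2_part, &floor_log2, &log2_val, FALSE);
 *
 * Note that p_exp receives the exponent bits of src reinterpreted as a
 * float vector, i.e. 2^floor(log2(src)) for positive normalized inputs,
 * rather than the exponent value itself.
 */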
3613
3614
3615 /*
3616 * log2 implementation which doesn't have special code to
3617 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3618 * the results for those cases are undefined.
3619 */
3620 LLVMValueRef
3621 lp_build_log2(struct lp_build_context *bld,
3622 LLVMValueRef x)
3623 {
3624 LLVMValueRef res;
3625 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3626 return res;
3627 }
3628
3629 /*
3630 * Version of log2 which handles all edge cases.
3631 * Look at documentation of lp_build_log2_approx for
3632 * description of the behavior for each of the edge cases.
3633 */
3634 LLVMValueRef
3635 lp_build_log2_safe(struct lp_build_context *bld,
3636 LLVMValueRef x)
3637 {
3638 LLVMValueRef res;
3639 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3640 return res;
3641 }
3642
3643
3644 /**
3645 * Faster (and less accurate) log2.
3646 *
3647 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3648 *
3649 * Piece-wise linear approximation, with exact results when x is a
3650 * power of two.
3651 *
3652 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3653 */
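/*
 * For instance, with x = 8 the exponent part is floor(log2(8)) - 1 = 2 and
 * the mantissa part is 8 / 2^3 = 1.0, giving exactly 3.0.  With x = 12 the
 * result is 2 + 12/8 = 3.5 versus log2(12) ~= 3.585; the absolute error of
 * this linear approximation stays below roughly 0.087 and vanishes at
 * powers of two.
 */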
3654 LLVMValueRef
3655 lp_build_fast_log2(struct lp_build_context *bld,
3656 LLVMValueRef x)
3657 {
3658 LLVMBuilderRef builder = bld->gallivm->builder;
3659 LLVMValueRef ipart;
3660 LLVMValueRef fpart;
3661
3662 assert(lp_check_value(bld->type, x));
3663
3664 assert(bld->type.floating);
3665
3666 /* ipart = floor(log2(x)) - 1 */
3667 ipart = lp_build_extract_exponent(bld, x, -1);
3668 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3669
3670 /* fpart = x / 2**ipart */
3671 fpart = lp_build_extract_mantissa(bld, x);
3672
3673 /* ipart + fpart */
3674 return LLVMBuildFAdd(builder, ipart, fpart, "");
3675 }
3676
3677
3678 /**
3679 * Fast implementation of iround(log2(x)).
3680 *
3681 * Not an approximation -- it should give accurate results all the time.
3682 */
3683 LLVMValueRef
3684 lp_build_ilog2(struct lp_build_context *bld,
3685 LLVMValueRef x)
3686 {
3687 LLVMBuilderRef builder = bld->gallivm->builder;
3688 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3689 LLVMValueRef ipart;
3690
3691 assert(bld->type.floating);
3692
3693 assert(lp_check_value(bld->type, x));
3694
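   /*
    * Adding 0.5 to log2(x) and flooring rounds log2(x) to the nearest
    * integer.  Multiplying x by sqrt(2) is the same as adding 0.5 to
    * log2(x), so the rounded value can be read straight out of the exponent
    * field of x * sqrt(2).
    */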
3695    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3696 x = LLVMBuildFMul(builder, x, sqrt2, "");
3697
3698 /* ipart = floor(log2(x) + 0.5) */
3699 ipart = lp_build_extract_exponent(bld, x, 0);
3700
3701 return ipart;
3702 }
3703
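/*
 * Remainder of x / y, lowered to LLVM's frem/srem/urem: as with C's fmod()
 * and the % operator, the result takes the sign of the dividend x.
 * Integer remainder with a zero divisor is undefined, as in LLVM IR.
 */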
3704 LLVMValueRef
3705 lp_build_mod(struct lp_build_context *bld,
3706 LLVMValueRef x,
3707 LLVMValueRef y)
3708 {
3709 LLVMBuilderRef builder = bld->gallivm->builder;
3710 LLVMValueRef res;
3711 const struct lp_type type = bld->type;
3712
3713 assert(lp_check_value(type, x));
3714 assert(lp_check_value(type, y));
3715
3716 if (type.floating)
3717 res = LLVMBuildFRem(builder, x, y, "");
3718 else if (type.sign)
3719 res = LLVMBuildSRem(builder, x, y, "");
3720 else
3721 res = LLVMBuildURem(builder, x, y, "");
3722 return res;
3723 }
3724
3725
3726 /*
3727  * For floating point inputs, creates and returns a mask which is
3728  * all 1's for channels of x which are NaN and all 0's for channels
3729  * which are not.
3730 */
3731 LLVMValueRef
3732 lp_build_isnan(struct lp_build_context *bld,
3733 LLVMValueRef x)
3734 {
3735 LLVMValueRef mask;
3736 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3737
3738 assert(bld->type.floating);
3739 assert(lp_check_value(bld->type, x));
3740
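   /*
    * NaN is the only value that does not compare equal to itself, so the
    * ordered x == x test is true for every non-NaN channel; inverting it
    * leaves 1's exactly where x is NaN, and the sign-extension widens the
    * i1 mask to the full integer vector width.
    */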
3741 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3742 "isnotnan");
3743 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3744 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3745 return mask;
3746 }
3747
3748 /* Returns all 1's for floating point numbers that are
3749  * finite and all 0's for -inf, +inf and NaN.
3750  */
3751 LLVMValueRef
3752 lp_build_isfinite(struct lp_build_context *bld,
3753 LLVMValueRef x)
3754 {
3755 LLVMBuilderRef builder = bld->gallivm->builder;
3756 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3757 struct lp_type int_type = lp_int_type(bld->type);
3758 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3759 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3760 0x7f800000);
3761
3762 if (!bld->type.floating) {
3763 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3764 }
3765 assert(bld->type.floating);
3766 assert(lp_check_value(bld->type, x));
3767 assert(bld->type.width == 32);
3768
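   /*
    * A binary32 value is +/-inf or NaN exactly when all of its exponent
    * bits (0x7f800000) are set; every finite number, denormals included,
    * has at least one exponent bit clear, so testing the masked bits for
    * inequality with 0x7f800000 yields the "is finite" mask.
    */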
3769 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3770 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3771 intx, infornan32);
3772 }
3773
3774 /*
3775  * Returns all 1's for channels which are NaN or +/-inf and all 0's otherwise.
3776 * The input has to be a floating point vector.
3777 */
3778 LLVMValueRef
3779 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3780 const struct lp_type type,
3781 LLVMValueRef x)
3782 {
3783 LLVMBuilderRef builder = gallivm->builder;
3784 struct lp_type int_type = lp_int_type(type);
3785 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3786 0x7f800000);
3787 LLVMValueRef ret;
3788
3789 assert(type.floating);
3790
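   /*
    * Same exponent-field test as lp_build_isfinite() above, with the
    * comparison inverted: all exponent bits set means +/-inf or NaN.
    */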
3791 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3792 ret = LLVMBuildAnd(builder, ret, const0, "");
3793 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3794 ret, const0);
3795
3796 return ret;
3797 }
3798
3799
3800 LLVMValueRef
3801 lp_build_fpstate_get(struct gallivm_state *gallivm)
3802 {
3803 if (util_cpu_caps.has_sse) {
3804 LLVMBuilderRef builder = gallivm->builder;
3805 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3806 gallivm,
3807 LLVMInt32TypeInContext(gallivm->context),
3808 "mxcsr_ptr");
3809 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3810 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
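      /*
       * llvm.x86.sse.stmxcsr stores the 32-bit MXCSR control/status
       * register to the given i8* location; the alloca is returned so the
       * saved state can be restored later with lp_build_fpstate_set().
       */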
3811 lp_build_intrinsic(builder,
3812 "llvm.x86.sse.stmxcsr",
3813 LLVMVoidTypeInContext(gallivm->context),
3814 &mxcsr_ptr8, 1, 0);
3815 return mxcsr_ptr;
3816 }
3817    return NULL;
3818 }
3819
3820 void
3821 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3822 boolean zero)
3823 {
3824 if (util_cpu_caps.has_sse) {
3825       /* turn on FTZ (32768) and, where the CPU supports it, DAZ (64) */
3826 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3827
3828 LLVMBuilderRef builder = gallivm->builder;
3829 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3830 LLVMValueRef mxcsr =
3831 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3832
3833 if (util_cpu_caps.has_daz) {
3834          /* Enable the denormals-are-zero (DAZ) mode */
3835 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3836 }
3837 if (zero) {
3838 mxcsr = LLVMBuildOr(builder, mxcsr,
3839 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3840 } else {
3841 mxcsr = LLVMBuildAnd(builder, mxcsr,
3842 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3843 }
3844
3845 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3846 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3847 }
3848 }
3849
3850 void
3851 lp_build_fpstate_set(struct gallivm_state *gallivm,
3852 LLVMValueRef mxcsr_ptr)
3853 {
3854 if (util_cpu_caps.has_sse) {
3855 LLVMBuilderRef builder = gallivm->builder;
3856 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3857 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3858 lp_build_intrinsic(builder,
3859 "llvm.x86.sse.ldmxcsr",
3860 LLVMVoidTypeInContext(gallivm->context),
3861 &mxcsr_ptr, 1, 0);
3862 }
3863 }
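/*
 * Rough usage sketch (illustrative only): the three fpstate helpers above
 * are intended as a save / modify / restore sequence around generated code
 * that benefits from flush-to-zero / denormals-are-zero behaviour:
 *
 *    LLVMValueRef saved_state = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit the FP code that may encounter denormals ...
 *    lp_build_fpstate_set(gallivm, saved_state);
 *
 * On CPUs without SSE all of these helpers compile to nothing.
 */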