gallivm: replace more complex 3.x version check with LLVM_VERSION_MAJOR/MINOR
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include <llvm/Config/llvm-config.h>
51
52 #include "util/u_memory.h"
53 #include "util/u_debug.h"
54 #include "util/u_math.h"
55 #include "util/u_cpu_detect.h"
56
57 #include "lp_bld_type.h"
58 #include "lp_bld_const.h"
59 #include "lp_bld_init.h"
60 #include "lp_bld_intr.h"
61 #include "lp_bld_logic.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_debug.h"
64 #include "lp_bld_bitarit.h"
65 #include "lp_bld_arit.h"
66 #include "lp_bld_flow.h"
67
68 #if defined(PIPE_ARCH_SSE)
69 #include <xmmintrin.h>
70 #endif
71
72 #ifndef _MM_DENORMALS_ZERO_MASK
73 #define _MM_DENORMALS_ZERO_MASK 0x0040
74 #endif
75
76 #ifndef _MM_FLUSH_ZERO_MASK
77 #define _MM_FLUSH_ZERO_MASK 0x8000
78 #endif
79
80 #define EXP_POLY_DEGREE 5
81
82 #define LOG_POLY_DEGREE 4
83
84
85 /**
86 * Generate min(a, b)
87  * No checks for the special-case values a or b = 1 or 0 are done.
88  * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91 static LLVMValueRef
92 lp_build_min_simple(struct lp_build_context *bld,
93 LLVMValueRef a,
94 LLVMValueRef b,
95 enum gallivm_nan_behavior nan_behavior)
96 {
97 const struct lp_type type = bld->type;
98 const char *intrinsic = NULL;
99 unsigned intr_size = 0;
100 LLVMValueRef cond;
101
102 assert(lp_check_value(type, a));
103 assert(lp_check_value(type, b));
104
105 /* TODO: optimize the constant case */
106
107 if (type.floating && util_cpu_caps.has_sse) {
108 if (type.width == 32) {
109 if (type.length == 1) {
110 intrinsic = "llvm.x86.sse.min.ss";
111 intr_size = 128;
112 }
113 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
114 intrinsic = "llvm.x86.sse.min.ps";
115 intr_size = 128;
116 }
117 else {
118 intrinsic = "llvm.x86.avx.min.ps.256";
119 intr_size = 256;
120 }
121 }
122 if (type.width == 64 && util_cpu_caps.has_sse2) {
123 if (type.length == 1) {
124 intrinsic = "llvm.x86.sse2.min.sd";
125 intr_size = 128;
126 }
127 else if (type.length == 2 || !util_cpu_caps.has_avx) {
128 intrinsic = "llvm.x86.sse2.min.pd";
129 intr_size = 128;
130 }
131 else {
132 intrinsic = "llvm.x86.avx.min.pd.256";
133 intr_size = 256;
134 }
135 }
136 }
137 else if (type.floating && util_cpu_caps.has_altivec) {
138 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
139 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
140 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
141 __FUNCTION__);
142 }
143 if (type.width == 32 && type.length == 4) {
144 intrinsic = "llvm.ppc.altivec.vminfp";
145 intr_size = 128;
146 }
147 } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
148 util_cpu_caps.has_avx2 && type.length > 4) {
149 intr_size = 256;
150 switch (type.width) {
151 case 8:
152 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
153 break;
154 case 16:
155 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
156 break;
157 case 32:
158 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
159 break;
160 }
161 } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
162 util_cpu_caps.has_sse2 && type.length >= 2) {
163 intr_size = 128;
164 if ((type.width == 8 || type.width == 16) &&
165 (type.width * type.length <= 64) &&
166 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
167 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
168 __FUNCTION__);
169 }
170 if (type.width == 8 && !type.sign) {
171 intrinsic = "llvm.x86.sse2.pminu.b";
172 }
173 else if (type.width == 16 && type.sign) {
174 intrinsic = "llvm.x86.sse2.pmins.w";
175 }
176 if (util_cpu_caps.has_sse4_1) {
177 if (type.width == 8 && type.sign) {
178 intrinsic = "llvm.x86.sse41.pminsb";
179 }
180 if (type.width == 16 && !type.sign) {
181 intrinsic = "llvm.x86.sse41.pminuw";
182 }
183 if (type.width == 32 && !type.sign) {
184 intrinsic = "llvm.x86.sse41.pminud";
185 }
186 if (type.width == 32 && type.sign) {
187 intrinsic = "llvm.x86.sse41.pminsd";
188 }
189 }
190 } else if (util_cpu_caps.has_altivec) {
191 intr_size = 128;
192 if (type.width == 8) {
193 if (!type.sign) {
194 intrinsic = "llvm.ppc.altivec.vminub";
195 } else {
196 intrinsic = "llvm.ppc.altivec.vminsb";
197 }
198 } else if (type.width == 16) {
199 if (!type.sign) {
200 intrinsic = "llvm.ppc.altivec.vminuh";
201 } else {
202 intrinsic = "llvm.ppc.altivec.vminsh";
203 }
204 } else if (type.width == 32) {
205 if (!type.sign) {
206 intrinsic = "llvm.ppc.altivec.vminuw";
207 } else {
208 intrinsic = "llvm.ppc.altivec.vminsw";
209 }
210 }
211 }
212
213 if (intrinsic) {
214       /* We need to handle NaNs for floating point numbers. If one of the
215        * inputs is NaN the other should be returned (required by both D3D10+
216        * and OpenCL).
217        * The SSE intrinsics return the second operand in case of NaN by
218        * default, so we need special code to handle those cases.
219 */
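      /*
       * Illustrative example of the two behaviors handled below, assuming
       * min(a = NaN, b = 3.0f):
       *   GALLIVM_NAN_RETURN_OTHER -> 3.0f (the non-NaN operand is returned)
       *   GALLIVM_NAN_RETURN_NAN   -> NaN  (any NaN input yields NaN)
       * The lp_build_isnan() select covers the one operand position the SSE
       * min instruction does not already handle.
       */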
220 if (util_cpu_caps.has_sse && type.floating &&
221 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
222 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
223 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
224 LLVMValueRef isnan, min;
225 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
226 type,
227 intr_size, a, b);
228 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
229 isnan = lp_build_isnan(bld, b);
230 return lp_build_select(bld, isnan, a, min);
231 } else {
232 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
233 isnan = lp_build_isnan(bld, a);
234 return lp_build_select(bld, isnan, a, min);
235 }
236 } else {
237 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
238 type,
239 intr_size, a, b);
240 }
241 }
242
243 if (type.floating) {
244 switch (nan_behavior) {
245 case GALLIVM_NAN_RETURN_NAN: {
246 LLVMValueRef isnan = lp_build_isnan(bld, b);
247 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
248 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
249 return lp_build_select(bld, cond, a, b);
250 }
251 break;
252 case GALLIVM_NAN_RETURN_OTHER: {
253 LLVMValueRef isnan = lp_build_isnan(bld, a);
254 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
255 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
256 return lp_build_select(bld, cond, a, b);
257 }
258 break;
259 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
260 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
261 return lp_build_select(bld, cond, a, b);
262 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
263 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
264 return lp_build_select(bld, cond, b, a);
265 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
266 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
267 return lp_build_select(bld, cond, a, b);
268 break;
269 default:
270 assert(0);
271 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
272 return lp_build_select(bld, cond, a, b);
273 }
274 } else {
275 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
276 return lp_build_select(bld, cond, a, b);
277 }
278 }
279
280
281 LLVMValueRef
282 lp_build_fmuladd(LLVMBuilderRef builder,
283 LLVMValueRef a,
284 LLVMValueRef b,
285 LLVMValueRef c)
286 {
287 LLVMTypeRef type = LLVMTypeOf(a);
288 assert(type == LLVMTypeOf(b));
289 assert(type == LLVMTypeOf(c));
290 if (LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 4)) {
291       /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
292        * not supported, and instead it falls back to a C function.
293 */
294 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
295 }
296 char intrinsic[32];
297 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
298 LLVMValueRef args[] = { a, b, c };
299 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
300 }
301
302
303 /**
304 * Generate max(a, b)
305  * No checks for the special-case values a or b = 1 or 0 are done.
306  * NaNs are handled according to the behavior specified by the
307 * nan_behavior argument.
308 */
309 static LLVMValueRef
310 lp_build_max_simple(struct lp_build_context *bld,
311 LLVMValueRef a,
312 LLVMValueRef b,
313 enum gallivm_nan_behavior nan_behavior)
314 {
315 const struct lp_type type = bld->type;
316 const char *intrinsic = NULL;
317 unsigned intr_size = 0;
318 LLVMValueRef cond;
319
320 assert(lp_check_value(type, a));
321 assert(lp_check_value(type, b));
322
323 /* TODO: optimize the constant case */
324
325 if (type.floating && util_cpu_caps.has_sse) {
326 if (type.width == 32) {
327 if (type.length == 1) {
328 intrinsic = "llvm.x86.sse.max.ss";
329 intr_size = 128;
330 }
331 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
332 intrinsic = "llvm.x86.sse.max.ps";
333 intr_size = 128;
334 }
335 else {
336 intrinsic = "llvm.x86.avx.max.ps.256";
337 intr_size = 256;
338 }
339 }
340 if (type.width == 64 && util_cpu_caps.has_sse2) {
341 if (type.length == 1) {
342 intrinsic = "llvm.x86.sse2.max.sd";
343 intr_size = 128;
344 }
345 else if (type.length == 2 || !util_cpu_caps.has_avx) {
346 intrinsic = "llvm.x86.sse2.max.pd";
347 intr_size = 128;
348 }
349 else {
350 intrinsic = "llvm.x86.avx.max.pd.256";
351 intr_size = 256;
352 }
353 }
354 }
355 else if (type.floating && util_cpu_caps.has_altivec) {
356 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
357 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
358 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
359 __FUNCTION__);
360 }
361       if (type.width == 32 && type.length == 4) {
362 intrinsic = "llvm.ppc.altivec.vmaxfp";
363 intr_size = 128;
364 }
365 } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
366 util_cpu_caps.has_avx2 && type.length > 4) {
367 intr_size = 256;
368 switch (type.width) {
369 case 8:
370 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
371 break;
372 case 16:
373 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
374 break;
375 case 32:
376 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
377 break;
378 }
379 } else if ((LLVM_VERSION_MAJOR < 3 || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 9)) &&
380 util_cpu_caps.has_sse2 && type.length >= 2) {
381 intr_size = 128;
382 if ((type.width == 8 || type.width == 16) &&
383 (type.width * type.length <= 64) &&
384 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
385 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
386 __FUNCTION__);
387 }
388 if (type.width == 8 && !type.sign) {
389 intrinsic = "llvm.x86.sse2.pmaxu.b";
390 intr_size = 128;
391 }
392 else if (type.width == 16 && type.sign) {
393 intrinsic = "llvm.x86.sse2.pmaxs.w";
394 }
395 if (util_cpu_caps.has_sse4_1) {
396 if (type.width == 8 && type.sign) {
397 intrinsic = "llvm.x86.sse41.pmaxsb";
398 }
399 if (type.width == 16 && !type.sign) {
400 intrinsic = "llvm.x86.sse41.pmaxuw";
401 }
402 if (type.width == 32 && !type.sign) {
403 intrinsic = "llvm.x86.sse41.pmaxud";
404 }
405 if (type.width == 32 && type.sign) {
406 intrinsic = "llvm.x86.sse41.pmaxsd";
407 }
408 }
409 } else if (util_cpu_caps.has_altivec) {
410 intr_size = 128;
411 if (type.width == 8) {
412 if (!type.sign) {
413 intrinsic = "llvm.ppc.altivec.vmaxub";
414 } else {
415 intrinsic = "llvm.ppc.altivec.vmaxsb";
416 }
417 } else if (type.width == 16) {
418 if (!type.sign) {
419 intrinsic = "llvm.ppc.altivec.vmaxuh";
420 } else {
421 intrinsic = "llvm.ppc.altivec.vmaxsh";
422 }
423 } else if (type.width == 32) {
424 if (!type.sign) {
425 intrinsic = "llvm.ppc.altivec.vmaxuw";
426 } else {
427 intrinsic = "llvm.ppc.altivec.vmaxsw";
428 }
429 }
430 }
431
432 if (intrinsic) {
433 if (util_cpu_caps.has_sse && type.floating &&
434 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
435 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
436 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
437 LLVMValueRef isnan, max;
438 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
439 type,
440 intr_size, a, b);
441 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
442 isnan = lp_build_isnan(bld, b);
443 return lp_build_select(bld, isnan, a, max);
444 } else {
445 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
446 isnan = lp_build_isnan(bld, a);
447 return lp_build_select(bld, isnan, a, max);
448 }
449 } else {
450 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
451 type,
452 intr_size, a, b);
453 }
454 }
455
456 if (type.floating) {
457 switch (nan_behavior) {
458 case GALLIVM_NAN_RETURN_NAN: {
459 LLVMValueRef isnan = lp_build_isnan(bld, b);
460 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
461 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
462 return lp_build_select(bld, cond, a, b);
463 }
464 break;
465 case GALLIVM_NAN_RETURN_OTHER: {
466 LLVMValueRef isnan = lp_build_isnan(bld, a);
467 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
468 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
469 return lp_build_select(bld, cond, a, b);
470 }
471 break;
472 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
473 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
474 return lp_build_select(bld, cond, a, b);
475 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
476 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
477 return lp_build_select(bld, cond, b, a);
478 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
479 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
480 return lp_build_select(bld, cond, a, b);
481 break;
482 default:
483 assert(0);
484 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
485 return lp_build_select(bld, cond, a, b);
486 }
487 } else {
488 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
489 return lp_build_select(bld, cond, a, b);
490 }
491 }
492
493
494 /**
495 * Generate 1 - a, or ~a depending on bld->type.
496 */
497 LLVMValueRef
498 lp_build_comp(struct lp_build_context *bld,
499 LLVMValueRef a)
500 {
501 LLVMBuilderRef builder = bld->gallivm->builder;
502 const struct lp_type type = bld->type;
503
504 assert(lp_check_value(type, a));
505
506 if(a == bld->one)
507 return bld->zero;
508 if(a == bld->zero)
509 return bld->one;
510
511 if(type.norm && !type.floating && !type.fixed && !type.sign) {
512 if(LLVMIsConstant(a))
513 return LLVMConstNot(a);
514 else
515 return LLVMBuildNot(builder, a, "");
516 }
517
518 if(LLVMIsConstant(a))
519 if (type.floating)
520 return LLVMConstFSub(bld->one, a);
521 else
522 return LLVMConstSub(bld->one, a);
523 else
524 if (type.floating)
525 return LLVMBuildFSub(builder, bld->one, a, "");
526 else
527 return LLVMBuildSub(builder, bld->one, a, "");
528 }
529
530
531 /**
532 * Generate a + b
533 */
534 LLVMValueRef
535 lp_build_add(struct lp_build_context *bld,
536 LLVMValueRef a,
537 LLVMValueRef b)
538 {
539 LLVMBuilderRef builder = bld->gallivm->builder;
540 const struct lp_type type = bld->type;
541 LLVMValueRef res;
542
543 assert(lp_check_value(type, a));
544 assert(lp_check_value(type, b));
545
546 if (a == bld->zero)
547 return b;
548 if (b == bld->zero)
549 return a;
550 if (a == bld->undef || b == bld->undef)
551 return bld->undef;
552
553 if (type.norm) {
554 const char *intrinsic = NULL;
555
556 if (!type.sign && (a == bld->one || b == bld->one))
557 return bld->one;
558
559 if (!type.floating && !type.fixed) {
560 if (LLVM_VERSION_MAJOR >= 9) {
561 char intrin[32];
562 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
563 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
564 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
565 }
566 if (type.width * type.length == 128) {
567 if (util_cpu_caps.has_sse2) {
568 if (type.width == 8)
569 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
570 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.b" : NULL;
571 if (type.width == 16)
572 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
573 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.paddus.w" : NULL;
574 } else if (util_cpu_caps.has_altivec) {
575 if (type.width == 8)
576 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
577 if (type.width == 16)
578 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
579 }
580 }
581 if (type.width * type.length == 256) {
582 if (util_cpu_caps.has_avx2) {
583 if (type.width == 8)
584 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
585 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.b" : NULL;
586 if (type.width == 16)
587 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
588 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.paddus.w" : NULL;
589 }
590 }
591 }
592
593 if (intrinsic)
594 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
595 }
596
597 if(type.norm && !type.floating && !type.fixed) {
598 if (type.sign) {
599 uint64_t sign = (uint64_t)1 << (type.width - 1);
600 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
601 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
602 /* a_clamp_max is the maximum a for positive b,
603 a_clamp_min is the minimum a for negative b. */
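         /*
          * Illustrative numbers for 8-bit signed: max_val = 127, min_val = -128;
          * with b = 100 the min below clamps a to at most 27, so a + b cannot
          * overflow past 127 (sketch only, the code handles all widths).
          */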
604 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
605 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
606 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
607 }
608 }
609
610 if(LLVMIsConstant(a) && LLVMIsConstant(b))
611 if (type.floating)
612 res = LLVMConstFAdd(a, b);
613 else
614 res = LLVMConstAdd(a, b);
615 else
616 if (type.floating)
617 res = LLVMBuildFAdd(builder, a, b, "");
618 else
619 res = LLVMBuildAdd(builder, a, b, "");
620
621 /* clamp to ceiling of 1.0 */
622 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
623 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
624
625 if (type.norm && !type.floating && !type.fixed) {
626 if (!type.sign) {
627 /*
628 * newer llvm versions no longer support the intrinsics, but recognize
629 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
630 * code, it is important we match the pattern llvm uses (and pray llvm
631 * doesn't change it - and hope they decide on the same pattern for
632 * all backends supporting it...).
633 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
634 * interfere with llvm's ability to recognize the pattern but seems
635 * a bit brittle.
636 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
637 */
638 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
639 res = lp_build_select(bld, overflowed,
640 LLVMConstAllOnes(bld->int_vec_type), res);
641 }
642 }
643
644 /* XXX clamp to floor of -1 or 0??? */
645
646 return res;
647 }
648
649
650 /** Return the scalar sum of the elements of a.
651 * Should avoid this operation whenever possible.
652 */
653 LLVMValueRef
654 lp_build_horizontal_add(struct lp_build_context *bld,
655 LLVMValueRef a)
656 {
657 LLVMBuilderRef builder = bld->gallivm->builder;
658 const struct lp_type type = bld->type;
659 LLVMValueRef index, res;
660 unsigned i, length;
661 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
662 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
663 LLVMValueRef vecres, elem2;
664
665 assert(lp_check_value(type, a));
666
667 if (type.length == 1) {
668 return a;
669 }
670
671 assert(!bld->type.norm);
672
673 /*
674     * for byte vectors we could do much better with psadbw.
675     * Using repeated shuffle/adds here. Note with multiple vectors
676     * this can be done more efficiently as outlined in the Intel
677 * optimization manual.
678 * Note: could cause data rearrangement if used with smaller element
679 * sizes.
680 */
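   /*
    * Sketch of the reduction below for a 4-element vector {a,b,c,d}
    * (illustrative only):
    *   step 1: shuffle into {a,b} and {c,d}, add -> {a+c, b+d}
    *   step 2: extract lanes 0 and 1, add        -> (a+c) + (b+d)
    */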
681
682 vecres = a;
683 length = type.length / 2;
684 while (length > 1) {
685 LLVMValueRef vec1, vec2;
686 for (i = 0; i < length; i++) {
687 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
688 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
689 }
690 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
691 LLVMConstVector(shuffles1, length), "");
692 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
693 LLVMConstVector(shuffles2, length), "");
694 if (type.floating) {
695 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
696 }
697 else {
698 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
699 }
700 length = length >> 1;
701 }
702
703 /* always have vector of size 2 here */
704 assert(length == 1);
705
706 index = lp_build_const_int32(bld->gallivm, 0);
707 res = LLVMBuildExtractElement(builder, vecres, index, "");
708 index = lp_build_const_int32(bld->gallivm, 1);
709 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
710
711 if (type.floating)
712 res = LLVMBuildFAdd(builder, res, elem2, "");
713 else
714 res = LLVMBuildAdd(builder, res, elem2, "");
715
716 return res;
717 }
718
719 /**
720 * Return the horizontal sums of 4 float vectors as a float4 vector.
721  * This uses the technique outlined in the Intel Optimization Manual.
722 */
723 static LLVMValueRef
724 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
725 LLVMValueRef src[4])
726 {
727 struct gallivm_state *gallivm = bld->gallivm;
728 LLVMBuilderRef builder = gallivm->builder;
729 LLVMValueRef shuffles[4];
730 LLVMValueRef tmp[4];
731 LLVMValueRef sumtmp[2], shuftmp[2];
732
733 /* lower half of regs */
734 shuffles[0] = lp_build_const_int32(gallivm, 0);
735 shuffles[1] = lp_build_const_int32(gallivm, 1);
736 shuffles[2] = lp_build_const_int32(gallivm, 4);
737 shuffles[3] = lp_build_const_int32(gallivm, 5);
738 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
739 LLVMConstVector(shuffles, 4), "");
740 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
741 LLVMConstVector(shuffles, 4), "");
742
743 /* upper half of regs */
744 shuffles[0] = lp_build_const_int32(gallivm, 2);
745 shuffles[1] = lp_build_const_int32(gallivm, 3);
746 shuffles[2] = lp_build_const_int32(gallivm, 6);
747 shuffles[3] = lp_build_const_int32(gallivm, 7);
748 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
749 LLVMConstVector(shuffles, 4), "");
750 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
751 LLVMConstVector(shuffles, 4), "");
752
753 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
754 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
755
756 shuffles[0] = lp_build_const_int32(gallivm, 0);
757 shuffles[1] = lp_build_const_int32(gallivm, 2);
758 shuffles[2] = lp_build_const_int32(gallivm, 4);
759 shuffles[3] = lp_build_const_int32(gallivm, 6);
760 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
761 LLVMConstVector(shuffles, 4), "");
762
763 shuffles[0] = lp_build_const_int32(gallivm, 1);
764 shuffles[1] = lp_build_const_int32(gallivm, 3);
765 shuffles[2] = lp_build_const_int32(gallivm, 5);
766 shuffles[3] = lp_build_const_int32(gallivm, 7);
767 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
768 LLVMConstVector(shuffles, 4), "");
769
770 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
771 }
772
773
774 /*
775 * partially horizontally add 2-4 float vectors with length nx4,
776 * i.e. only four adjacent values in each vector will be added,
777 * assuming values are really grouped in 4 which also determines
778 * output order.
779 *
780 * Return a vector of the same length as the initial vectors,
781 * with the excess elements (if any) being undefined.
782 * The element order is independent of number of input vectors.
783 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
784 * the output order thus will be
785  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
786 */
787 LLVMValueRef
788 lp_build_hadd_partial4(struct lp_build_context *bld,
789 LLVMValueRef vectors[],
790 unsigned num_vecs)
791 {
792 struct gallivm_state *gallivm = bld->gallivm;
793 LLVMBuilderRef builder = gallivm->builder;
794 LLVMValueRef ret_vec;
795 LLVMValueRef tmp[4];
796 const char *intrinsic = NULL;
797
798 assert(num_vecs >= 2 && num_vecs <= 4);
799 assert(bld->type.floating);
800
801 /* only use this with at least 2 vectors, as it is sort of expensive
802 * (depending on cpu) and we always need two horizontal adds anyway,
803 * so a shuffle/add approach might be better.
804 */
805
806 tmp[0] = vectors[0];
807 tmp[1] = vectors[1];
808
809 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
810 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
811
812 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
813 bld->type.length == 4) {
814 intrinsic = "llvm.x86.sse3.hadd.ps";
815 }
816 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
817 bld->type.length == 8) {
818 intrinsic = "llvm.x86.avx.hadd.ps.256";
819 }
820 if (intrinsic) {
821 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
822 lp_build_vec_type(gallivm, bld->type),
823 tmp[0], tmp[1]);
824 if (num_vecs > 2) {
825 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
826 lp_build_vec_type(gallivm, bld->type),
827 tmp[2], tmp[3]);
828 }
829 else {
830 tmp[1] = tmp[0];
831 }
832 return lp_build_intrinsic_binary(builder, intrinsic,
833 lp_build_vec_type(gallivm, bld->type),
834 tmp[0], tmp[1]);
835 }
836
837 if (bld->type.length == 4) {
838 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
839 }
840 else {
841 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
842 unsigned j;
843 unsigned num_iter = bld->type.length / 4;
844 struct lp_type parttype = bld->type;
845 parttype.length = 4;
846 for (j = 0; j < num_iter; j++) {
847 LLVMValueRef partsrc[4];
848 unsigned i;
849 for (i = 0; i < 4; i++) {
850 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
851 }
852 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
853 }
854 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
855 }
856 return ret_vec;
857 }
858
859 /**
860 * Generate a - b
861 */
862 LLVMValueRef
863 lp_build_sub(struct lp_build_context *bld,
864 LLVMValueRef a,
865 LLVMValueRef b)
866 {
867 LLVMBuilderRef builder = bld->gallivm->builder;
868 const struct lp_type type = bld->type;
869 LLVMValueRef res;
870
871 assert(lp_check_value(type, a));
872 assert(lp_check_value(type, b));
873
874 if (b == bld->zero)
875 return a;
876 if (a == bld->undef || b == bld->undef)
877 return bld->undef;
878 if (a == b)
879 return bld->zero;
880
881 if (type.norm) {
882 const char *intrinsic = NULL;
883
884 if (!type.sign && b == bld->one)
885 return bld->zero;
886
887 if (!type.floating && !type.fixed) {
888 if (LLVM_VERSION_MAJOR >= 9) {
889 char intrin[32];
890 intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
891 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
892 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
893 }
894 if (type.width * type.length == 128) {
895 if (util_cpu_caps.has_sse2) {
896 if (type.width == 8)
897 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
898 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.b" : NULL;
899 if (type.width == 16)
900 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
901 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.sse2.psubus.w" : NULL;
902 } else if (util_cpu_caps.has_altivec) {
903 if (type.width == 8)
904 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
905 if (type.width == 16)
906 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
907 }
908 }
909 if (type.width * type.length == 256) {
910 if (util_cpu_caps.has_avx2) {
911 if (type.width == 8)
912 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
913 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.b" : NULL;
914 if (type.width == 16)
915 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
916 LLVM_VERSION_MAJOR < 8 ? "llvm.x86.avx2.psubus.w" : NULL;
917 }
918 }
919 }
920
921 if (intrinsic)
922 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
923 }
924
925 if(type.norm && !type.floating && !type.fixed) {
926 if (type.sign) {
927 uint64_t sign = (uint64_t)1 << (type.width - 1);
928 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
929 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
930 /* a_clamp_max is the maximum a for negative b,
931 a_clamp_min is the minimum a for positive b. */
932 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
933 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
934 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
935 } else {
936 /*
937 * This must match llvm pattern for saturated unsigned sub.
938 * (lp_build_max_simple actually does the job with its current
939 * definition but do it explicitly here.)
940 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
941 * interfere with llvm's ability to recognize the pattern but seems
942 * a bit brittle.
943 * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
944 */
945 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
946 a = lp_build_select(bld, no_ov, a, b);
947 }
948 }
949
950 if(LLVMIsConstant(a) && LLVMIsConstant(b))
951 if (type.floating)
952 res = LLVMConstFSub(a, b);
953 else
954 res = LLVMConstSub(a, b);
955 else
956 if (type.floating)
957 res = LLVMBuildFSub(builder, a, b, "");
958 else
959 res = LLVMBuildSub(builder, a, b, "");
960
961 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
962 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
963
964 return res;
965 }
966
967
968
969 /**
970 * Normalized multiplication.
971 *
972 * There are several approaches for (using 8-bit normalized multiplication as
973 * an example):
974 *
975 * - alpha plus one
976 *
977 * makes the following approximation to the division (Sree)
978 *
979 * a*b/255 ~= (a*(b + 1)) >> 256
980 *
981 * which is the fastest method that satisfies the following OpenGL criteria of
982 *
983 * 0*0 = 0 and 255*255 = 255
984 *
985 * - geometric series
986 *
987 * takes the geometric series approximation to the division
988 *
989 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
990 *
991 * in this case just the first two terms to fit in 16bit arithmetic
992 *
993 * t/255 ~= (t + (t >> 8)) >> 8
994 *
995  * note that just by itself it doesn't satisfy the OpenGL criteria, as
996  * 255*255 = 254, so the special case b = 255 must be accounted for, or roundoff
997 * must be used.
998 *
999 * - geometric series plus rounding
1000 *
1001 * when using a geometric series division instead of truncating the result
1002 * use roundoff in the approximation (Jim Blinn)
1003 *
1004 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
1005 *
1006  * achieving exact results.
1007 *
1008 *
1009 *
1010 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
1011 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
1012 * @sa Michael Herf, The "double blend trick", May 2000,
1013 * http://www.stereopsis.com/doubleblend.html
1014 */
1015 LLVMValueRef
1016 lp_build_mul_norm(struct gallivm_state *gallivm,
1017 struct lp_type wide_type,
1018 LLVMValueRef a, LLVMValueRef b)
1019 {
1020 LLVMBuilderRef builder = gallivm->builder;
1021 struct lp_build_context bld;
1022 unsigned n;
1023 LLVMValueRef half;
1024 LLVMValueRef ab;
1025
1026 assert(!wide_type.floating);
1027 assert(lp_check_value(wide_type, a));
1028 assert(lp_check_value(wide_type, b));
1029
1030 lp_build_context_init(&bld, gallivm, wide_type);
1031
1032 n = wide_type.width / 2;
1033 if (wide_type.sign) {
1034 --n;
1035 }
1036
1037 /*
1038 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1039 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1040 */
1041
1042 /*
1043 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1044 */
1045
1046 ab = LLVMBuildMul(builder, a, b, "");
1047 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1048
1049 /*
1050 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1051 */
1052
1053 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1054 if (wide_type.sign) {
1055 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1056 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1057 half = lp_build_select(&bld, sign, minus_half, half);
1058 }
1059 ab = LLVMBuildAdd(builder, ab, half, "");
1060
1061 /* Final division */
1062 ab = lp_build_shr_imm(&bld, ab, n);
1063
1064 return ab;
1065 }
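#if 0
/*
 * Purely illustrative scalar sketch of the unsigned 8-bit case handled by
 * lp_build_mul_norm above (wide_type = u16, n = 8).  Not compiled, and the
 * helper name is made up; it only exists to show the arithmetic, e.g.
 * 255 * 255: 65025 -> 65279 -> 65407 -> 255, while 0 * x stays 0.
 */
static uint8_t
u8_mul_norm_sketch(uint8_t a, uint8_t b)
{
   uint16_t ab = (uint16_t)a * b;   /* wide multiply */
   ab = ab + (ab >> 8);             /* geometric series term */
   ab = ab + 0x80;                  /* rounding bias (half) */
   return ab >> 8;                  /* final division by 2**8 */
}
#endif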
1066
1067 /**
1068 * Generate a * b
1069 */
1070 LLVMValueRef
1071 lp_build_mul(struct lp_build_context *bld,
1072 LLVMValueRef a,
1073 LLVMValueRef b)
1074 {
1075 LLVMBuilderRef builder = bld->gallivm->builder;
1076 const struct lp_type type = bld->type;
1077 LLVMValueRef shift;
1078 LLVMValueRef res;
1079
1080 assert(lp_check_value(type, a));
1081 assert(lp_check_value(type, b));
1082
1083 if(a == bld->zero)
1084 return bld->zero;
1085 if(a == bld->one)
1086 return b;
1087 if(b == bld->zero)
1088 return bld->zero;
1089 if(b == bld->one)
1090 return a;
1091 if(a == bld->undef || b == bld->undef)
1092 return bld->undef;
1093
1094 if (!type.floating && !type.fixed && type.norm) {
1095 struct lp_type wide_type = lp_wider_type(type);
1096 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1097
1098 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1099 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1100
1101 /* PMULLW, PSRLW, PADDW */
1102 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1103 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1104
1105 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1106
1107 return ab;
1108 }
1109
1110 if(type.fixed)
1111 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1112 else
1113 shift = NULL;
1114
1115 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1116 if (type.floating)
1117 res = LLVMConstFMul(a, b);
1118 else
1119 res = LLVMConstMul(a, b);
1120 if(shift) {
1121 if(type.sign)
1122 res = LLVMConstAShr(res, shift);
1123 else
1124 res = LLVMConstLShr(res, shift);
1125 }
1126 }
1127 else {
1128 if (type.floating)
1129 res = LLVMBuildFMul(builder, a, b, "");
1130 else
1131 res = LLVMBuildMul(builder, a, b, "");
1132 if(shift) {
1133 if(type.sign)
1134 res = LLVMBuildAShr(builder, res, shift, "");
1135 else
1136 res = LLVMBuildLShr(builder, res, shift, "");
1137 }
1138 }
1139
1140 return res;
1141 }
1142
1143 /*
1144 * Widening mul, valid for 32x32 bit -> 64bit only.
1145 * Result is low 32bits, high bits returned in res_hi.
1146 *
1147 * Emits code that is meant to be compiled for the host CPU.
1148 */
1149 LLVMValueRef
1150 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1151 LLVMValueRef a,
1152 LLVMValueRef b,
1153 LLVMValueRef *res_hi)
1154 {
1155 struct gallivm_state *gallivm = bld->gallivm;
1156 LLVMBuilderRef builder = gallivm->builder;
1157
1158 assert(bld->type.width == 32);
1159 assert(bld->type.floating == 0);
1160 assert(bld->type.fixed == 0);
1161 assert(bld->type.norm == 0);
1162
1163 /*
1164 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1165 * for x86 simd is atrocious (even if the high bits weren't required),
1166 * trying to handle real 64bit inputs (which of course can't happen due
1167 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1168 * apparently llvm does not recognize this widening mul). This includes 6
1169 * (instead of 2) pmuludq plus extra adds and shifts
1170 * The same story applies to signed mul, albeit fixing this requires sse41.
1171 * https://llvm.org/bugs/show_bug.cgi?id=30845
1172 * So, whip up our own code, albeit only for length 4 and 8 (which
1173 * should be good enough)...
1174 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1175 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1176 * for signed), which the fallback code does not, without this llvm
1177 * will likely still produce atrocious code.
1178 */
1179 if (LLVM_VERSION_MAJOR < 7 &&
1180 (bld->type.length == 4 || bld->type.length == 8) &&
1181 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1182 util_cpu_caps.has_sse4_1)) {
1183 const char *intrinsic = NULL;
1184 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1185 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1186 struct lp_type type_wide = lp_wider_type(bld->type);
1187 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1188 unsigned i;
1189 for (i = 0; i < bld->type.length; i += 2) {
1190 shuf[i] = lp_build_const_int32(gallivm, i+1);
1191 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1192 }
1193 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1194 aeven = a;
1195 beven = b;
1196 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1197 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
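      /*
       * Lane layout sketch (length 4, illustrative): pmuludq/pmuldq multiply
       * the low 32 bits of each 64-bit lane, i.e. vector elements 0 and 2, so
       * a/b directly supply the even elements and the shuffle above moves
       * elements 1 and 3 into those positions for aodd/bodd.
       */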
1198
1199 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1200 if (bld->type.sign) {
1201 intrinsic = "llvm.x86.avx2.pmul.dq";
1202 } else {
1203 intrinsic = "llvm.x86.avx2.pmulu.dq";
1204 }
1205 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1206 wider_type, aeven, beven);
1207 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1208 wider_type, aodd, bodd);
1209 }
1210 else {
1211 /* for consistent naming look elsewhere... */
1212 if (bld->type.sign) {
1213 intrinsic = "llvm.x86.sse41.pmuldq";
1214 } else {
1215 intrinsic = "llvm.x86.sse2.pmulu.dq";
1216 }
1217 /*
1218 * XXX If we only have AVX but not AVX2 this is a pain.
1219 * lp_build_intrinsic_binary_anylength() can't handle it
1220 * (due to src and dst type not being identical).
1221 */
1222 if (bld->type.length == 8) {
1223 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1224 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1225 LLVMValueRef muleven2[2], mulodd2[2];
1226 struct lp_type type_wide_half = type_wide;
1227 LLVMTypeRef wtype_half;
1228 type_wide_half.length = 2;
1229 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1230 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1231 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1232 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1233 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1234 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1235 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1236 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1237 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1238 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1239 wtype_half, aevenlo, bevenlo);
1240 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1241 wtype_half, aoddlo, boddlo);
1242 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1243 wtype_half, aevenhi, bevenhi);
1244 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1245 wtype_half, aoddhi, boddhi);
1246 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1247 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1248
1249 }
1250 else {
1251 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1252 wider_type, aeven, beven);
1253 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1254 wider_type, aodd, bodd);
1255 }
1256 }
1257 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1258 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1259
1260 for (i = 0; i < bld->type.length; i += 2) {
1261 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1262 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1263 }
1264 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1265 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1266
1267 for (i = 0; i < bld->type.length; i += 2) {
1268 shuf[i] = lp_build_const_int32(gallivm, i);
1269 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1270 }
1271 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1272 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1273 }
1274 else {
1275 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1276 }
1277 }
1278
1279
1280 /*
1281 * Widening mul, valid for 32x32 bit -> 64bit only.
1282 * Result is low 32bits, high bits returned in res_hi.
1283 *
1284 * Emits generic code.
1285 */
1286 LLVMValueRef
1287 lp_build_mul_32_lohi(struct lp_build_context *bld,
1288 LLVMValueRef a,
1289 LLVMValueRef b,
1290 LLVMValueRef *res_hi)
1291 {
1292 struct gallivm_state *gallivm = bld->gallivm;
1293 LLVMBuilderRef builder = gallivm->builder;
1294 LLVMValueRef tmp, shift, res_lo;
1295 struct lp_type type_tmp;
1296 LLVMTypeRef wide_type, narrow_type;
1297
1298 type_tmp = bld->type;
1299 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1300 type_tmp.width *= 2;
1301 wide_type = lp_build_vec_type(gallivm, type_tmp);
1302 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1303
1304 if (bld->type.sign) {
1305 a = LLVMBuildSExt(builder, a, wide_type, "");
1306 b = LLVMBuildSExt(builder, b, wide_type, "");
1307 } else {
1308 a = LLVMBuildZExt(builder, a, wide_type, "");
1309 b = LLVMBuildZExt(builder, b, wide_type, "");
1310 }
1311 tmp = LLVMBuildMul(builder, a, b, "");
1312
1313 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1314
1315 /* Since we truncate anyway, LShr and AShr are equivalent. */
1316 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1317 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1318
1319 return res_lo;
1320 }
1321
1322
1323 /* a * b + c */
1324 LLVMValueRef
1325 lp_build_mad(struct lp_build_context *bld,
1326 LLVMValueRef a,
1327 LLVMValueRef b,
1328 LLVMValueRef c)
1329 {
1330 const struct lp_type type = bld->type;
1331 if (type.floating) {
1332 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1333 } else {
1334 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1335 }
1336 }
1337
1338
1339 /**
1340 * Small vector x scale multiplication optimization.
1341 */
1342 LLVMValueRef
1343 lp_build_mul_imm(struct lp_build_context *bld,
1344 LLVMValueRef a,
1345 int b)
1346 {
1347 LLVMBuilderRef builder = bld->gallivm->builder;
1348 LLVMValueRef factor;
1349
1350 assert(lp_check_value(bld->type, a));
1351
1352 if(b == 0)
1353 return bld->zero;
1354
1355 if(b == 1)
1356 return a;
1357
1358 if(b == -1)
1359 return lp_build_negate(bld, a);
1360
1361 if(b == 2 && bld->type.floating)
1362 return lp_build_add(bld, a, a);
1363
1364 if(util_is_power_of_two_or_zero(b)) {
1365 unsigned shift = ffs(b) - 1;
1366
1367 if(bld->type.floating) {
1368 #if 0
1369 /*
1370 * Power of two multiplication by directly manipulating the exponent.
1371 *
1372     * XXX: This might not always be faster, it will introduce a small error
1373 * for multiplication by zero, and it will produce wrong results
1374 * for Inf and NaN.
1375 */
1376 unsigned mantissa = lp_mantissa(bld->type);
1377 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1378 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1379 a = LLVMBuildAdd(builder, a, factor, "");
1380 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1381 return a;
1382 #endif
1383 }
1384 else {
1385 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1386 return LLVMBuildShl(builder, a, factor, "");
1387 }
1388 }
1389
1390 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1391 return lp_build_mul(bld, a, factor);
1392 }
1393
1394
1395 /**
1396 * Generate a / b
1397 */
1398 LLVMValueRef
1399 lp_build_div(struct lp_build_context *bld,
1400 LLVMValueRef a,
1401 LLVMValueRef b)
1402 {
1403 LLVMBuilderRef builder = bld->gallivm->builder;
1404 const struct lp_type type = bld->type;
1405
1406 assert(lp_check_value(type, a));
1407 assert(lp_check_value(type, b));
1408
1409 if(a == bld->zero)
1410 return bld->zero;
1411 if(a == bld->one && type.floating)
1412 return lp_build_rcp(bld, b);
1413 if(b == bld->zero)
1414 return bld->undef;
1415 if(b == bld->one)
1416 return a;
1417 if(a == bld->undef || b == bld->undef)
1418 return bld->undef;
1419
1420 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1421 if (type.floating)
1422 return LLVMConstFDiv(a, b);
1423 else if (type.sign)
1424 return LLVMConstSDiv(a, b);
1425 else
1426 return LLVMConstUDiv(a, b);
1427 }
1428
1429 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1430 if(FALSE &&
1431 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1432 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1433 type.floating)
1434 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1435
1436 if (type.floating)
1437 return LLVMBuildFDiv(builder, a, b, "");
1438 else if (type.sign)
1439 return LLVMBuildSDiv(builder, a, b, "");
1440 else
1441 return LLVMBuildUDiv(builder, a, b, "");
1442 }
1443
1444
1445 /**
1446 * Linear interpolation helper.
1447 *
1448 * @param normalized whether we are interpolating normalized values,
1449 * encoded in normalized integers, twice as wide.
1450 *
1451 * @sa http://www.stereopsis.com/doubleblend.html
1452 */
1453 static inline LLVMValueRef
1454 lp_build_lerp_simple(struct lp_build_context *bld,
1455 LLVMValueRef x,
1456 LLVMValueRef v0,
1457 LLVMValueRef v1,
1458 unsigned flags)
1459 {
1460 unsigned half_width = bld->type.width/2;
1461 LLVMBuilderRef builder = bld->gallivm->builder;
1462 LLVMValueRef delta;
1463 LLVMValueRef res;
1464
1465 assert(lp_check_value(bld->type, x));
1466 assert(lp_check_value(bld->type, v0));
1467 assert(lp_check_value(bld->type, v1));
1468
1469 delta = lp_build_sub(bld, v1, v0);
1470
1471 if (bld->type.floating) {
1472 assert(flags == 0);
1473 return lp_build_mad(bld, x, delta, v0);
1474 }
1475
1476 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1477 if (!bld->type.sign) {
1478 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1479 /*
1480 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1481              * most significant bit to the least significant bit, so that
1482 * later we can just divide by 2**n instead of 2**n - 1.
1483 */
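            /*
             * E.g. with half_width = 8 (illustrative arithmetic only):
             * x = 255 becomes 255 + (255 >> 7) = 256, x = 0 stays 0, so the
             * divide by 2**8 below is exact at the endpoints and a close
             * approximation of dividing by 255 elsewhere.
             */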
1484
1485 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1486 }
1487
1488 /* (x * delta) >> n */
1489 res = lp_build_mul(bld, x, delta);
1490 res = lp_build_shr_imm(bld, res, half_width);
1491 } else {
1492 /*
1493 * The rescaling trick above doesn't work for signed numbers, so
1494           * use the 2**n - 1 division approximation in lp_build_mul_norm
1495 * instead.
1496 */
1497 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1498 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1499 }
1500 } else {
1501 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1502 res = lp_build_mul(bld, x, delta);
1503 }
1504
1505 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1506 /*
1507 * At this point both res and v0 only use the lower half of the bits,
1508 * the rest is zero. Instead of add / mask, do add with half wide type.
1509 */
1510 struct lp_type narrow_type;
1511 struct lp_build_context narrow_bld;
1512
1513 memset(&narrow_type, 0, sizeof narrow_type);
1514 narrow_type.sign = bld->type.sign;
1515 narrow_type.width = bld->type.width/2;
1516 narrow_type.length = bld->type.length*2;
1517
1518 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1519 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1520 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1521 res = lp_build_add(&narrow_bld, v0, res);
1522 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1523 } else {
1524 res = lp_build_add(bld, v0, res);
1525
1526 if (bld->type.fixed) {
1527 /*
1528           * We need to mask out the high order bits when lerping 8-bit
1529           * normalized colors stored in 16 bits
1530           */
1531          /* XXX: This step is necessary for lerping 8-bit colors stored in
1532           * 16 bits, but it will be wrong for true fixed point use cases.
1533 * Basically we need a more powerful lp_type, capable of further
1534 * distinguishing the values interpretation from the value storage.
1535 */
1536 LLVMValueRef low_bits;
1537 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1538 res = LLVMBuildAnd(builder, res, low_bits, "");
1539 }
1540 }
1541
1542 return res;
1543 }
1544
1545
1546 /**
1547 * Linear interpolation.
1548 */
1549 LLVMValueRef
1550 lp_build_lerp(struct lp_build_context *bld,
1551 LLVMValueRef x,
1552 LLVMValueRef v0,
1553 LLVMValueRef v1,
1554 unsigned flags)
1555 {
1556 const struct lp_type type = bld->type;
1557 LLVMValueRef res;
1558
1559 assert(lp_check_value(type, x));
1560 assert(lp_check_value(type, v0));
1561 assert(lp_check_value(type, v1));
1562
1563 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1564
1565 if (type.norm) {
1566 struct lp_type wide_type;
1567 struct lp_build_context wide_bld;
1568 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1569
1570 assert(type.length >= 2);
1571
1572 /*
1573 * Create a wider integer type, enough to hold the
1574 * intermediate result of the multiplication.
1575 */
1576 memset(&wide_type, 0, sizeof wide_type);
1577 wide_type.sign = type.sign;
1578 wide_type.width = type.width*2;
1579 wide_type.length = type.length/2;
1580
1581 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1582
1583 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1584 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1585 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1586
1587 /*
1588 * Lerp both halves.
1589 */
1590
1591 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1592
1593 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1594 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1595
1596 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1597 } else {
1598 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1599 }
1600
1601 return res;
1602 }
1603
1604
1605 /**
1606 * Bilinear interpolation.
1607 *
1608  * Value indices are in v_{yx}.
1609 */
1610 LLVMValueRef
1611 lp_build_lerp_2d(struct lp_build_context *bld,
1612 LLVMValueRef x,
1613 LLVMValueRef y,
1614 LLVMValueRef v00,
1615 LLVMValueRef v01,
1616 LLVMValueRef v10,
1617 LLVMValueRef v11,
1618 unsigned flags)
1619 {
1620 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1621 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1622 return lp_build_lerp(bld, y, v0, v1, flags);
1623 }
1624
1625
1626 LLVMValueRef
1627 lp_build_lerp_3d(struct lp_build_context *bld,
1628 LLVMValueRef x,
1629 LLVMValueRef y,
1630 LLVMValueRef z,
1631 LLVMValueRef v000,
1632 LLVMValueRef v001,
1633 LLVMValueRef v010,
1634 LLVMValueRef v011,
1635 LLVMValueRef v100,
1636 LLVMValueRef v101,
1637 LLVMValueRef v110,
1638 LLVMValueRef v111,
1639 unsigned flags)
1640 {
1641 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1642 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1643 return lp_build_lerp(bld, z, v0, v1, flags);
1644 }
1645
1646
1647 /**
1648 * Generate min(a, b)
1649  * Do checks for special cases but not for NaNs.
1650 */
1651 LLVMValueRef
1652 lp_build_min(struct lp_build_context *bld,
1653 LLVMValueRef a,
1654 LLVMValueRef b)
1655 {
1656 assert(lp_check_value(bld->type, a));
1657 assert(lp_check_value(bld->type, b));
1658
1659 if(a == bld->undef || b == bld->undef)
1660 return bld->undef;
1661
1662 if(a == b)
1663 return a;
1664
1665 if (bld->type.norm) {
1666 if (!bld->type.sign) {
1667 if (a == bld->zero || b == bld->zero) {
1668 return bld->zero;
1669 }
1670 }
1671 if(a == bld->one)
1672 return b;
1673 if(b == bld->one)
1674 return a;
1675 }
1676
1677 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1678 }
1679
1680
1681 /**
1682 * Generate min(a, b)
1683  * NaNs are handled according to the behavior specified by the
1684 * nan_behavior argument.
1685 */
1686 LLVMValueRef
1687 lp_build_min_ext(struct lp_build_context *bld,
1688 LLVMValueRef a,
1689 LLVMValueRef b,
1690 enum gallivm_nan_behavior nan_behavior)
1691 {
1692 assert(lp_check_value(bld->type, a));
1693 assert(lp_check_value(bld->type, b));
1694
1695 if(a == bld->undef || b == bld->undef)
1696 return bld->undef;
1697
1698 if(a == b)
1699 return a;
1700
1701 if (bld->type.norm) {
1702 if (!bld->type.sign) {
1703 if (a == bld->zero || b == bld->zero) {
1704 return bld->zero;
1705 }
1706 }
1707 if(a == bld->one)
1708 return b;
1709 if(b == bld->one)
1710 return a;
1711 }
1712
1713 return lp_build_min_simple(bld, a, b, nan_behavior);
1714 }
1715
1716 /**
1717 * Generate max(a, b)
1718 * Do checks for special cases, but NaN behavior is undefined.
1719 */
1720 LLVMValueRef
1721 lp_build_max(struct lp_build_context *bld,
1722 LLVMValueRef a,
1723 LLVMValueRef b)
1724 {
1725 assert(lp_check_value(bld->type, a));
1726 assert(lp_check_value(bld->type, b));
1727
1728 if(a == bld->undef || b == bld->undef)
1729 return bld->undef;
1730
1731 if(a == b)
1732 return a;
1733
1734 if(bld->type.norm) {
1735 if(a == bld->one || b == bld->one)
1736 return bld->one;
1737 if (!bld->type.sign) {
1738 if (a == bld->zero) {
1739 return b;
1740 }
1741 if (b == bld->zero) {
1742 return a;
1743 }
1744 }
1745 }
1746
1747 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1748 }
1749
1750
1751 /**
1752 * Generate max(a, b)
1753 * Checks for special cases.
1754  * NaNs are handled according to the behavior specified by the
1755 * nan_behavior argument.
1756 */
1757 LLVMValueRef
1758 lp_build_max_ext(struct lp_build_context *bld,
1759 LLVMValueRef a,
1760 LLVMValueRef b,
1761 enum gallivm_nan_behavior nan_behavior)
1762 {
1763 assert(lp_check_value(bld->type, a));
1764 assert(lp_check_value(bld->type, b));
1765
1766 if(a == bld->undef || b == bld->undef)
1767 return bld->undef;
1768
1769 if(a == b)
1770 return a;
1771
1772 if(bld->type.norm) {
1773 if(a == bld->one || b == bld->one)
1774 return bld->one;
1775 if (!bld->type.sign) {
1776 if (a == bld->zero) {
1777 return b;
1778 }
1779 if (b == bld->zero) {
1780 return a;
1781 }
1782 }
1783 }
1784
1785 return lp_build_max_simple(bld, a, b, nan_behavior);
1786 }
1787
1788 /**
1789 * Generate clamp(a, min, max)
1790 * NaN behavior (for any of a, min, max) is undefined.
1791 * Do checks for special cases.
1792 */
1793 LLVMValueRef
1794 lp_build_clamp(struct lp_build_context *bld,
1795 LLVMValueRef a,
1796 LLVMValueRef min,
1797 LLVMValueRef max)
1798 {
1799 assert(lp_check_value(bld->type, a));
1800 assert(lp_check_value(bld->type, min));
1801 assert(lp_check_value(bld->type, max));
1802
1803 a = lp_build_min(bld, a, max);
1804 a = lp_build_max(bld, a, min);
1805 return a;
1806 }
1807
1808
1809 /**
1810 * Generate clamp(a, 0, 1)
1811 * A NaN will get converted to zero.
1812 */
1813 LLVMValueRef
1814 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1815 LLVMValueRef a)
1816 {
1817 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1818 a = lp_build_min(bld, a, bld->one);
1819 return a;
1820 }
1821
1822
1823 /**
1824 * Generate abs(a)
1825 */
1826 LLVMValueRef
1827 lp_build_abs(struct lp_build_context *bld,
1828 LLVMValueRef a)
1829 {
1830 LLVMBuilderRef builder = bld->gallivm->builder;
1831 const struct lp_type type = bld->type;
1832 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1833
1834 assert(lp_check_value(type, a));
1835
1836 if(!type.sign)
1837 return a;
1838
1839 if(type.floating) {
1840 if (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR > 6 && LLVM_VERSION_MINOR < 9) {
1841 /* Workaround llvm.org/PR27332 */
1842 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1843 unsigned long long absMask = ~(1ULL << (type.width - 1));
1844 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1845 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1846 a = LLVMBuildAnd(builder, a, mask, "");
1847 a = LLVMBuildBitCast(builder, a, vec_type, "");
1848 return a;
1849 } else {
1850 char intrinsic[32];
1851 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1852 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1853 }
1854 }
1855
1856 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1857 switch(type.width) {
1858 case 8:
1859 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1860 case 16:
1861 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1862 case 32:
1863 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1864 }
1865 }
1866 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && LLVM_VERSION_MAJOR < 6) {
1867 switch(type.width) {
1868 case 8:
1869 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1870 case 16:
1871 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1872 case 32:
1873 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1874 }
1875 }
1876
1877 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1878 a, LLVMBuildNeg(builder, a, ""));
1879 }
1880
1881
1882 LLVMValueRef
1883 lp_build_negate(struct lp_build_context *bld,
1884 LLVMValueRef a)
1885 {
1886 LLVMBuilderRef builder = bld->gallivm->builder;
1887
1888 assert(lp_check_value(bld->type, a));
1889
1890 if (bld->type.floating)
1891 a = LLVMBuildFNeg(builder, a, "");
1892 else
1893 a = LLVMBuildNeg(builder, a, "");
1894
1895 return a;
1896 }
1897
1898
1899 /** Return -1, 0 or +1 depending on the sign of a */
1900 LLVMValueRef
1901 lp_build_sgn(struct lp_build_context *bld,
1902 LLVMValueRef a)
1903 {
1904 LLVMBuilderRef builder = bld->gallivm->builder;
1905 const struct lp_type type = bld->type;
1906 LLVMValueRef cond;
1907 LLVMValueRef res;
1908
1909 assert(lp_check_value(type, a));
1910
1911 /* Handle non-zero case */
1912 if(!type.sign) {
1913 /* if not zero then sign must be positive */
1914 res = bld->one;
1915 }
1916 else if(type.floating) {
1917 LLVMTypeRef vec_type;
1918 LLVMTypeRef int_type;
1919 LLVMValueRef mask;
1920 LLVMValueRef sign;
1921 LLVMValueRef one;
1922 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1923
1924 int_type = lp_build_int_vec_type(bld->gallivm, type);
1925 vec_type = lp_build_vec_type(bld->gallivm, type);
1926 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1927
1928 /* Take the sign bit and add it to 1 constant */
1929 sign = LLVMBuildBitCast(builder, a, int_type, "");
1930 sign = LLVMBuildAnd(builder, sign, mask, "");
1931 one = LLVMConstBitCast(bld->one, int_type);
1932 res = LLVMBuildOr(builder, sign, one, "");
1933 res = LLVMBuildBitCast(builder, res, vec_type, "");
1934 }
1935 else
1936 {
1937 /* signed int/norm/fixed point */
1938 /* could use psign with sse3 and appropriate vectors here */
1939 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1940 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1941 res = lp_build_select(bld, cond, bld->one, minus_one);
1942 }
1943
1944 /* Handle zero */
1945 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1946 res = lp_build_select(bld, cond, bld->zero, res);
1947
1948 return res;
1949 }
1950
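/*
 * A worked example of the floating point path above (illustrative values,
 * not from the original source): for a = -3.5f the masked sign bit is
 * 0x80000000; OR'ing it into the bits of 1.0f (0x3f800000) gives
 * 0xbf800000, i.e. -1.0f.  The final compare against zero then fixes up
 * sgn(+/-0.0) to 0.
 */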
1951
1952 /**
1953 * Set the sign of float vector 'a' according to 'sign'.
1954 * If sign==0, return abs(a).
1955 * If sign==1, return -abs(a);
1956 * Other values for sign produce undefined results.
1957 */
1958 LLVMValueRef
1959 lp_build_set_sign(struct lp_build_context *bld,
1960 LLVMValueRef a, LLVMValueRef sign)
1961 {
1962 LLVMBuilderRef builder = bld->gallivm->builder;
1963 const struct lp_type type = bld->type;
1964 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1965 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1966 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1967 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1968 ~((unsigned long long) 1 << (type.width - 1)));
1969 LLVMValueRef val, res;
1970
1971 assert(type.floating);
1972 assert(lp_check_value(type, a));
1973
1974 /* val = reinterpret_cast<int>(a) */
1975 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1976 /* val = val & mask */
1977 val = LLVMBuildAnd(builder, val, mask, "");
1978 /* sign = sign << shift */
1979 sign = LLVMBuildShl(builder, sign, shift, "");
1980 /* res = val | sign */
1981 res = LLVMBuildOr(builder, val, sign, "");
1982 /* res = reinterpret_cast<float>(res) */
1983 res = LLVMBuildBitCast(builder, res, vec_type, "");
1984
1985 return res;
1986 }
1987
1988
1989 /**
1990 * Convert vector of (or scalar) int to vector of (or scalar) float.
1991 */
1992 LLVMValueRef
1993 lp_build_int_to_float(struct lp_build_context *bld,
1994 LLVMValueRef a)
1995 {
1996 LLVMBuilderRef builder = bld->gallivm->builder;
1997 const struct lp_type type = bld->type;
1998 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1999
2000 assert(type.floating);
2001
2002 return LLVMBuildSIToFP(builder, a, vec_type, "");
2003 }
2004
2005 static boolean
2006 arch_rounding_available(const struct lp_type type)
2007 {
2008 if ((util_cpu_caps.has_sse4_1 &&
2009 (type.length == 1 || type.width*type.length == 128)) ||
2010 (util_cpu_caps.has_avx && type.width*type.length == 256) ||
2011 (util_cpu_caps.has_avx512f && type.width*type.length == 512))
2012 return TRUE;
2013 else if ((util_cpu_caps.has_altivec &&
2014 (type.width == 32 && type.length == 4)))
2015 return TRUE;
2016 else if (util_cpu_caps.has_neon)
2017 return TRUE;
2018
2019 return FALSE;
2020 }
2021
2022 enum lp_build_round_mode
2023 {
2024 LP_BUILD_ROUND_NEAREST = 0,
2025 LP_BUILD_ROUND_FLOOR = 1,
2026 LP_BUILD_ROUND_CEIL = 2,
2027 LP_BUILD_ROUND_TRUNCATE = 3
2028 };
2029
2030 static inline LLVMValueRef
2031 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2032 LLVMValueRef a)
2033 {
2034 LLVMBuilderRef builder = bld->gallivm->builder;
2035 const struct lp_type type = bld->type;
2036 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2037 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2038 const char *intrinsic;
2039 LLVMValueRef res;
2040
2041 assert(type.floating);
2042 /* using the double precision conversions is a bit more complicated */
2043 assert(type.width == 32);
2044
2045 assert(lp_check_value(type, a));
2046 assert(util_cpu_caps.has_sse2);
2047
2048 /* This is relying on MXCSR rounding mode, which should always be nearest. */
2049 if (type.length == 1) {
2050 LLVMTypeRef vec_type;
2051 LLVMValueRef undef;
2052 LLVMValueRef arg;
2053 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2054
2055 vec_type = LLVMVectorType(bld->elem_type, 4);
2056
2057 intrinsic = "llvm.x86.sse.cvtss2si";
2058
2059 undef = LLVMGetUndef(vec_type);
2060
2061 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2062
2063 res = lp_build_intrinsic_unary(builder, intrinsic,
2064 ret_type, arg);
2065 }
2066 else {
2067 if (type.width* type.length == 128) {
2068 intrinsic = "llvm.x86.sse2.cvtps2dq";
2069 }
2070 else {
2071 assert(type.width*type.length == 256);
2072 assert(util_cpu_caps.has_avx);
2073
2074 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2075 }
2076 res = lp_build_intrinsic_unary(builder, intrinsic,
2077 ret_type, a);
2078 }
2079
2080 return res;
2081 }
2082
2083
2084 /* Round to integral value (nearest/floor/ceil/trunc according to mode)
2085  * using the AltiVec vrfin/vrfim/vrfip/vrfiz instructions. */
2086 static inline LLVMValueRef
2087 lp_build_round_altivec(struct lp_build_context *bld,
2088 LLVMValueRef a,
2089 enum lp_build_round_mode mode)
2090 {
2091 LLVMBuilderRef builder = bld->gallivm->builder;
2092 const struct lp_type type = bld->type;
2093 const char *intrinsic = NULL;
2094
2095 assert(type.floating);
2096
2097 assert(lp_check_value(type, a));
2098 assert(util_cpu_caps.has_altivec);
2099
2100 (void)type;
2101
2102 switch (mode) {
2103 case LP_BUILD_ROUND_NEAREST:
2104 intrinsic = "llvm.ppc.altivec.vrfin";
2105 break;
2106 case LP_BUILD_ROUND_FLOOR:
2107 intrinsic = "llvm.ppc.altivec.vrfim";
2108 break;
2109 case LP_BUILD_ROUND_CEIL:
2110 intrinsic = "llvm.ppc.altivec.vrfip";
2111 break;
2112 case LP_BUILD_ROUND_TRUNCATE:
2113 intrinsic = "llvm.ppc.altivec.vrfiz";
2114 break;
2115 }
2116
2117 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2118 }
2119
2120 static inline LLVMValueRef
2121 lp_build_round_arch(struct lp_build_context *bld,
2122 LLVMValueRef a,
2123 enum lp_build_round_mode mode)
2124 {
2125 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2126 LLVMBuilderRef builder = bld->gallivm->builder;
2127 const struct lp_type type = bld->type;
2128 const char *intrinsic_root;
2129 char intrinsic[32];
2130
2131 assert(type.floating);
2132 assert(lp_check_value(type, a));
2133 (void)type;
2134
2135 switch (mode) {
2136 case LP_BUILD_ROUND_NEAREST:
2137 intrinsic_root = "llvm.nearbyint";
2138 break;
2139 case LP_BUILD_ROUND_FLOOR:
2140 intrinsic_root = "llvm.floor";
2141 break;
2142 case LP_BUILD_ROUND_CEIL:
2143 intrinsic_root = "llvm.ceil";
2144 break;
2145 case LP_BUILD_ROUND_TRUNCATE:
2146 intrinsic_root = "llvm.trunc";
2147 break;
2148 }
2149
2150 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2151 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2152 }
2153 else /* (util_cpu_caps.has_altivec) */
2154 return lp_build_round_altivec(bld, a, mode);
2155 }
2156
2157 /**
2158 * Return the integer part of a float (vector) value (== round toward zero).
2159 * The returned value is a float (vector).
2160 * Ex: trunc(-1.5) = -1.0
2161 */
2162 LLVMValueRef
2163 lp_build_trunc(struct lp_build_context *bld,
2164 LLVMValueRef a)
2165 {
2166 LLVMBuilderRef builder = bld->gallivm->builder;
2167 const struct lp_type type = bld->type;
2168
2169 assert(type.floating);
2170 assert(lp_check_value(type, a));
2171
2172 if (arch_rounding_available(type)) {
2173 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2174 }
2175 else {
2176 const struct lp_type type = bld->type;
2177 struct lp_type inttype;
2178 struct lp_build_context intbld;
2179 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2180 LLVMValueRef trunc, res, anosign, mask;
2181 LLVMTypeRef int_vec_type = bld->int_vec_type;
2182 LLVMTypeRef vec_type = bld->vec_type;
2183
2184 assert(type.width == 32); /* might want to handle doubles at some point */
2185
2186 inttype = type;
2187 inttype.floating = 0;
2188 lp_build_context_init(&intbld, bld->gallivm, inttype);
2189
2190 /* round by truncation */
2191 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2192 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2193
2194 /* mask out sign bit */
2195 anosign = lp_build_abs(bld, a);
2196 /*
2197 * mask out all values if anosign > 2^24
2198 * This should work both for large ints (all rounding is no-op for them
2199 * because such floats are always exact) as well as special cases like
2200 * NaNs, Infs (taking advantage of the fact they use max exponent).
2201 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2202 */
2203 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2204 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2205 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2206 return lp_build_select(bld, mask, a, res);
2207 }
2208 }
2209
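/*
 * A scalar sketch of the fallback path above, assuming 32-bit floats
 * (helper name is illustrative only):
 *
 *    float trunc_ref(float a)
 *    {
 *       float res = (float)(int)a;            // FPToSI + SIToFP round trip
 *       return fabsf(a) > 0x1p24f ? a : res;
 *    }
 *
 * The 2^24 threshold works because a float has a 24 bit mantissa, so any
 * value with |a| >= 2^24 carries no fractional bits and is already
 * integral.  The code above does the compare on the integer bit pattern
 * rather than a float compare so that NaNs and Infs (which use the maximum
 * exponent) also take the "pass a through unchanged" path.
 */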
2210
2211 /**
2212 * Return float (vector) rounded to nearest integer (vector). The returned
2213 * value is a float (vector).
2214 * Ex: round(0.9) = 1.0
2215 * Ex: round(-1.5) = -2.0
2216 */
2217 LLVMValueRef
2218 lp_build_round(struct lp_build_context *bld,
2219 LLVMValueRef a)
2220 {
2221 LLVMBuilderRef builder = bld->gallivm->builder;
2222 const struct lp_type type = bld->type;
2223
2224 assert(type.floating);
2225 assert(lp_check_value(type, a));
2226
2227 if (arch_rounding_available(type)) {
2228 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2229 }
2230 else {
2231 const struct lp_type type = bld->type;
2232 struct lp_type inttype;
2233 struct lp_build_context intbld;
2234 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2235 LLVMValueRef res, anosign, mask;
2236 LLVMTypeRef int_vec_type = bld->int_vec_type;
2237 LLVMTypeRef vec_type = bld->vec_type;
2238
2239 assert(type.width == 32); /* might want to handle doubles at some point */
2240
2241 inttype = type;
2242 inttype.floating = 0;
2243 lp_build_context_init(&intbld, bld->gallivm, inttype);
2244
2245 res = lp_build_iround(bld, a);
2246 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2247
2248 /* mask out sign bit */
2249 anosign = lp_build_abs(bld, a);
2250 /*
2251 * mask out all values if anosign > 2^24
2252 * This should work both for large ints (all rounding is no-op for them
2253 * because such floats are always exact) as well as special cases like
2254 * NaNs, Infs (taking advantage of the fact they use max exponent).
2255 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2256 */
2257 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2258 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2259 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2260 return lp_build_select(bld, mask, a, res);
2261 }
2262 }
2263
2264
2265 /**
2266 * Return floor of float (vector), result is a float (vector)
2267 * Ex: floor(1.1) = 1.0
2268 * Ex: floor(-1.1) = -2.0
2269 */
2270 LLVMValueRef
2271 lp_build_floor(struct lp_build_context *bld,
2272 LLVMValueRef a)
2273 {
2274 LLVMBuilderRef builder = bld->gallivm->builder;
2275 const struct lp_type type = bld->type;
2276
2277 assert(type.floating);
2278 assert(lp_check_value(type, a));
2279
2280 if (arch_rounding_available(type)) {
2281 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2282 }
2283 else {
2284 const struct lp_type type = bld->type;
2285 struct lp_type inttype;
2286 struct lp_build_context intbld;
2287 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2288 LLVMValueRef trunc, res, anosign, mask;
2289 LLVMTypeRef int_vec_type = bld->int_vec_type;
2290 LLVMTypeRef vec_type = bld->vec_type;
2291
2292 if (type.width != 32) {
2293 char intrinsic[32];
2294 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2295 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2296 }
2297
2298 assert(type.width == 32); /* might want to handle doubles at some point */
2299
2300 inttype = type;
2301 inttype.floating = 0;
2302 lp_build_context_init(&intbld, bld->gallivm, inttype);
2303
2304 /* round by truncation */
2305 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2306 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2307
2308 if (type.sign) {
2309 LLVMValueRef tmp;
2310
2311 /*
2312 * fix values if rounding is wrong (for non-special cases)
2313 * - this is the case if trunc > a
2314 */
2315 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2316 /* tmp = trunc > a ? 1.0 : 0.0 */
2317 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2318 tmp = lp_build_and(&intbld, mask, tmp);
2319 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2320 res = lp_build_sub(bld, res, tmp);
2321 }
2322
2323 /* mask out sign bit */
2324 anosign = lp_build_abs(bld, a);
2325 /*
2326 * mask out all values if anosign > 2^24
2327 * This should work both for large ints (all rounding is no-op for them
2328 * because such floats are always exact) as well as special cases like
2329 * NaNs, Infs (taking advantage of the fact they use max exponent).
2330 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2331 */
2332 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2333 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2334 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2335 return lp_build_select(bld, mask, a, res);
2336 }
2337 }
2338
2339
2340 /**
2341 * Return ceiling of float (vector), returning float (vector).
2342 * Ex: ceil( 1.1) = 2.0
2343 * Ex: ceil(-1.1) = -1.0
2344 */
2345 LLVMValueRef
2346 lp_build_ceil(struct lp_build_context *bld,
2347 LLVMValueRef a)
2348 {
2349 LLVMBuilderRef builder = bld->gallivm->builder;
2350 const struct lp_type type = bld->type;
2351
2352 assert(type.floating);
2353 assert(lp_check_value(type, a));
2354
2355 if (arch_rounding_available(type)) {
2356 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2357 }
2358 else {
2359 const struct lp_type type = bld->type;
2360 struct lp_type inttype;
2361 struct lp_build_context intbld;
2362 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2363 LLVMValueRef trunc, res, anosign, mask, tmp;
2364 LLVMTypeRef int_vec_type = bld->int_vec_type;
2365 LLVMTypeRef vec_type = bld->vec_type;
2366
2367 if (type.width != 32) {
2368 char intrinsic[32];
2369 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2370 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2371 }
2372
2373 assert(type.width == 32); /* might want to handle doubles at some point */
2374
2375 inttype = type;
2376 inttype.floating = 0;
2377 lp_build_context_init(&intbld, bld->gallivm, inttype);
2378
2379 /* round by truncation */
2380 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2381 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2382
2383 /*
2384 * fix values if rounding is wrong (for non-special cases)
2385 * - this is the case if trunc < a
2386 */
2387 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2388 /* tmp = trunc < a ? 1.0 : 0.0 */
2389 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2390 tmp = lp_build_and(&intbld, mask, tmp);
2391 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2392 res = lp_build_add(bld, trunc, tmp);
2393
2394 /* mask out sign bit */
2395 anosign = lp_build_abs(bld, a);
2396 /*
2397 * mask out all values if anosign > 2^24
2398 * This should work both for large ints (all rounding is no-op for them
2399 * because such floats are always exact) as well as special cases like
2400 * NaNs, Infs (taking advantage of the fact they use max exponent).
2401 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2402 */
2403 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2404 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2405 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2406 return lp_build_select(bld, mask, a, res);
2407 }
2408 }
2409
2410
2411 /**
2412 * Return fractional part of 'a' computed as a - floor(a)
2413 * Typically used in texture coord arithmetic.
2414 */
2415 LLVMValueRef
2416 lp_build_fract(struct lp_build_context *bld,
2417 LLVMValueRef a)
2418 {
2419 assert(bld->type.floating);
2420 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2421 }
2422
2423
2424 /**
2425 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2426 * against 0.99999(9). (Will also return that value for NaNs.)
2427 */
2428 static inline LLVMValueRef
2429 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2430 {
2431 LLVMValueRef max;
2432
2433 /* this is the largest number smaller than 1.0 representable as float */
2434 max = lp_build_const_vec(bld->gallivm, bld->type,
2435 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2436 return lp_build_min_ext(bld, fract, max,
2437 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2438 }
2439
2440
2441 /**
2442 * Same as lp_build_fract, but guarantees that the result is always smaller
2443 * than one. Will also return the smaller-than-one value for infs, NaNs.
2444 */
2445 LLVMValueRef
2446 lp_build_fract_safe(struct lp_build_context *bld,
2447 LLVMValueRef a)
2448 {
2449 return clamp_fract(bld, lp_build_fract(bld, a));
2450 }
2451
2452
2453 /**
2454 * Return the integer part of a float (vector) value (== round toward zero).
2455 * The returned value is an integer (vector).
2456 * Ex: itrunc(-1.5) = -1
2457 */
2458 LLVMValueRef
2459 lp_build_itrunc(struct lp_build_context *bld,
2460 LLVMValueRef a)
2461 {
2462 LLVMBuilderRef builder = bld->gallivm->builder;
2463 const struct lp_type type = bld->type;
2464 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2465
2466 assert(type.floating);
2467 assert(lp_check_value(type, a));
2468
2469 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2470 }
2471
2472
2473 /**
2474 * Return float (vector) rounded to nearest integer (vector). The returned
2475 * value is an integer (vector).
2476 * Ex: iround(0.9) = 1
2477 * Ex: iround(-1.5) = -2
2478 */
2479 LLVMValueRef
2480 lp_build_iround(struct lp_build_context *bld,
2481 LLVMValueRef a)
2482 {
2483 LLVMBuilderRef builder = bld->gallivm->builder;
2484 const struct lp_type type = bld->type;
2485 LLVMTypeRef int_vec_type = bld->int_vec_type;
2486 LLVMValueRef res;
2487
2488 assert(type.floating);
2489
2490 assert(lp_check_value(type, a));
2491
2492 if ((util_cpu_caps.has_sse2 &&
2493 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2494 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2495 return lp_build_iround_nearest_sse2(bld, a);
2496 }
2497 if (arch_rounding_available(type)) {
2498 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2499 }
2500 else {
2501 LLVMValueRef half;
2502
2503 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2504
2505 if (type.sign) {
2506 LLVMTypeRef vec_type = bld->vec_type;
2507 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2508 (unsigned long long)1 << (type.width - 1));
2509 LLVMValueRef sign;
2510
2511 /* get sign bit */
2512 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2513 sign = LLVMBuildAnd(builder, sign, mask, "");
2514
2515 /* sign * 0.5 */
2516 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2517 half = LLVMBuildOr(builder, sign, half, "");
2518 half = LLVMBuildBitCast(builder, half, vec_type, "");
2519 }
2520
2521 res = LLVMBuildFAdd(builder, a, half, "");
2522 }
2523
2524 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2525
2526 return res;
2527 }
2528
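/*
 * A worked example of the fallback path above (illustrative values): for
 * a = -2.7f the sign bit of a is OR'd into half, giving about -0.49999997,
 * so a + half is about -3.1999999 and the final FPToSI truncation yields
 * -3.  nextafterf(0.5, 0.0) is used instead of 0.5 so that inputs just
 * below 0.5 (e.g. 0.49999997f) are not pushed up to 1.0 by the addition.
 */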
2529
2530 /**
2531 * Return floor of float (vector), result is an int (vector)
2532 * Ex: ifloor(1.1) = 1
2533 * Ex: ifloor(-1.1) = -2
2534 */
2535 LLVMValueRef
2536 lp_build_ifloor(struct lp_build_context *bld,
2537 LLVMValueRef a)
2538 {
2539 LLVMBuilderRef builder = bld->gallivm->builder;
2540 const struct lp_type type = bld->type;
2541 LLVMTypeRef int_vec_type = bld->int_vec_type;
2542 LLVMValueRef res;
2543
2544 assert(type.floating);
2545 assert(lp_check_value(type, a));
2546
2547 res = a;
2548 if (type.sign) {
2549 if (arch_rounding_available(type)) {
2550 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2551 }
2552 else {
2553 struct lp_type inttype;
2554 struct lp_build_context intbld;
2555 LLVMValueRef trunc, itrunc, mask;
2556
2557 assert(type.floating);
2558 assert(lp_check_value(type, a));
2559
2560 inttype = type;
2561 inttype.floating = 0;
2562 lp_build_context_init(&intbld, bld->gallivm, inttype);
2563
2564 /* round by truncation */
2565 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2566 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2567
2568 /*
2569 * fix values if rounding is wrong (for non-special cases)
2570 * - this is the case if trunc > a
2571 * The results of doing this with NaNs, very large values etc.
2572 * are undefined but this seems to be the case anyway.
2573 */
2574 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2575 /* cheapie minus one with mask since the mask is minus one / zero */
2576 return lp_build_add(&intbld, itrunc, mask);
2577 }
2578 }
2579
2580 /* convert to int, truncating (value is non-negative or already integral) */
2581 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2582
2583 return res;
2584 }
2585
2586
2587 /**
2588 * Return ceiling of float (vector), returning int (vector).
2589 * Ex: iceil( 1.1) = 2
2590 * Ex: iceil(-1.1) = -1
2591 */
2592 LLVMValueRef
2593 lp_build_iceil(struct lp_build_context *bld,
2594 LLVMValueRef a)
2595 {
2596 LLVMBuilderRef builder = bld->gallivm->builder;
2597 const struct lp_type type = bld->type;
2598 LLVMTypeRef int_vec_type = bld->int_vec_type;
2599 LLVMValueRef res;
2600
2601 assert(type.floating);
2602 assert(lp_check_value(type, a));
2603
2604 if (arch_rounding_available(type)) {
2605 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2606 }
2607 else {
2608 struct lp_type inttype;
2609 struct lp_build_context intbld;
2610 LLVMValueRef trunc, itrunc, mask;
2611
2612 assert(type.floating);
2613 assert(lp_check_value(type, a));
2614
2615 inttype = type;
2616 inttype.floating = 0;
2617 lp_build_context_init(&intbld, bld->gallivm, inttype);
2618
2619 /* round by truncation */
2620 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2621 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2622
2623 /*
2624 * fix values if rounding is wrong (for non-special cases)
2625 * - this is the case if trunc < a
2626 * The results of doing this with NaNs, very large values etc.
2627 * are undefined but this seems to be the case anyway.
2628 */
2629 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2630 /* cheapie plus one with mask since the mask is minus one / zero */
2631 return lp_build_sub(&intbld, itrunc, mask);
2632 }
2633
2634 /* convert to int, truncating (value is already integral here) */
2635 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2636
2637 return res;
2638 }
2639
2640
2641 /**
2642 * Combined ifloor() & fract().
2643 *
2644 * Preferred to calling the functions separately, as it will ensure that the
2645 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2646 */
2647 void
2648 lp_build_ifloor_fract(struct lp_build_context *bld,
2649 LLVMValueRef a,
2650 LLVMValueRef *out_ipart,
2651 LLVMValueRef *out_fpart)
2652 {
2653 LLVMBuilderRef builder = bld->gallivm->builder;
2654 const struct lp_type type = bld->type;
2655 LLVMValueRef ipart;
2656
2657 assert(type.floating);
2658 assert(lp_check_value(type, a));
2659
2660 if (arch_rounding_available(type)) {
2661 /*
2662 * floor() is easier.
2663 */
2664
2665 ipart = lp_build_floor(bld, a);
2666 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2667 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2668 }
2669 else {
2670 /*
2671 * ifloor() is easier.
2672 */
2673
2674 *out_ipart = lp_build_ifloor(bld, a);
2675 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2676 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2677 }
2678 }
2679
2680
2681 /**
2682 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2683 * always smaller than one.
2684 */
2685 void
2686 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2687 LLVMValueRef a,
2688 LLVMValueRef *out_ipart,
2689 LLVMValueRef *out_fpart)
2690 {
2691 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2692 *out_fpart = clamp_fract(bld, *out_fpart);
2693 }
2694
2695
2696 LLVMValueRef
2697 lp_build_sqrt(struct lp_build_context *bld,
2698 LLVMValueRef a)
2699 {
2700 LLVMBuilderRef builder = bld->gallivm->builder;
2701 const struct lp_type type = bld->type;
2702 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2703 char intrinsic[32];
2704
2705 assert(lp_check_value(type, a));
2706
2707 assert(type.floating);
2708 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2709
2710 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2711 }
2712
2713
2714 /**
2715 * Do one Newton-Raphson step to improve reciprocal precision:
2716 *
2717 * x_{i+1} = x_i + x_i * (1 - a * x_i)
2718 *
2719 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2720 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2721 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2722 * halo. It would be necessary to clamp the argument to prevent this.
2723 *
2724 * See also:
2725 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2726 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2727 */
2728 static inline LLVMValueRef
2729 lp_build_rcp_refine(struct lp_build_context *bld,
2730 LLVMValueRef a,
2731 LLVMValueRef rcp_a)
2732 {
2733 LLVMBuilderRef builder = bld->gallivm->builder;
2734 LLVMValueRef neg_a;
2735 LLVMValueRef res;
2736
2737 neg_a = LLVMBuildFNeg(builder, a, "");
2738 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2739 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2740
2741 return res;
2742 }
2743
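/*
 * A worked example of the refinement step above (illustrative values):
 * approximating 1/3 from a typical ~12-bit RCPPS estimate x_0 = 0.333252,
 *
 *    x_1 = x_0 + x_0 * (1 - 3 * x_0) ~= 0.33333331
 *
 * so one step roughly doubles the number of correct bits, which is why a
 * single iteration typically suffices for single precision.
 */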
2744
2745 LLVMValueRef
2746 lp_build_rcp(struct lp_build_context *bld,
2747 LLVMValueRef a)
2748 {
2749 LLVMBuilderRef builder = bld->gallivm->builder;
2750 const struct lp_type type = bld->type;
2751
2752 assert(lp_check_value(type, a));
2753
2754 if(a == bld->zero)
2755 return bld->undef;
2756 if(a == bld->one)
2757 return bld->one;
2758 if(a == bld->undef)
2759 return bld->undef;
2760
2761 assert(type.floating);
2762
2763 if(LLVMIsConstant(a))
2764 return LLVMConstFDiv(bld->one, a);
2765
2766 /*
2767 * We don't use RCPPS because:
2768 * - it only has 10bits of precision
2769 * - it doesn't even get the reciprocal of 1.0 exactly
2770 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2771 * - for recent processors the benefit over DIVPS is marginal and case
2772 * dependent
2773 *
2774 * We could still use it on certain processors if benchmarks show that the
2775 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2776 * particular uses that require less workarounds.
2777 */
2778
2779 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2780 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2781 const unsigned num_iterations = 0;
2782 LLVMValueRef res;
2783 unsigned i;
2784 const char *intrinsic = NULL;
2785
2786 if (type.length == 4) {
2787 intrinsic = "llvm.x86.sse.rcp.ps";
2788 }
2789 else {
2790 intrinsic = "llvm.x86.avx.rcp.ps.256";
2791 }
2792
2793 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2794
2795 for (i = 0; i < num_iterations; ++i) {
2796 res = lp_build_rcp_refine(bld, a, res);
2797 }
2798
2799 return res;
2800 }
2801
2802 return LLVMBuildFDiv(builder, bld->one, a, "");
2803 }
2804
2805
2806 /**
2807 * Do one Newton-Raphson step to improve rsqrt precision:
2808 *
2809 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2810 *
2811 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2812 */
2813 static inline LLVMValueRef
2814 lp_build_rsqrt_refine(struct lp_build_context *bld,
2815 LLVMValueRef a,
2816 LLVMValueRef rsqrt_a)
2817 {
2818 LLVMBuilderRef builder = bld->gallivm->builder;
2819 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2820 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2821 LLVMValueRef res;
2822
2823 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2824 res = LLVMBuildFMul(builder, a, res, "");
2825 res = LLVMBuildFSub(builder, three, res, "");
2826 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2827 res = LLVMBuildFMul(builder, half, res, "");
2828
2829 return res;
2830 }
2831
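/*
 * A worked example of the refinement step above (illustrative values):
 * for a = 4.0 and a rough estimate x_0 = 0.51,
 *
 *    x_1 = 0.5 * 0.51 * (3.0 - 4.0 * 0.51 * 0.51) ~= 0.4997
 *
 * which moves the error from about 1e-2 to about 3e-4; each step roughly
 * doubles the number of correct bits.
 */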
2832
2833 /**
2834 * Generate 1/sqrt(a).
2835 * Result is undefined for values < 0, infinity for +0.
2836 */
2837 LLVMValueRef
2838 lp_build_rsqrt(struct lp_build_context *bld,
2839 LLVMValueRef a)
2840 {
2841 const struct lp_type type = bld->type;
2842
2843 assert(lp_check_value(type, a));
2844
2845 assert(type.floating);
2846
2847 /*
2848 * This should be faster but all denormals will end up as infinity.
2849 */
2850 if (0 && lp_build_fast_rsqrt_available(type)) {
2851 const unsigned num_iterations = 1;
2852 LLVMValueRef res;
2853 unsigned i;
2854
2855 /* rsqrt(1.0) != 1.0 here */
2856 res = lp_build_fast_rsqrt(bld, a);
2857
2858 if (num_iterations) {
2859 /*
2860 * Newton-Raphson will result in NaN instead of infinity for zero,
2861 * and NaN instead of zero for infinity.
2862 * Also, need to ensure rsqrt(1.0) == 1.0.
2863 * All numbers smaller than FLT_MIN will result in +infinity
2864 * (rsqrtps treats all denormals as zero).
2865 */
2866 LLVMValueRef cmp;
2867 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2868 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2869
2870 for (i = 0; i < num_iterations; ++i) {
2871 res = lp_build_rsqrt_refine(bld, a, res);
2872 }
2873 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2874 res = lp_build_select(bld, cmp, inf, res);
2875 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2876 res = lp_build_select(bld, cmp, bld->zero, res);
2877 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2878 res = lp_build_select(bld, cmp, bld->one, res);
2879 }
2880
2881 return res;
2882 }
2883
2884 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2885 }
2886
2887 /**
2888 * Return whether a fast (but inaccurate) rsqrt instruction is available.
2889 * (The caller may want to avoid calling rsqrt_fast if it's not available;
2890 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2891 * unavailable this would result in sqrt/div/mul, so it is obviously much
2892 * better to just call sqrt, skipping both div and mul.)
2893 */
2894 boolean
2895 lp_build_fast_rsqrt_available(struct lp_type type)
2896 {
2897 assert(type.floating);
2898
2899 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2900 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2901 return true;
2902 }
2903 return false;
2904 }
2905
2906
2907 /**
2908 * Generate 1/sqrt(a).
2909 * Result is undefined for values < 0, infinity for +0.
2910 * Precision is limited, only ~10 bits guaranteed
2911 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2912 */
2913 LLVMValueRef
2914 lp_build_fast_rsqrt(struct lp_build_context *bld,
2915 LLVMValueRef a)
2916 {
2917 LLVMBuilderRef builder = bld->gallivm->builder;
2918 const struct lp_type type = bld->type;
2919
2920 assert(lp_check_value(type, a));
2921
2922 if (lp_build_fast_rsqrt_available(type)) {
2923 const char *intrinsic = NULL;
2924
2925 if (type.length == 4) {
2926 intrinsic = "llvm.x86.sse.rsqrt.ps";
2927 }
2928 else {
2929 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2930 }
2931 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2932 }
2933 else {
2934 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2935 }
2936 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2937 }
2938
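/*
 * A usage sketch of the two helpers above (hypothetical caller, not part
 * of this file): computing sqrt(x) as x * rsqrt_fast(x) only when the
 * fast instruction exists, otherwise calling the precise sqrt directly.
 *
 *    static LLVMValueRef
 *    example_sqrt_via_rsqrt(struct lp_build_context *bld, LLVMValueRef x)
 *    {
 *       if (lp_build_fast_rsqrt_available(bld->type)) {
 *          // one mul instead of sqrt + div + mul
 *          return lp_build_mul(bld, x, lp_build_fast_rsqrt(bld, x));
 *       }
 *       return lp_build_sqrt(bld, x);
 *    }
 */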
2939
2940 /**
2941 * Generate sin(a) or cos(a) using polynomial approximation.
2942 * TODO: it might be worth recognizing sin and cos with the same source
2943 * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2944 * much cheaper than calculating (nearly) everything twice.
2945 * It's unclear whether that's common enough to be worth bothering with;
2946 * the scs opcode could also benefit from calculating both, though.
2947 */
2948 static LLVMValueRef
2949 lp_build_sin_or_cos(struct lp_build_context *bld,
2950 LLVMValueRef a,
2951 boolean cos)
2952 {
2953 struct gallivm_state *gallivm = bld->gallivm;
2954 LLVMBuilderRef b = gallivm->builder;
2955 struct lp_type int_type = lp_int_type(bld->type);
2956
2957 /*
2958 * take the absolute value,
2959 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2960 */
2961
2962 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2963 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2964
2965 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2966 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2967
2968 /*
2969 * scale by 4/Pi
2970 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2971 */
2972
2973 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2974 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2975
2976 /*
2977 * store the integer part of y in mm0
2978 * emm2 = _mm_cvttps_epi32(y);
2979 */
2980
2981 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2982
2983 /*
2984 * j=(j+1) & (~1) (see the cephes sources)
2985 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2986 */
2987
2988 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2989 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2990 /*
2991 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2992 */
2993 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2994 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2995
2996 /*
2997 * y = _mm_cvtepi32_ps(emm2);
2998 */
2999 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
3000
3001 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
3002 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
3003 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
3004 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
3005
3006 /*
3007 * Argument used for poly selection and sign bit determination
3008 * is different for sin vs. cos.
3009 */
3010 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
3011 emm2_and;
3012
3013 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
3014 LLVMBuildNot(b, emm2_2, ""), ""),
3015 const_29, "sign_bit") :
3016 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
3017 LLVMBuildShl(b, emm2_add,
3018 const_29, ""), ""),
3019 sign_mask, "sign_bit");
3020
3021 /*
3022 * get the polynomial selection mask
3023 * there is one polynomial for 0 <= x <= Pi/4
3024 * and another one for Pi/4 < x <= Pi/2
3025 * Both branches will be computed.
3026 *
3027 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3028 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3029 */
3030
3031 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3032 LLVMValueRef poly_mask = lp_build_compare(gallivm,
3033 int_type, PIPE_FUNC_EQUAL,
3034 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3035
3036 /*
3037 * _PS_CONST(minus_cephes_DP1, -0.78515625);
3038 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3039 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3040 */
3041 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3042 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3043 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3044
3045 /*
3046 * The magic pass: "Extended precision modular arithmetic"
3047 * x = ((x - y * DP1) - y * DP2) - y * DP3;
3048 */
3049 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3050 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3051 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3052
3053 /*
3054 * Evaluate the first polynomial (0 <= x <= Pi/4)
3055 *
3056 * z = _mm_mul_ps(x,x);
3057 */
3058 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3059
3060 /*
3061 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3062 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3063 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3064 */
3065 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3066 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3067 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3068
3069 /*
3070 * y = *(v4sf*)_ps_coscof_p0;
3071 * y = _mm_mul_ps(y, z);
3072 */
3073 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3074 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3075 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3076 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3077
3078
3079 /*
3080 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3081 * y = _mm_sub_ps(y, tmp);
3082 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3083 */
3084 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3085 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3086 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3087 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3088 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3089
3090 /*
3091 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3092 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3093 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3094 */
3095 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3096 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3097 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3098
3099 /*
3100 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
3101 *
3102 * y2 = *(v4sf*)_ps_sincof_p0;
3103 * y2 = _mm_mul_ps(y2, z);
3104 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3105 * y2 = _mm_mul_ps(y2, z);
3106 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3107 * y2 = _mm_mul_ps(y2, z);
3108 * y2 = _mm_mul_ps(y2, x);
3109 * y2 = _mm_add_ps(y2, x);
3110 */
3111
3112 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3113 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3114 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3115 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3116
3117 /*
3118 * select the correct result from the two polynomials
3119 * xmm3 = poly_mask;
3120 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3121 * y = _mm_andnot_ps(xmm3, y);
3122 * y = _mm_or_ps(y,y2);
3123 */
3124 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3125 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3126 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3127 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3128 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3129 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3130
3131 /*
3132 * update the sign
3133 * y = _mm_xor_ps(y, sign_bit);
3134 */
3135 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3136 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3137
3138 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3139
3140 /* clamp output to be within [-1, 1] */
3141 y_result = lp_build_clamp(bld, y_result,
3142 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3143 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3144 /* If a is -inf, inf or NaN then return NaN */
3145 y_result = lp_build_select(bld, isfinite, y_result,
3146 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3147 return y_result;
3148 }
3149
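/*
 * A worked example of the range reduction above, for sin with a = 2.0
 * radians (illustrative values):
 *
 *    scale_y = 2.0 * 4/Pi               ~= 2.546
 *    emm2    = ((int)scale_y + 1) & ~1   = 2
 *    x       = a - emm2 * Pi/4           ~= 0.4292   (computed in three
 *                                                     steps via DP1..DP3)
 *
 * Since emm2 & 2 is non-zero the cosine polynomial is selected, and the
 * sign computation leaves the sign bit clear, so the result is
 * cos(2.0 - Pi/2) ~= 0.9093, which equals sin(2.0).
 */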
3150
3151 /**
3152 * Generate sin(a)
3153 */
3154 LLVMValueRef
3155 lp_build_sin(struct lp_build_context *bld,
3156 LLVMValueRef a)
3157 {
3158 return lp_build_sin_or_cos(bld, a, FALSE);
3159 }
3160
3161
3162 /**
3163 * Generate cos(a)
3164 */
3165 LLVMValueRef
3166 lp_build_cos(struct lp_build_context *bld,
3167 LLVMValueRef a)
3168 {
3169 return lp_build_sin_or_cos(bld, a, TRUE);
3170 }
3171
3172
3173 /**
3174 * Generate pow(x, y)
3175 */
3176 LLVMValueRef
3177 lp_build_pow(struct lp_build_context *bld,
3178 LLVMValueRef x,
3179 LLVMValueRef y)
3180 {
3181 /* TODO: optimize the constant case */
3182 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3183 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3184 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3185 __FUNCTION__);
3186 }
3187
3188 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3189 }
3190
3191
3192 /**
3193 * Generate exp(x)
3194 */
3195 LLVMValueRef
3196 lp_build_exp(struct lp_build_context *bld,
3197 LLVMValueRef x)
3198 {
3199 /* log2(e) = 1/log(2) */
3200 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3201 1.4426950408889634);
3202
3203 assert(lp_check_value(bld->type, x));
3204
3205 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3206 }
3207
3208
3209 /**
3210 * Generate log(x)
3211 * Behavior is undefined with infs, 0s and nans
3212 */
3213 LLVMValueRef
3214 lp_build_log(struct lp_build_context *bld,
3215 LLVMValueRef x)
3216 {
3217 /* log(2) */
3218 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3219 0.69314718055994529);
3220
3221 assert(lp_check_value(bld->type, x));
3222
3223 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3224 }
3225
3226 /**
3227 * Generate log(x) that handles edge cases (infs, 0s and nans)
3228 */
3229 LLVMValueRef
3230 lp_build_log_safe(struct lp_build_context *bld,
3231 LLVMValueRef x)
3232 {
3233 /* log(2) */
3234 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3235 0.69314718055994529);
3236
3237 assert(lp_check_value(bld->type, x));
3238
3239 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3240 }
3241
3242
3243 /**
3244 * Generate polynomial.
3245 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3246 */
3247 LLVMValueRef
3248 lp_build_polynomial(struct lp_build_context *bld,
3249 LLVMValueRef x,
3250 const double *coeffs,
3251 unsigned num_coeffs)
3252 {
3253 const struct lp_type type = bld->type;
3254 LLVMValueRef even = NULL, odd = NULL;
3255 LLVMValueRef x2;
3256 unsigned i;
3257
3258 assert(lp_check_value(bld->type, x));
3259
3260 /* TODO: optimize the constant case */
3261 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3262 LLVMIsConstant(x)) {
3263 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3264 __FUNCTION__);
3265 }
3266
3267 /*
3268 * Calculate odd and even terms separately to decrease data dependency
3269 * Ex:
3270 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3271 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3272 */
3273 x2 = lp_build_mul(bld, x, x);
3274
3275 for (i = num_coeffs; i--; ) {
3276 LLVMValueRef coeff;
3277
3278 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3279
3280 if (i % 2 == 0) {
3281 if (even)
3282 even = lp_build_mad(bld, x2, even, coeff);
3283 else
3284 even = coeff;
3285 } else {
3286 if (odd)
3287 odd = lp_build_mad(bld, x2, odd, coeff);
3288 else
3289 odd = coeff;
3290 }
3291 }
3292
3293 if (odd)
3294 return lp_build_mad(bld, odd, x, even);
3295 else if (even)
3296 return even;
3297 else
3298 return bld->undef;
3299 }
3300
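/*
 * A worked expansion of the odd/even split above for num_coeffs == 4,
 * with coefficients c0..c3 (illustrative only):
 *
 *    even = c0 + x2 * c2
 *    odd  = c1 + x2 * c3
 *    res  = even + x * odd  =  c0 + c1*x + c2*x^2 + c3*x^3
 *
 * Compared with a plain Horner evaluation the even and odd chains can
 * issue in parallel, shortening the dependency chain for the higher
 * degree polynomials used by exp2/log2.
 */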
3301
3302 /**
3303 * Minimax polynomial fit of 2**x, in range [0, 1[
3304 */
3305 const double lp_build_exp2_polynomial[] = {
3306 #if EXP_POLY_DEGREE == 5
3307 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3308 0.693153073200168932794,
3309 0.240153617044375388211,
3310 0.0558263180532956664775,
3311 0.00898934009049466391101,
3312 0.00187757667519147912699
3313 #elif EXP_POLY_DEGREE == 4
3314 1.00000259337069434683,
3315 0.693003834469974940458,
3316 0.24144275689150793076,
3317 0.0520114606103070150235,
3318 0.0135341679161270268764
3319 #elif EXP_POLY_DEGREE == 3
3320 0.999925218562710312959,
3321 0.695833540494823811697,
3322 0.226067155427249155588,
3323 0.0780245226406372992967
3324 #elif EXP_POLY_DEGREE == 2
3325 1.00172476321474503578,
3326 0.657636275736077639316,
3327 0.33718943461968720704
3328 #else
3329 #error
3330 #endif
3331 };
3332
3333
3334 LLVMValueRef
3335 lp_build_exp2(struct lp_build_context *bld,
3336 LLVMValueRef x)
3337 {
3338 LLVMBuilderRef builder = bld->gallivm->builder;
3339 const struct lp_type type = bld->type;
3340 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3341 LLVMValueRef ipart = NULL;
3342 LLVMValueRef fpart = NULL;
3343 LLVMValueRef expipart = NULL;
3344 LLVMValueRef expfpart = NULL;
3345 LLVMValueRef res = NULL;
3346
3347 assert(lp_check_value(bld->type, x));
3348
3349 /* TODO: optimize the constant case */
3350 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3351 LLVMIsConstant(x)) {
3352 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3353 __FUNCTION__);
3354 }
3355
3356 assert(type.floating && type.width == 32);
3357
3358 /* We want to preserve NaN and make sure that for exp2, if x > 128
3359 * the result is INF, and if it's smaller than -126.9 the result is 0 */
3360 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3361 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3362 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3363 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3364
3365 /* ipart = floor(x) */
3366 /* fpart = x - ipart */
3367 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3368
3369 /* expipart = (float) (1 << ipart) */
3370 expipart = LLVMBuildAdd(builder, ipart,
3371 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3372 expipart = LLVMBuildShl(builder, expipart,
3373 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3374 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3375
3376 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3377 ARRAY_SIZE(lp_build_exp2_polynomial));
3378
3379 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3380
3381 return res;
3382 }
3383
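/*
 * A worked example of the decomposition above (illustrative values): for
 * x = 3.25, lp_build_ifloor_fract() gives ipart = 3 and fpart = 0.25.
 *
 *    expipart = bitcast((3 + 127) << 23) = 0x41000000 = 8.0  (i.e. 2^ipart,
 *               built by writing the biased exponent field directly)
 *    expfpart = polynomial(0.25)        ~= 1.1892            (i.e. 2^0.25)
 *    res      = 8.0 * 1.1892            ~= 9.5137            (i.e. 2^3.25)
 */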
3384
3385
3386 /**
3387 * Extract the exponent of an IEEE-754 floating point value.
3388 *
3389 * Optionally apply an integer bias.
3390 *
3391 * Result is an integer value with
3392 *
3393 * ifloor(log2(x)) + bias
3394 */
3395 LLVMValueRef
3396 lp_build_extract_exponent(struct lp_build_context *bld,
3397 LLVMValueRef x,
3398 int bias)
3399 {
3400 LLVMBuilderRef builder = bld->gallivm->builder;
3401 const struct lp_type type = bld->type;
3402 unsigned mantissa = lp_mantissa(type);
3403 LLVMValueRef res;
3404
3405 assert(type.floating);
3406
3407 assert(lp_check_value(bld->type, x));
3408
3409 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3410
3411 res = LLVMBuildLShr(builder, x,
3412 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3413 res = LLVMBuildAnd(builder, res,
3414 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3415 res = LLVMBuildSub(builder, res,
3416 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3417
3418 return res;
3419 }
3420
3421
3422 /**
3423 * Extract the mantissa of a floating point value.
3424 *
3425 * Result is a floating point value with
3426 *
3427 * x / 2**floor(log2(x))
3428 */
3429 LLVMValueRef
3430 lp_build_extract_mantissa(struct lp_build_context *bld,
3431 LLVMValueRef x)
3432 {
3433 LLVMBuilderRef builder = bld->gallivm->builder;
3434 const struct lp_type type = bld->type;
3435 unsigned mantissa = lp_mantissa(type);
3436 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3437 (1ULL << mantissa) - 1);
3438 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3439 LLVMValueRef res;
3440
3441 assert(lp_check_value(bld->type, x));
3442
3443 assert(type.floating);
3444
3445 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3446
3447 /* res = x / 2**ipart */
3448 res = LLVMBuildAnd(builder, x, mantmask, "");
3449 res = LLVMBuildOr(builder, res, one, "");
3450 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3451
3452 return res;
3453 }
3454
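/*
 * A worked example for the two helpers above (illustrative values):
 * x = 6.0f has the bit pattern 0x40c00000, i.e. biased exponent 129 and
 * mantissa bits 0x400000, so
 *
 *    lp_build_extract_exponent(bld, x, 0)  ->  129 - 127 = 2
 *    lp_build_extract_mantissa(bld, x)     ->  6.0 / 2^2 = 1.5
 *
 * and x == extract_mantissa(x) * 2^extract_exponent(x) for any normalized
 * positive x.
 */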
3455
3456
3457 /**
3458 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3459 * These coefficients can be generated with
3460 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3461 */
3462 const double lp_build_log2_polynomial[] = {
3463 #if LOG_POLY_DEGREE == 5
3464 2.88539008148777786488L,
3465 0.961796878841293367824L,
3466 0.577058946784739859012L,
3467 0.412914355135828735411L,
3468 0.308591899232910175289L,
3469 0.352376952300281371868L,
3470 #elif LOG_POLY_DEGREE == 4
3471 2.88539009343309178325L,
3472 0.961791550404184197881L,
3473 0.577440339438736392009L,
3474 0.403343858251329912514L,
3475 0.406718052498846252698L,
3476 #elif LOG_POLY_DEGREE == 3
3477 2.88538959748872753838L,
3478 0.961932915889597772928L,
3479 0.571118517972136195241L,
3480 0.493997535084709500285L,
3481 #else
3482 #error
3483 #endif
3484 };
3485
3486 /**
3487 * See http://www.devmaster.net/forums/showthread.php?p=43580
3488 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3489 * http://www.nezumi.demon.co.uk/consult/logx.htm
3490 *
3491 * If handle_edge_cases is true the function will perform computations
3492 * to match the required D3D10+ behavior for each of the edge cases.
3493 * That means that if input is:
3494 * - less than zero (to and including -inf) then NaN will be returned
3495 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3496 * - +infinity, then +infinity will be returned
3497 * - NaN, then NaN will be returned
3498 *
3499 * Those checks are fairly expensive so if you don't need them make sure
3500 * handle_edge_cases is false.
3501 */
3502 void
3503 lp_build_log2_approx(struct lp_build_context *bld,
3504 LLVMValueRef x,
3505 LLVMValueRef *p_exp,
3506 LLVMValueRef *p_floor_log2,
3507 LLVMValueRef *p_log2,
3508 boolean handle_edge_cases)
3509 {
3510 LLVMBuilderRef builder = bld->gallivm->builder;
3511 const struct lp_type type = bld->type;
3512 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3513 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3514
3515 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3516 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3517 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3518
3519 LLVMValueRef i = NULL;
3520 LLVMValueRef y = NULL;
3521 LLVMValueRef z = NULL;
3522 LLVMValueRef exp = NULL;
3523 LLVMValueRef mant = NULL;
3524 LLVMValueRef logexp = NULL;
3525 LLVMValueRef p_z = NULL;
3526 LLVMValueRef res = NULL;
3527
3528 assert(lp_check_value(bld->type, x));
3529
3530 if(p_exp || p_floor_log2 || p_log2) {
3531 /* TODO: optimize the constant case */
3532 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3533 LLVMIsConstant(x)) {
3534 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3535 __FUNCTION__);
3536 }
3537
3538 assert(type.floating && type.width == 32);
3539
3540 /*
3541 * We don't explicitly handle denormalized numbers. They will yield a
3542 * result in the neighbourhood of -127, which appears to be
3543 * adequate.
3544 */
3545
3546 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3547
3548 /* exp = (float) exponent(x) */
3549 exp = LLVMBuildAnd(builder, i, expmask, "");
3550 }
3551
3552 if(p_floor_log2 || p_log2) {
3553 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3554 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3555 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3556 }
3557
3558 if (p_log2) {
3559 /* mant = 1 + (float) mantissa(x) */
3560 mant = LLVMBuildAnd(builder, i, mantmask, "");
3561 mant = LLVMBuildOr(builder, mant, one, "");
3562 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3563
3564 /* y = (mant - 1) / (mant + 1) */
3565 y = lp_build_div(bld,
3566 lp_build_sub(bld, mant, bld->one),
3567 lp_build_add(bld, mant, bld->one)
3568 );
3569
3570 /* z = y^2 */
3571 z = lp_build_mul(bld, y, y);
3572
3573 /* compute P(z) */
3574 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3575 ARRAY_SIZE(lp_build_log2_polynomial));
3576
3577 /* y * P(z) + logexp */
3578 res = lp_build_mad(bld, y, p_z, logexp);
3579
3580 if (type.floating && handle_edge_cases) {
3581 LLVMValueRef negmask, infmask, zmask;
3582 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3583 lp_build_const_vec(bld->gallivm, type, 0.0f));
3584 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3585 lp_build_const_vec(bld->gallivm, type, 0.0f));
3586 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3587 lp_build_const_vec(bld->gallivm, type, INFINITY));
3588
3589 /* If x is equal to inf, make sure we return inf */
3590 res = lp_build_select(bld, infmask,
3591 lp_build_const_vec(bld->gallivm, type, INFINITY),
3592 res);
3593 /* If x is equal to 0, return -inf */
3594 res = lp_build_select(bld, zmask,
3595 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3596 res);
3597 /* If x is NaN or less than 0, return NaN */
3598 res = lp_build_select(bld, negmask,
3599 lp_build_const_vec(bld->gallivm, type, NAN),
3600 res);
3601 }
3602 }
3603
3604 if (p_exp) {
3605 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3606 *p_exp = exp;
3607 }
3608
3609 if (p_floor_log2)
3610 *p_floor_log2 = logexp;
3611
3612 if (p_log2)
3613 *p_log2 = res;
3614 }
3615
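/*
 * Illustrative scalar sketch of the decomposition used above (hypothetical
 * helper, not part of the build): split x into exponent and mantissa, map
 * the mantissa m in [1, 2) through y = (m - 1) / (m + 1), and evaluate the
 * polynomial in z = y*y.  Assumes lp_build_log2_polynomial stores the
 * lowest-order coefficient first, as lp_build_polynomial expects, and that
 * x is a positive normal number.
 */
#if 0
static float
ref_log2(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   /* floor(log2(x)) from the exponent field */
   float e = (float)((int)((u.i >> 23) & 0xff) - 127);
   /* force the mantissa back into [1, 2) */
   u.i = (u.i & 0x007fffff) | 0x3f800000;
   float m = u.f;
   float y = (m - 1.0f) / (m + 1.0f);
   float z = y * y;
   /* Horner evaluation of P(z) */
   float p = 0.0f;
   for (int i = ARRAY_SIZE(lp_build_log2_polynomial); i--; )
      p = p * z + (float)lp_build_log2_polynomial[i];
   /* log2(x) = y * P(z) + floor(log2(x)) */
   return y * p + e;
}
#endif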
3616
3617 /*
3618 * log2 implementation which doesn't have special code to
3619 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3620 * the results for those cases are undefined.
3621 */
3622 LLVMValueRef
3623 lp_build_log2(struct lp_build_context *bld,
3624 LLVMValueRef x)
3625 {
3626 LLVMValueRef res;
3627 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3628 return res;
3629 }
3630
3631 /*
3632 * Version of log2 which handles all edge cases.
3633 * See the documentation of lp_build_log2_approx for a
3634 * description of the behavior for each of the edge cases.
3635 */
3636 LLVMValueRef
3637 lp_build_log2_safe(struct lp_build_context *bld,
3638 LLVMValueRef x)
3639 {
3640 LLVMValueRef res;
3641 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3642 return res;
3643 }
3644
3645
3646 /**
3647 * Faster (and less accurate) log2.
3648 *
3649 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3650 *
3651 * Piece-wise linear approximation, with exact results when x is a
3652 * power of two.
3653 *
3654 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3655 */
3656 LLVMValueRef
3657 lp_build_fast_log2(struct lp_build_context *bld,
3658 LLVMValueRef x)
3659 {
3660 LLVMBuilderRef builder = bld->gallivm->builder;
3661 LLVMValueRef ipart;
3662 LLVMValueRef fpart;
3663
3664 assert(lp_check_value(bld->type, x));
3665
3666 assert(bld->type.floating);
3667
3668 /* ipart = floor(log2(x)) - 1 */
3669 ipart = lp_build_extract_exponent(bld, x, -1);
3670 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3671
3672 /* fpart = x / 2**ipart */
3673 fpart = lp_build_extract_mantissa(bld, x);
3674
3675 /* ipart + fpart */
3676 return LLVMBuildFAdd(builder, ipart, fpart, "");
3677 }
3678
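/*
 * Scalar sketch of the piecewise-linear approximation above (illustrative
 * only, not compiled): the exponent field gives floor(log2(x)) - 1 and the
 * mantissa, forced back into [1, 2), is x / 2**floor(log2(x)), so the sum
 * is exact whenever x is a power of two.
 */
#if 0
static float
ref_fast_log2(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   /* ipart = floor(log2(x)) - 1 */
   float ipart = (float)((int)((u.i >> 23) & 0xff) - 127 - 1);
   /* fpart = x / 2**floor(log2(x)), i.e. the mantissa in [1, 2) */
   u.i = (u.i & 0x007fffff) | 0x3f800000;
   return ipart + u.f;
}
#endif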
3679
3680 /**
3681 * Fast implementation of iround(log2(x)).
3682 *
3683 * Not an approximation -- it should give accurate results all the time.
3684 */
3685 LLVMValueRef
3686 lp_build_ilog2(struct lp_build_context *bld,
3687 LLVMValueRef x)
3688 {
3689 LLVMBuilderRef builder = bld->gallivm->builder;
3690 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3691 LLVMValueRef ipart;
3692
3693 assert(bld->type.floating);
3694
3695 assert(lp_check_value(bld->type, x));
3696
3697 /* x * 2^0.5, i.e., add 0.5 to log2(x) */
3698 x = LLVMBuildFMul(builder, x, sqrt2, "");
3699
3700 /* ipart = floor(log2(x) + 0.5) */
3701 ipart = lp_build_extract_exponent(bld, x, 0);
3702
3703 return ipart;
3704 }
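
/*
 * Worked example of the sqrt(2) trick above (hypothetical scalar helper,
 * not compiled): scaling by 2^0.5 before truncating to the exponent turns
 * floor() into round-to-nearest, e.g. x = 5 -> 5*1.414 = 7.07, exponent 2
 * (round(log2(5)) = 2), and x = 6 -> 8.49, exponent 3 (round(log2(6)) = 3).
 */
#if 0
static int
ref_ilog2(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x * (float)M_SQRT2;
   return (int)((u.i >> 23) & 0xff) - 127;
}
#endif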
3705
3706 LLVMValueRef
3707 lp_build_mod(struct lp_build_context *bld,
3708 LLVMValueRef x,
3709 LLVMValueRef y)
3710 {
3711 LLVMBuilderRef builder = bld->gallivm->builder;
3712 LLVMValueRef res;
3713 const struct lp_type type = bld->type;
3714
3715 assert(lp_check_value(type, x));
3716 assert(lp_check_value(type, y));
3717
3718 if (type.floating)
3719 res = LLVMBuildFRem(builder, x, y, "");
3720 else if (type.sign)
3721 res = LLVMBuildSRem(builder, x, y, "");
3722 else
3723 res = LLVMBuildURem(builder, x, y, "");
3724 return res;
3725 }
3726
3727
3728 /*
3729 * For floating-point inputs, creates and returns a mask
3730 * which is all 1's for channels that are NaN.
3731 * Channels of x that are not NaN will be all 0's.
3732 */
3733 LLVMValueRef
3734 lp_build_isnan(struct lp_build_context *bld,
3735 LLVMValueRef x)
3736 {
3737 LLVMValueRef mask;
3738 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3739
3740 assert(bld->type.floating);
3741 assert(lp_check_value(bld->type, x));
3742
3743 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3744 "isnotnan");
3745 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3746 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3747 return mask;
3748 }
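
/*
 * Scalar equivalent of the comparison trick above (illustrative only, not
 * compiled): NaN is the only value that does not compare ordered-equal to
 * itself, which is why the ordered-equal result is negated and then
 * sign-extended into the per-channel mask.
 */
#if 0
static boolean
ref_isnan(float x)
{
   return !(x == x);
}
#endif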
3749
3750 /* Returns all 1's for floating point numbers that are
3751 * finite, and returns all zeros for -inf,
3752 * +inf and NaNs. */
3753 LLVMValueRef
3754 lp_build_isfinite(struct lp_build_context *bld,
3755 LLVMValueRef x)
3756 {
3757 LLVMBuilderRef builder = bld->gallivm->builder;
3758 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3759 struct lp_type int_type = lp_int_type(bld->type);
3760 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3761 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3762 0x7f800000);
3763
3764 if (!bld->type.floating) {
3765 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3766 }
3767 assert(bld->type.floating);
3768 assert(lp_check_value(bld->type, x));
3769 assert(bld->type.width == 32);
3770
3771 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3772 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3773 intx, infornan32);
3774 }
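
/*
 * Bit-level view of the test above (hypothetical scalar helper, not
 * compiled): an IEEE-754 single is finite iff its exponent field is not
 * all ones, which is exactly the 0x7f800000 mask comparison.
 */
#if 0
static boolean
ref_isfinite(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   return (u.i & 0x7f800000) != 0x7f800000;
}
#endif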
3775
3776 /*
3777 * Returns true if the number is NaN or inf, and false otherwise.
3778 * The input has to be a floating point vector.
3779 */
3780 LLVMValueRef
3781 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3782 const struct lp_type type,
3783 LLVMValueRef x)
3784 {
3785 LLVMBuilderRef builder = gallivm->builder;
3786 struct lp_type int_type = lp_int_type(type);
3787 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3788 0x7f800000);
3789 LLVMValueRef ret;
3790
3791 assert(type.floating);
3792
3793 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3794 ret = LLVMBuildAnd(builder, ret, const0, "");
3795 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3796 ret, const0);
3797
3798 return ret;
3799 }
3800
3801
3802 LLVMValueRef
3803 lp_build_fpstate_get(struct gallivm_state *gallivm)
3804 {
3805 if (util_cpu_caps.has_sse) {
3806 LLVMBuilderRef builder = gallivm->builder;
3807 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3808 gallivm,
3809 LLVMInt32TypeInContext(gallivm->context),
3810 "mxcsr_ptr");
3811 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3812 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3813 lp_build_intrinsic(builder,
3814 "llvm.x86.sse.stmxcsr",
3815 LLVMVoidTypeInContext(gallivm->context),
3816 &mxcsr_ptr8, 1, 0);
3817 return mxcsr_ptr;
3818 }
3819 return 0;
3820 }
3821
3822 void
3823 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3824 boolean zero)
3825 {
3826 if (util_cpu_caps.has_sse) {
3827 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3828 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3829
3830 LLVMBuilderRef builder = gallivm->builder;
3831 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3832 LLVMValueRef mxcsr =
3833 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3834
3835 if (util_cpu_caps.has_daz) {
3836 /* Enable denormals-are-zero mode */
3837 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3838 }
3839 if (zero) {
3840 mxcsr = LLVMBuildOr(builder, mxcsr,
3841 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3842 } else {
3843 mxcsr = LLVMBuildAnd(builder, mxcsr,
3844 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3845 }
3846
3847 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3848 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3849 }
3850 }
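
/*
 * Host-side sketch of the same MXCSR manipulation (illustrative only, not
 * compiled; the generated code above additionally gates the DAZ bit on
 * util_cpu_caps.has_daz).
 */
#if 0
static void
ref_set_denorms_zero(boolean zero)
{
#if defined(PIPE_ARCH_SSE)
   unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
   unsigned mxcsr = _mm_getcsr();
   mxcsr = zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz);
   _mm_setcsr(mxcsr);
#endif
}
#endif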
3851
3852 void
3853 lp_build_fpstate_set(struct gallivm_state *gallivm,
3854 LLVMValueRef mxcsr_ptr)
3855 {
3856 if (util_cpu_caps.has_sse) {
3857 LLVMBuilderRef builder = gallivm->builder;
3858 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3859 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3860 lp_build_intrinsic(builder,
3861 "llvm.x86.sse.ldmxcsr",
3862 LLVMVoidTypeInContext(gallivm->context),
3863 &mxcsr_ptr, 1, 0);
3864 }
3865 }