src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
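/*
 * Minimal usage sketch (illustrative only; assumes the caller already has a
 * struct gallivm_state and LLVMValueRefs a, b, c of a matching lp_type):
 *
 *    struct lp_build_context bld;
 *    lp_build_context_init(&bld, gallivm, type);
 *    LLVMValueRef ab  = lp_build_mul(&bld, a, b);
 *    LLVMValueRef sum = lp_build_add(&bld, ab, c);
 *    LLVMValueRef res = lp_build_clamp(&bld, sum, bld.zero, bld.one);
 */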
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for the special-case values of a or b (0 or 1) are done.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle NaNs for floating point numbers. If one of the
213 * inputs is NaN the other should be returned (required by both D3D10+
214 * and OpenCL).
215 * The SSE intrinsics return the second operand in case of a NaN by
216 * default, so we need special code to handle those cases.
217 */
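/*
 * Illustrative example of the SSE behavior handled here:
 * minps(NaN, 1.0) yields 1.0, while minps(1.0, NaN) yields NaN, hence
 * the extra isnan/select below when a specific NaN behavior is requested.
 */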
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303 * No checks for the special-case values of a or b (0 or 1) are done.
304 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
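/*
 * For example, for 8-bit unsigned normalized values 1.0 is represented as
 * 0xff, so 1 - a reduces to a bitwise NOT (the norm path below).
 */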
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if(a == bld->zero)
545 return b;
546 if(b == bld->zero)
547 return a;
548 if(a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if(bld->type.norm) {
552 const char *intrinsic = NULL;
553
554 if(a == bld->one || b == bld->one)
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if(util_cpu_caps.has_sse2) {
560 if(type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if(type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if(type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if(type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if(util_cpu_caps.has_avx2) {
573 if(type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if(type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
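/*
 * E.g. for 8-bit signed values (illustrative): max_val = 127 and
 * min_val = -128; for b > 0, a is clamped to at most 127 - b so that
 * a + b cannot exceed 127, and symmetrically for b < 0.
 */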
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
619
620
621 /** Return the scalar sum of the elements of a.
622 * Callers should avoid this operation whenever possible.
623 */
624 LLVMValueRef
625 lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
644 /*
645 * For byte vectors we could do much better with psadbw.
646 * Using repeated shuffle/adds here. Note that with multiple vectors
647 * this can be done more efficiently as outlined in the Intel
648 * optimization manual.
649 * Note: could cause data rearrangement if used with smaller element
650 * sizes.
651 */
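/*
 * Reduction order for a 4-wide vector <a0 a1 a2 a3> (illustrative):
 *   step 1: <a0 a1> + <a2 a3>  ->  <a0+a2, a1+a3>
 *   step 2: extract both elements and add -> (a0+a2) + (a1+a3)
 */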
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
689
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
692 * This uses the technique outlined in the Intel Optimization Manual.
693 */
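/*
 * Illustrative result layout: out.x = src[0].x + src[0].y + src[0].z + src[0].w,
 * out.y = the sum of src[1]'s elements, and likewise for src[2] and src[3].
 */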
694 static LLVMValueRef
695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
743
744
745 /*
746 * Partially horizontally add 2-4 float vectors with length n x 4,
747 * i.e. only four adjacent values in each vector will be added,
748 * assuming values are really grouped in fours, which also determines
749 * the output order.
750 *
751 * Return a vector of the same length as the initial vectors,
752 * with the excess elements (if any) being undefined.
753 * The element order is independent of number of input vectors.
754 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755 * the output order thus will be
756 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
757 */
758 LLVMValueRef
759 lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* Only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on the CPU) and we always need two horizontal adds anyway,
774 * so for a single vector a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
834 lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if(b == bld->zero)
846 return a;
847 if(a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if(a == b)
850 return bld->zero;
851
852 if(bld->type.norm) {
853 const char *intrinsic = NULL;
854
855 if(b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if(type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if(type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if(type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if(type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if(type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if(type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches for (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
930 * a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria of
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case just the first two terms to fit in 16bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
946 * note that just by itself it doesn't satisfy the OpenGL criteria, as it
947 * gives 255*255 = 254, so the special case b = 255 must be accounted for,
948 * or rounding must be used.
949 *
950 * - geometric series plus rounding
951 *
952 * when using a geometric series division instead of truncating the result
953 * use roundoff in the approximation (Jim Blinn)
954 *
955 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
956 *
957 * achieving exact results.
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
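/*
 * Worked example of the rounding variant for 8-bit values (illustrative):
 *   a = b = 255:    t = 65025, t + (t >> 8) = 65279, + 0x80 = 65407, >> 8 = 255
 *   a = 255, b = 1: t = 255,   t + (t >> 8) = 255,   + 0x80 = 383,   >> 8 = 1
 * so the OpenGL end-point criteria are met exactly.
 */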
966 static LLVMValueRef
967 lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
1017
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
1022 lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094 /*
1095 * Widening mul, valid for 32x32 bit -> 64bit only.
1096 * Result is low 32bits, high bits returned in res_hi.
1097 *
1098 * Emits code that is meant to be compiled for the host CPU.
1099 */
1100 LLVMValueRef
1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1102 LLVMValueRef a,
1103 LLVMValueRef b,
1104 LLVMValueRef *res_hi)
1105 {
1106 struct gallivm_state *gallivm = bld->gallivm;
1107 LLVMBuilderRef builder = gallivm->builder;
1108
1109 assert(bld->type.width == 32);
1110 assert(bld->type.floating == 0);
1111 assert(bld->type.fixed == 0);
1112 assert(bld->type.norm == 0);
1113
1114 /*
1115 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1116 * for x86 simd is atrocious (even if the high bits weren't required),
1117 * trying to handle real 64bit inputs (which of course can't happen due
1118 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1119 * apparently llvm does not recognize this widening mul). This includes 6
1120 * (instead of 2) pmuludq instructions plus extra adds and shifts.
1121 * The same story applies to signed mul, albeit fixing this requires sse41.
1122 * https://llvm.org/bugs/show_bug.cgi?id=30845
1123 * So, whip up our own code, albeit only for length 4 and 8 (which
1124 * should be good enough)...
1125 */
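/*
 * Lane layout sketch for the 4-wide case (illustrative only):
 *   a    = <a0 a1 a2 a3>        b    = <b0 b1 b2 b3>
 *   aodd = <a1 __ a3 __>        bodd = <b1 __ b3 __>      (__ = undef)
 *   pmuludq(a,    b   ) -> 64-bit products a0*b0 and a2*b2
 *   pmuludq(aodd, bodd) -> 64-bit products a1*b1 and a3*b3
 * The final shuffles below gather the low and high 32-bit halves of those
 * products into the low-part result and *res_hi respectively.
 */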
1126 if ((bld->type.length == 4 || bld->type.length == 8) &&
1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1128 util_cpu_caps.has_sse4_1)) {
1129 const char *intrinsic = NULL;
1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1132 struct lp_type type_wide = lp_wider_type(bld->type);
1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1134 unsigned i;
1135 for (i = 0; i < bld->type.length; i += 2) {
1136 shuf[i] = lp_build_const_int32(gallivm, i+1);
1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1138 }
1139 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140 aeven = a;
1141 beven = b;
1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1144
1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1146 if (bld->type.sign) {
1147 intrinsic = "llvm.x86.avx2.pmul.dq";
1148 } else {
1149 intrinsic = "llvm.x86.avx2.pmulu.dq";
1150 }
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 else {
1157 /* for consistent naming look elsewhere... */
1158 if (bld->type.sign) {
1159 intrinsic = "llvm.x86.sse41.pmuldq";
1160 } else {
1161 intrinsic = "llvm.x86.sse2.pmulu.dq";
1162 }
1163 /*
1164 * XXX If we only have AVX but not AVX2 this is a pain.
1165 * lp_build_intrinsic_binary_anylength() can't handle it
1166 * (due to src and dst type not being identical).
1167 */
1168 if (bld->type.length == 8) {
1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1171 LLVMValueRef muleven2[2], mulodd2[2];
1172 struct lp_type type_wide_half = type_wide;
1173 LLVMTypeRef wtype_half;
1174 type_wide_half.length = 2;
1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1185 wtype_half, aevenlo, bevenlo);
1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1187 wtype_half, aoddlo, boddlo);
1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1189 wtype_half, aevenhi, bevenhi);
1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1191 wtype_half, aoddhi, boddhi);
1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1194
1195 }
1196 else {
1197 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1198 wider_type, aeven, beven);
1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1200 wider_type, aodd, bodd);
1201 }
1202 }
1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1205
1206 for (i = 0; i < bld->type.length; i += 2) {
1207 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1209 }
1210 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1211 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1212
1213 for (i = 0; i < bld->type.length; i += 2) {
1214 shuf[i] = lp_build_const_int32(gallivm, i);
1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1216 }
1217 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1219 }
1220 else {
1221 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1222 }
1223 }
1224
1225
1226 /*
1227 * Widening mul, valid for 32x32 bit -> 64bit only.
1228 * Result is low 32bits, high bits returned in res_hi.
1229 *
1230 * Emits generic code.
1231 */
1232 LLVMValueRef
1233 lp_build_mul_32_lohi(struct lp_build_context *bld,
1234 LLVMValueRef a,
1235 LLVMValueRef b,
1236 LLVMValueRef *res_hi)
1237 {
1238 struct gallivm_state *gallivm = bld->gallivm;
1239 LLVMBuilderRef builder = gallivm->builder;
1240 LLVMValueRef tmp, shift, res_lo;
1241 struct lp_type type_tmp;
1242 LLVMTypeRef wide_type, narrow_type;
1243
1244 type_tmp = bld->type;
1245 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1246 type_tmp.width *= 2;
1247 wide_type = lp_build_vec_type(gallivm, type_tmp);
1248 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1249
1250 if (bld->type.sign) {
1251 a = LLVMBuildSExt(builder, a, wide_type, "");
1252 b = LLVMBuildSExt(builder, b, wide_type, "");
1253 } else {
1254 a = LLVMBuildZExt(builder, a, wide_type, "");
1255 b = LLVMBuildZExt(builder, b, wide_type, "");
1256 }
1257 tmp = LLVMBuildMul(builder, a, b, "");
1258
1259 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1260
1261 /* Since we truncate anyway, LShr and AShr are equivalent. */
1262 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1263 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1264
1265 return res_lo;
1266 }
1267
1268
1269 /* a * b + c */
1270 LLVMValueRef
1271 lp_build_mad(struct lp_build_context *bld,
1272 LLVMValueRef a,
1273 LLVMValueRef b,
1274 LLVMValueRef c)
1275 {
1276 const struct lp_type type = bld->type;
1277 if (type.floating) {
1278 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1279 } else {
1280 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1281 }
1282 }
1283
1284
1285 /**
1286 * Small vector x scale multiplication optimization.
1287 */
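/*
 * For example, for non-floating types lp_build_mul_imm(bld, a, 8) becomes a
 * left shift by 3, and b == -1 becomes a negate (illustrative of the special
 * cases handled below).
 */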
1288 LLVMValueRef
1289 lp_build_mul_imm(struct lp_build_context *bld,
1290 LLVMValueRef a,
1291 int b)
1292 {
1293 LLVMBuilderRef builder = bld->gallivm->builder;
1294 LLVMValueRef factor;
1295
1296 assert(lp_check_value(bld->type, a));
1297
1298 if(b == 0)
1299 return bld->zero;
1300
1301 if(b == 1)
1302 return a;
1303
1304 if(b == -1)
1305 return lp_build_negate(bld, a);
1306
1307 if(b == 2 && bld->type.floating)
1308 return lp_build_add(bld, a, a);
1309
1310 if(util_is_power_of_two(b)) {
1311 unsigned shift = ffs(b) - 1;
1312
1313 if(bld->type.floating) {
1314 #if 0
1315 /*
1316 * Power of two multiplication by directly manipulating the exponent.
1317 *
1318 * XXX: This might not be always faster, it will introduce a small error
1319 * for multiplication by zero, and it will produce wrong results
1320 * for Inf and NaN.
1321 */
1322 unsigned mantissa = lp_mantissa(bld->type);
1323 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1324 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1325 a = LLVMBuildAdd(builder, a, factor, "");
1326 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1327 return a;
1328 #endif
1329 }
1330 else {
1331 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1332 return LLVMBuildShl(builder, a, factor, "");
1333 }
1334 }
1335
1336 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1337 return lp_build_mul(bld, a, factor);
1338 }
1339
1340
1341 /**
1342 * Generate a / b
1343 */
1344 LLVMValueRef
1345 lp_build_div(struct lp_build_context *bld,
1346 LLVMValueRef a,
1347 LLVMValueRef b)
1348 {
1349 LLVMBuilderRef builder = bld->gallivm->builder;
1350 const struct lp_type type = bld->type;
1351
1352 assert(lp_check_value(type, a));
1353 assert(lp_check_value(type, b));
1354
1355 if(a == bld->zero)
1356 return bld->zero;
1357 if(a == bld->one && type.floating)
1358 return lp_build_rcp(bld, b);
1359 if(b == bld->zero)
1360 return bld->undef;
1361 if(b == bld->one)
1362 return a;
1363 if(a == bld->undef || b == bld->undef)
1364 return bld->undef;
1365
1366 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1367 if (type.floating)
1368 return LLVMConstFDiv(a, b);
1369 else if (type.sign)
1370 return LLVMConstSDiv(a, b);
1371 else
1372 return LLVMConstUDiv(a, b);
1373 }
1374
1375 /* fast rcp is disabled (it just uses div), so it makes no sense to try that */
1376 if(FALSE &&
1377 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1378 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1379 type.floating)
1380 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1381
1382 if (type.floating)
1383 return LLVMBuildFDiv(builder, a, b, "");
1384 else if (type.sign)
1385 return LLVMBuildSDiv(builder, a, b, "");
1386 else
1387 return LLVMBuildUDiv(builder, a, b, "");
1388 }
1389
1390
1391 /**
1392 * Linear interpolation helper.
1393 *
1394 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED indicates we
1395 * are interpolating normalized values encoded in integers twice as wide.
1396 *
1397 * @sa http://www.stereopsis.com/doubleblend.html
1398 */
1399 static inline LLVMValueRef
1400 lp_build_lerp_simple(struct lp_build_context *bld,
1401 LLVMValueRef x,
1402 LLVMValueRef v0,
1403 LLVMValueRef v1,
1404 unsigned flags)
1405 {
1406 unsigned half_width = bld->type.width/2;
1407 LLVMBuilderRef builder = bld->gallivm->builder;
1408 LLVMValueRef delta;
1409 LLVMValueRef res;
1410
1411 assert(lp_check_value(bld->type, x));
1412 assert(lp_check_value(bld->type, v0));
1413 assert(lp_check_value(bld->type, v1));
1414
1415 delta = lp_build_sub(bld, v1, v0);
1416
1417 if (bld->type.floating) {
1418 assert(flags == 0);
1419 return lp_build_mad(bld, x, delta, v0);
1420 }
1421
1422 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1423 if (!bld->type.sign) {
1424 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1425 /*
1426 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1427 * most significant bit to the least significant bit, so that
1428 * later we can just divide by 2**n instead of 2**n - 1.
1429 */
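/*
 * Worked example with half_width = 8 (illustrative): x = 255 becomes
 * 255 + (255 >> 7) = 256, so (x * delta) >> 8 yields exactly delta and
 * the lerp returns v1; x = 0 stays 0 and returns v0.
 */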
1430
1431 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1432 }
1433
1434 /* (x * delta) >> n */
1435 res = lp_build_mul(bld, x, delta);
1436 res = lp_build_shr_imm(bld, res, half_width);
1437 } else {
1438 /*
1439 * The rescaling trick above doesn't work for signed numbers, so
1440 * use the 2**n - 1 division approximation in lp_build_mul_norm
1441 * instead.
1442 */
1443 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1444 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1445 }
1446 } else {
1447 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1448 res = lp_build_mul(bld, x, delta);
1449 }
1450
1451 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1452 /*
1453 * At this point both res and v0 only use the lower half of the bits,
1454 * the rest is zero. Instead of add / mask, do add with half wide type.
1455 */
1456 struct lp_type narrow_type;
1457 struct lp_build_context narrow_bld;
1458
1459 memset(&narrow_type, 0, sizeof narrow_type);
1460 narrow_type.sign = bld->type.sign;
1461 narrow_type.width = bld->type.width/2;
1462 narrow_type.length = bld->type.length*2;
1463
1464 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1465 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1466 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1467 res = lp_build_add(&narrow_bld, v0, res);
1468 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1469 } else {
1470 res = lp_build_add(bld, v0, res);
1471
1472 if (bld->type.fixed) {
1473 /*
1474 * We need to mask out the high order bits when lerping 8-bit
1475 * normalized colors stored in 16 bits.
1476 */
1477 /* XXX: This step is necessary for lerping 8-bit colors stored in
1478 * 16 bits, but it will be wrong for true fixed point use cases.
1479 * Basically we need a more powerful lp_type, capable of further
1480 * distinguishing the values interpretation from the value storage.
1481 */
1482 LLVMValueRef low_bits;
1483 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1484 res = LLVMBuildAnd(builder, res, low_bits, "");
1485 }
1486 }
1487
1488 return res;
1489 }
1490
1491
1492 /**
1493 * Linear interpolation.
1494 */
1495 LLVMValueRef
1496 lp_build_lerp(struct lp_build_context *bld,
1497 LLVMValueRef x,
1498 LLVMValueRef v0,
1499 LLVMValueRef v1,
1500 unsigned flags)
1501 {
1502 const struct lp_type type = bld->type;
1503 LLVMValueRef res;
1504
1505 assert(lp_check_value(type, x));
1506 assert(lp_check_value(type, v0));
1507 assert(lp_check_value(type, v1));
1508
1509 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1510
1511 if (type.norm) {
1512 struct lp_type wide_type;
1513 struct lp_build_context wide_bld;
1514 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1515
1516 assert(type.length >= 2);
1517
1518 /*
1519 * Create a wider integer type, enough to hold the
1520 * intermediate result of the multiplication.
1521 */
1522 memset(&wide_type, 0, sizeof wide_type);
1523 wide_type.sign = type.sign;
1524 wide_type.width = type.width*2;
1525 wide_type.length = type.length/2;
1526
1527 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1528
1529 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1530 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1531 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1532
1533 /*
1534 * Lerp both halves.
1535 */
1536
1537 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1538
1539 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1540 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1541
1542 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1543 } else {
1544 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1545 }
1546
1547 return res;
1548 }
1549
1550
1551 /**
1552 * Bilinear interpolation.
1553 *
1554 * Value indices are in v_{yx}, i.e. v01 is the value at x = 1, y = 0.
1555 */
1556 LLVMValueRef
1557 lp_build_lerp_2d(struct lp_build_context *bld,
1558 LLVMValueRef x,
1559 LLVMValueRef y,
1560 LLVMValueRef v00,
1561 LLVMValueRef v01,
1562 LLVMValueRef v10,
1563 LLVMValueRef v11,
1564 unsigned flags)
1565 {
1566 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1567 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1568 return lp_build_lerp(bld, y, v0, v1, flags);
1569 }
1570
1571
1572 LLVMValueRef
1573 lp_build_lerp_3d(struct lp_build_context *bld,
1574 LLVMValueRef x,
1575 LLVMValueRef y,
1576 LLVMValueRef z,
1577 LLVMValueRef v000,
1578 LLVMValueRef v001,
1579 LLVMValueRef v010,
1580 LLVMValueRef v011,
1581 LLVMValueRef v100,
1582 LLVMValueRef v101,
1583 LLVMValueRef v110,
1584 LLVMValueRef v111,
1585 unsigned flags)
1586 {
1587 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1588 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1589 return lp_build_lerp(bld, z, v0, v1, flags);
1590 }
1591
1592
1593 /**
1594 * Generate min(a, b)
1595 * Do checks for special cases, but not for NaNs.
1596 */
1597 LLVMValueRef
1598 lp_build_min(struct lp_build_context *bld,
1599 LLVMValueRef a,
1600 LLVMValueRef b)
1601 {
1602 assert(lp_check_value(bld->type, a));
1603 assert(lp_check_value(bld->type, b));
1604
1605 if(a == bld->undef || b == bld->undef)
1606 return bld->undef;
1607
1608 if(a == b)
1609 return a;
1610
1611 if (bld->type.norm) {
1612 if (!bld->type.sign) {
1613 if (a == bld->zero || b == bld->zero) {
1614 return bld->zero;
1615 }
1616 }
1617 if(a == bld->one)
1618 return b;
1619 if(b == bld->one)
1620 return a;
1621 }
1622
1623 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1624 }
1625
1626
1627 /**
1628 * Generate min(a, b)
1629 * NaNs are handled according to the behavior specified by the
1630 * nan_behavior argument.
1631 */
1632 LLVMValueRef
1633 lp_build_min_ext(struct lp_build_context *bld,
1634 LLVMValueRef a,
1635 LLVMValueRef b,
1636 enum gallivm_nan_behavior nan_behavior)
1637 {
1638 assert(lp_check_value(bld->type, a));
1639 assert(lp_check_value(bld->type, b));
1640
1641 if(a == bld->undef || b == bld->undef)
1642 return bld->undef;
1643
1644 if(a == b)
1645 return a;
1646
1647 if (bld->type.norm) {
1648 if (!bld->type.sign) {
1649 if (a == bld->zero || b == bld->zero) {
1650 return bld->zero;
1651 }
1652 }
1653 if(a == bld->one)
1654 return b;
1655 if(b == bld->one)
1656 return a;
1657 }
1658
1659 return lp_build_min_simple(bld, a, b, nan_behavior);
1660 }
1661
1662 /**
1663 * Generate max(a, b)
1664 * Do checks for special cases, but NaN behavior is undefined.
1665 */
1666 LLVMValueRef
1667 lp_build_max(struct lp_build_context *bld,
1668 LLVMValueRef a,
1669 LLVMValueRef b)
1670 {
1671 assert(lp_check_value(bld->type, a));
1672 assert(lp_check_value(bld->type, b));
1673
1674 if(a == bld->undef || b == bld->undef)
1675 return bld->undef;
1676
1677 if(a == b)
1678 return a;
1679
1680 if(bld->type.norm) {
1681 if(a == bld->one || b == bld->one)
1682 return bld->one;
1683 if (!bld->type.sign) {
1684 if (a == bld->zero) {
1685 return b;
1686 }
1687 if (b == bld->zero) {
1688 return a;
1689 }
1690 }
1691 }
1692
1693 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1694 }
1695
1696
1697 /**
1698 * Generate max(a, b)
1699 * Checks for special cases.
1700 * NaNs are handled according to the behavior specified by the
1701 * nan_behavior argument.
1702 */
1703 LLVMValueRef
1704 lp_build_max_ext(struct lp_build_context *bld,
1705 LLVMValueRef a,
1706 LLVMValueRef b,
1707 enum gallivm_nan_behavior nan_behavior)
1708 {
1709 assert(lp_check_value(bld->type, a));
1710 assert(lp_check_value(bld->type, b));
1711
1712 if(a == bld->undef || b == bld->undef)
1713 return bld->undef;
1714
1715 if(a == b)
1716 return a;
1717
1718 if(bld->type.norm) {
1719 if(a == bld->one || b == bld->one)
1720 return bld->one;
1721 if (!bld->type.sign) {
1722 if (a == bld->zero) {
1723 return b;
1724 }
1725 if (b == bld->zero) {
1726 return a;
1727 }
1728 }
1729 }
1730
1731 return lp_build_max_simple(bld, a, b, nan_behavior);
1732 }
1733
1734 /**
1735 * Generate clamp(a, min, max)
1736 * NaN behavior (for any of a, min, max) is undefined.
1737 * Do checks for special cases.
1738 */
1739 LLVMValueRef
1740 lp_build_clamp(struct lp_build_context *bld,
1741 LLVMValueRef a,
1742 LLVMValueRef min,
1743 LLVMValueRef max)
1744 {
1745 assert(lp_check_value(bld->type, a));
1746 assert(lp_check_value(bld->type, min));
1747 assert(lp_check_value(bld->type, max));
1748
1749 a = lp_build_min(bld, a, max);
1750 a = lp_build_max(bld, a, min);
1751 return a;
1752 }
1753
1754
1755 /**
1756 * Generate clamp(a, 0, 1)
1757 * A NaN will get converted to zero.
1758 */
1759 LLVMValueRef
1760 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1761 LLVMValueRef a)
1762 {
1763 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1764 a = lp_build_min(bld, a, bld->one);
1765 return a;
1766 }
1767
1768
1769 /**
1770 * Generate abs(a)
1771 */
1772 LLVMValueRef
1773 lp_build_abs(struct lp_build_context *bld,
1774 LLVMValueRef a)
1775 {
1776 LLVMBuilderRef builder = bld->gallivm->builder;
1777 const struct lp_type type = bld->type;
1778 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1779
1780 assert(lp_check_value(type, a));
1781
1782 if(!type.sign)
1783 return a;
1784
1785 if(type.floating) {
1786 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1787 /* Workaround llvm.org/PR27332 */
1788 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1789 unsigned long long absMask = ~(1ULL << (type.width - 1));
1790 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1791 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1792 a = LLVMBuildAnd(builder, a, mask, "");
1793 a = LLVMBuildBitCast(builder, a, vec_type, "");
1794 return a;
1795 } else {
1796 char intrinsic[32];
1797 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1798 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1799 }
1800 }
1801
1802 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1803 switch(type.width) {
1804 case 8:
1805 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1806 case 16:
1807 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1808 case 32:
1809 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1810 }
1811 }
1812 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1813 switch(type.width) {
1814 case 8:
1815 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1816 case 16:
1817 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1818 case 32:
1819 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1820 }
1821 }
1822
1823 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1824 a, LLVMBuildNeg(builder, a, ""));
1825 }
1826
1827
1828 LLVMValueRef
1829 lp_build_negate(struct lp_build_context *bld,
1830 LLVMValueRef a)
1831 {
1832 LLVMBuilderRef builder = bld->gallivm->builder;
1833
1834 assert(lp_check_value(bld->type, a));
1835
1836 if (bld->type.floating)
1837 a = LLVMBuildFNeg(builder, a, "");
1838 else
1839 a = LLVMBuildNeg(builder, a, "");
1840
1841 return a;
1842 }
1843
1844
1845 /** Return -1, 0 or +1 depending on the sign of a */
1846 LLVMValueRef
1847 lp_build_sgn(struct lp_build_context *bld,
1848 LLVMValueRef a)
1849 {
1850 LLVMBuilderRef builder = bld->gallivm->builder;
1851 const struct lp_type type = bld->type;
1852 LLVMValueRef cond;
1853 LLVMValueRef res;
1854
1855 assert(lp_check_value(type, a));
1856
1857 /* Handle non-zero case */
1858 if(!type.sign) {
1859 /* if not zero then sign must be positive */
1860 res = bld->one;
1861 }
1862 else if(type.floating) {
1863 LLVMTypeRef vec_type;
1864 LLVMTypeRef int_type;
1865 LLVMValueRef mask;
1866 LLVMValueRef sign;
1867 LLVMValueRef one;
1868 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1869
1870 int_type = lp_build_int_vec_type(bld->gallivm, type);
1871 vec_type = lp_build_vec_type(bld->gallivm, type);
1872 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1873
1874 /* Take the sign bit and or it onto the 1.0 constant */
1875 sign = LLVMBuildBitCast(builder, a, int_type, "");
1876 sign = LLVMBuildAnd(builder, sign, mask, "");
1877 one = LLVMConstBitCast(bld->one, int_type);
1878 res = LLVMBuildOr(builder, sign, one, "");
1879 res = LLVMBuildBitCast(builder, res, vec_type, "");
1880 }
1881 else
1882 {
1883 /* signed int/norm/fixed point */
1884 /* could use psign with sse3 and appropriate vectors here */
1885 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1886 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1887 res = lp_build_select(bld, cond, bld->one, minus_one);
1888 }
1889
1890 /* Handle zero */
1891 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1892 res = lp_build_select(bld, cond, bld->zero, res);
1893
1894 return res;
1895 }
1896
1897
1898 /**
1899 * Set the sign of float vector 'a' according to 'sign'.
1900 * If sign==0, return abs(a).
1901 * If sign==1, return -abs(a).
1902 * Other values for sign produce undefined results.
1903 */
1904 LLVMValueRef
1905 lp_build_set_sign(struct lp_build_context *bld,
1906 LLVMValueRef a, LLVMValueRef sign)
1907 {
1908 LLVMBuilderRef builder = bld->gallivm->builder;
1909 const struct lp_type type = bld->type;
1910 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1911 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1912 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1913 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1914 ~((unsigned long long) 1 << (type.width - 1)));
1915 LLVMValueRef val, res;
1916
1917 assert(type.floating);
1918 assert(lp_check_value(type, a));
1919
1920 /* val = reinterpret_cast<int>(a) */
1921 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1922 /* val = val & mask */
1923 val = LLVMBuildAnd(builder, val, mask, "");
1924 /* sign = sign << shift */
1925 sign = LLVMBuildShl(builder, sign, shift, "");
1926 /* res = val | sign */
1927 res = LLVMBuildOr(builder, val, sign, "");
1928 /* res = reinterpret_cast<float>(res) */
1929 res = LLVMBuildBitCast(builder, res, vec_type, "");
1930
1931 return res;
1932 }
1933
1934
1935 /**
1936 * Convert vector of (or scalar) int to vector of (or scalar) float.
1937 */
1938 LLVMValueRef
1939 lp_build_int_to_float(struct lp_build_context *bld,
1940 LLVMValueRef a)
1941 {
1942 LLVMBuilderRef builder = bld->gallivm->builder;
1943 const struct lp_type type = bld->type;
1944 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1945
1946 assert(type.floating);
1947
1948 return LLVMBuildSIToFP(builder, a, vec_type, "");
1949 }
1950
1951 static boolean
1952 arch_rounding_available(const struct lp_type type)
1953 {
1954 if ((util_cpu_caps.has_sse4_1 &&
1955 (type.length == 1 || type.width*type.length == 128)) ||
1956 (util_cpu_caps.has_avx && type.width*type.length == 256))
1957 return TRUE;
1958 else if ((util_cpu_caps.has_altivec &&
1959 (type.width == 32 && type.length == 4)))
1960 return TRUE;
1961
1962 return FALSE;
1963 }
1964
1965 enum lp_build_round_mode
1966 {
1967 LP_BUILD_ROUND_NEAREST = 0,
1968 LP_BUILD_ROUND_FLOOR = 1,
1969 LP_BUILD_ROUND_CEIL = 2,
1970 LP_BUILD_ROUND_TRUNCATE = 3
1971 };
1972
1973 static inline LLVMValueRef
1974 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1975 LLVMValueRef a)
1976 {
1977 LLVMBuilderRef builder = bld->gallivm->builder;
1978 const struct lp_type type = bld->type;
1979 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1980 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1981 const char *intrinsic;
1982 LLVMValueRef res;
1983
1984 assert(type.floating);
1985 /* using the double precision conversions is a bit more complicated */
1986 assert(type.width == 32);
1987
1988 assert(lp_check_value(type, a));
1989 assert(util_cpu_caps.has_sse2);
1990
1991 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1992 if (type.length == 1) {
1993 LLVMTypeRef vec_type;
1994 LLVMValueRef undef;
1995 LLVMValueRef arg;
1996 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1997
1998 vec_type = LLVMVectorType(bld->elem_type, 4);
1999
2000 intrinsic = "llvm.x86.sse.cvtss2si";
2001
2002 undef = LLVMGetUndef(vec_type);
2003
2004 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2005
2006 res = lp_build_intrinsic_unary(builder, intrinsic,
2007 ret_type, arg);
2008 }
2009 else {
2010 if (type.width* type.length == 128) {
2011 intrinsic = "llvm.x86.sse2.cvtps2dq";
2012 }
2013 else {
2014 assert(type.width*type.length == 256);
2015 assert(util_cpu_caps.has_avx);
2016
2017 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2018 }
2019 res = lp_build_intrinsic_unary(builder, intrinsic,
2020 ret_type, a);
2021 }
2022
2023 return res;
2024 }
2025
2026
2027 /* Round a float vector to an integral value using the AltiVec
2028 * vrfi{n,m,p,z} intrinsics. */
2029 static inline LLVMValueRef
2030 lp_build_round_altivec(struct lp_build_context *bld,
2031 LLVMValueRef a,
2032 enum lp_build_round_mode mode)
2033 {
2034 LLVMBuilderRef builder = bld->gallivm->builder;
2035 const struct lp_type type = bld->type;
2036 const char *intrinsic = NULL;
2037
2038 assert(type.floating);
2039
2040 assert(lp_check_value(type, a));
2041 assert(util_cpu_caps.has_altivec);
2042
2043 (void)type;
2044
2045 switch (mode) {
2046 case LP_BUILD_ROUND_NEAREST:
2047 intrinsic = "llvm.ppc.altivec.vrfin";
2048 break;
2049 case LP_BUILD_ROUND_FLOOR:
2050 intrinsic = "llvm.ppc.altivec.vrfim";
2051 break;
2052 case LP_BUILD_ROUND_CEIL:
2053 intrinsic = "llvm.ppc.altivec.vrfip";
2054 break;
2055 case LP_BUILD_ROUND_TRUNCATE:
2056 intrinsic = "llvm.ppc.altivec.vrfiz";
2057 break;
2058 }
2059
2060 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2061 }
2062
2063 static inline LLVMValueRef
2064 lp_build_round_arch(struct lp_build_context *bld,
2065 LLVMValueRef a,
2066 enum lp_build_round_mode mode)
2067 {
2068 if (util_cpu_caps.has_sse4_1) {
2069 LLVMBuilderRef builder = bld->gallivm->builder;
2070 const struct lp_type type = bld->type;
2071 const char *intrinsic_root;
2072 char intrinsic[32];
2073
2074 assert(type.floating);
2075 assert(lp_check_value(type, a));
2076 (void)type;
2077
2078 switch (mode) {
2079 case LP_BUILD_ROUND_NEAREST:
2080 intrinsic_root = "llvm.nearbyint";
2081 break;
2082 case LP_BUILD_ROUND_FLOOR:
2083 intrinsic_root = "llvm.floor";
2084 break;
2085 case LP_BUILD_ROUND_CEIL:
2086 intrinsic_root = "llvm.ceil";
2087 break;
2088 case LP_BUILD_ROUND_TRUNCATE:
2089 intrinsic_root = "llvm.trunc";
2090 break;
2091 }
2092
2093 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2094 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2095 }
2096 else /* (util_cpu_caps.has_altivec) */
2097 return lp_build_round_altivec(bld, a, mode);
2098 }
2099
2100 /**
2101 * Return the integer part of a float (vector) value (== round toward zero).
2102 * The returned value is a float (vector).
2103 * Ex: trunc(-1.5) = -1.0
2104 */
2105 LLVMValueRef
2106 lp_build_trunc(struct lp_build_context *bld,
2107 LLVMValueRef a)
2108 {
2109 LLVMBuilderRef builder = bld->gallivm->builder;
2110 const struct lp_type type = bld->type;
2111
2112 assert(type.floating);
2113 assert(lp_check_value(type, a));
2114
2115 if (arch_rounding_available(type)) {
2116 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2117 }
2118 else {
2119 const struct lp_type type = bld->type;
2120 struct lp_type inttype;
2121 struct lp_build_context intbld;
2122 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2123 LLVMValueRef trunc, res, anosign, mask;
2124 LLVMTypeRef int_vec_type = bld->int_vec_type;
2125 LLVMTypeRef vec_type = bld->vec_type;
2126
2127 assert(type.width == 32); /* might want to handle doubles at some point */
2128
2129 inttype = type;
2130 inttype.floating = 0;
2131 lp_build_context_init(&intbld, bld->gallivm, inttype);
2132
2133 /* round by truncation */
2134 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2135 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2136
2137 /* mask out sign bit */
2138 anosign = lp_build_abs(bld, a);
2139 /*
2140 * mask out all values if anosign > 2^24
2141 * This should work both for large ints (all rounding is no-op for them
2142 * because such floats are always exact) as well as special cases like
2143 * NaNs, Infs (taking advantage of the fact they use max exponent).
2144 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2145 */
2146 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2147 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2148 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2149 return lp_build_select(bld, mask, a, res);
2150 }
2151 }
2152
2153
2154 /**
2155 * Return float (vector) rounded to nearest integer (vector). The returned
2156 * value is a float (vector).
2157 * Ex: round(0.9) = 1.0
2158 * Ex: round(-1.5) = -2.0
2159 */
2160 LLVMValueRef
2161 lp_build_round(struct lp_build_context *bld,
2162 LLVMValueRef a)
2163 {
2164 LLVMBuilderRef builder = bld->gallivm->builder;
2165 const struct lp_type type = bld->type;
2166
2167 assert(type.floating);
2168 assert(lp_check_value(type, a));
2169
2170 if (arch_rounding_available(type)) {
2171 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2172 }
2173 else {
2174 const struct lp_type type = bld->type;
2175 struct lp_type inttype;
2176 struct lp_build_context intbld;
2177 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2178 LLVMValueRef res, anosign, mask;
2179 LLVMTypeRef int_vec_type = bld->int_vec_type;
2180 LLVMTypeRef vec_type = bld->vec_type;
2181
2182 assert(type.width == 32); /* might want to handle doubles at some point */
2183
2184 inttype = type;
2185 inttype.floating = 0;
2186 lp_build_context_init(&intbld, bld->gallivm, inttype);
2187
2188 res = lp_build_iround(bld, a);
2189 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2190
2191 /* mask out sign bit */
2192 anosign = lp_build_abs(bld, a);
2193 /*
2194 * mask out all values if anosign > 2^24
2195 * This should work both for large ints (all rounding is no-op for them
2196 * because such floats are always exact) as well as special cases like
2197 * NaNs, Infs (taking advantage of the fact they use max exponent).
2198 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2199 */
2200 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2201 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2202 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2203 return lp_build_select(bld, mask, a, res);
2204 }
2205 }
2206
2207
2208 /**
2209 * Return floor of float (vector), result is a float (vector)
2210 * Ex: floor(1.1) = 1.0
2211 * Ex: floor(-1.1) = -2.0
2212 */
2213 LLVMValueRef
2214 lp_build_floor(struct lp_build_context *bld,
2215 LLVMValueRef a)
2216 {
2217 LLVMBuilderRef builder = bld->gallivm->builder;
2218 const struct lp_type type = bld->type;
2219
2220 assert(type.floating);
2221 assert(lp_check_value(type, a));
2222
2223 if (arch_rounding_available(type)) {
2224 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2225 }
2226 else {
2227 const struct lp_type type = bld->type;
2228 struct lp_type inttype;
2229 struct lp_build_context intbld;
2230 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2231 LLVMValueRef trunc, res, anosign, mask;
2232 LLVMTypeRef int_vec_type = bld->int_vec_type;
2233 LLVMTypeRef vec_type = bld->vec_type;
2234
2235 if (type.width != 32) {
2236 char intrinsic[32];
2237 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2238 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2239 }
2240
2241 assert(type.width == 32); /* might want to handle doubles at some point */
2242
2243 inttype = type;
2244 inttype.floating = 0;
2245 lp_build_context_init(&intbld, bld->gallivm, inttype);
2246
2247 /* round by truncation */
2248 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2249 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2250
2251 if (type.sign) {
2252 LLVMValueRef tmp;
2253
2254 /*
2255 * fix values if rounding is wrong (for non-special cases)
2256 * - this is the case if trunc > a
2257 */
2258 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2259 /* tmp = trunc > a ? 1.0 : 0.0 */
2260 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2261 tmp = lp_build_and(&intbld, mask, tmp);
2262 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2263 res = lp_build_sub(bld, res, tmp);
2264 }
2265
2266 /* mask out sign bit */
2267 anosign = lp_build_abs(bld, a);
2268 /*
2269 * mask out all values if anosign > 2^24
2270 * This should work both for large ints (all rounding is no-op for them
2271 * because such floats are always exact) as well as special cases like
2272 * NaNs, Infs (taking advantage of the fact they use max exponent).
2273 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2274 */
2275 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2276 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2277 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2278 return lp_build_select(bld, mask, a, res);
2279 }
2280 }
2281
2282
2283 /**
2284 * Return ceiling of float (vector), returning float (vector).
2285 * Ex: ceil( 1.1) = 2.0
2286 * Ex: ceil(-1.1) = -1.0
2287 */
2288 LLVMValueRef
2289 lp_build_ceil(struct lp_build_context *bld,
2290 LLVMValueRef a)
2291 {
2292 LLVMBuilderRef builder = bld->gallivm->builder;
2293 const struct lp_type type = bld->type;
2294
2295 assert(type.floating);
2296 assert(lp_check_value(type, a));
2297
2298 if (arch_rounding_available(type)) {
2299 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2300 }
2301 else {
2302 const struct lp_type type = bld->type;
2303 struct lp_type inttype;
2304 struct lp_build_context intbld;
2305 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2306 LLVMValueRef trunc, res, anosign, mask, tmp;
2307 LLVMTypeRef int_vec_type = bld->int_vec_type;
2308 LLVMTypeRef vec_type = bld->vec_type;
2309
2310 if (type.width != 32) {
2311 char intrinsic[32];
2312 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2313 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2314 }
2315
2316 assert(type.width == 32); /* might want to handle doubles at some point */
2317
2318 inttype = type;
2319 inttype.floating = 0;
2320 lp_build_context_init(&intbld, bld->gallivm, inttype);
2321
2322 /* round by truncation */
2323 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2324 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2325
2326 /*
2327 * fix values if rounding is wrong (for non-special cases)
2328 * - this is the case if trunc < a
2329 */
2330 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2331 /* tmp = trunc < a ? 1.0 : 0.0 */
2332 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2333 tmp = lp_build_and(&intbld, mask, tmp);
2334 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2335 res = lp_build_add(bld, trunc, tmp);
2336
2337 /* mask out sign bit */
2338 anosign = lp_build_abs(bld, a);
2339 /*
2340 * mask out all values if anosign > 2^24
2341 * This should work both for large ints (all rounding is no-op for them
2342 * because such floats are always exact) as well as special cases like
2343 * NaNs, Infs (taking advantage of the fact they use max exponent).
2344 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2345 */
2346 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2347 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2348 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2349 return lp_build_select(bld, mask, a, res);
2350 }
2351 }
2352
2353
2354 /**
2355 * Return fractional part of 'a' computed as a - floor(a)
2356 * Typically used in texture coord arithmetic.
2357 */
2358 LLVMValueRef
2359 lp_build_fract(struct lp_build_context *bld,
2360 LLVMValueRef a)
2361 {
2362 assert(bld->type.floating);
2363 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2364 }
2365
2366
2367 /**
2368 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2369 * against 0.99999(9). (Will also return that value for NaNs.)
2370 */
2371 static inline LLVMValueRef
2372 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2373 {
2374 LLVMValueRef max;
2375
2376 /* this is the largest number smaller than 1.0 representable as float */
2377 max = lp_build_const_vec(bld->gallivm, bld->type,
2378 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2379 return lp_build_min_ext(bld, fract, max,
2380 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2381 }
2382
2383
2384 /**
2385 * Same as lp_build_fract, but guarantees that the result is always smaller
2386 * than one. Will also return the smaller-than-one value for infs, NaNs.
2387 */
2388 LLVMValueRef
2389 lp_build_fract_safe(struct lp_build_context *bld,
2390 LLVMValueRef a)
2391 {
2392 return clamp_fract(bld, lp_build_fract(bld, a));
2393 }
2394
2395
2396 /**
2397 * Return the integer part of a float (vector) value (== round toward zero).
2398 * The returned value is an integer (vector).
2399 * Ex: itrunc(-1.5) = -1
2400 */
2401 LLVMValueRef
2402 lp_build_itrunc(struct lp_build_context *bld,
2403 LLVMValueRef a)
2404 {
2405 LLVMBuilderRef builder = bld->gallivm->builder;
2406 const struct lp_type type = bld->type;
2407 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2408
2409 assert(type.floating);
2410 assert(lp_check_value(type, a));
2411
2412 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2413 }
2414
2415
2416 /**
2417 * Return float (vector) rounded to nearest integer (vector). The returned
2418 * value is an integer (vector).
2419 * Ex: iround(0.9) = 1
2420 * Ex: iround(-1.5) = -2
2421 */
2422 LLVMValueRef
2423 lp_build_iround(struct lp_build_context *bld,
2424 LLVMValueRef a)
2425 {
2426 LLVMBuilderRef builder = bld->gallivm->builder;
2427 const struct lp_type type = bld->type;
2428 LLVMTypeRef int_vec_type = bld->int_vec_type;
2429 LLVMValueRef res;
2430
2431 assert(type.floating);
2432
2433 assert(lp_check_value(type, a));
2434
2435 if ((util_cpu_caps.has_sse2 &&
2436 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2437 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2438 return lp_build_iround_nearest_sse2(bld, a);
2439 }
2440 if (arch_rounding_available(type)) {
2441 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2442 }
2443 else {
2444 LLVMValueRef half;
2445
2446 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2447
2448 if (type.sign) {
2449 LLVMTypeRef vec_type = bld->vec_type;
2450 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2451 (unsigned long long)1 << (type.width - 1));
2452 LLVMValueRef sign;
2453
2454 /* get sign bit */
2455 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2456 sign = LLVMBuildAnd(builder, sign, mask, "");
2457
2458 /* sign * 0.5 */
2459 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2460 half = LLVMBuildOr(builder, sign, half, "");
2461 half = LLVMBuildBitCast(builder, half, vec_type, "");
2462 }
2463
2464 res = LLVMBuildFAdd(builder, a, half, "");
2465 }
2466
2467 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2468
2469 return res;
2470 }
2471
2472
2473 /**
2474 * Return floor of float (vector), result is an int (vector)
2475 * Ex: ifloor(1.1) = 1
2476 * Ex: ifloor(-1.1) = -2
2477 */
2478 LLVMValueRef
2479 lp_build_ifloor(struct lp_build_context *bld,
2480 LLVMValueRef a)
2481 {
2482 LLVMBuilderRef builder = bld->gallivm->builder;
2483 const struct lp_type type = bld->type;
2484 LLVMTypeRef int_vec_type = bld->int_vec_type;
2485 LLVMValueRef res;
2486
2487 assert(type.floating);
2488 assert(lp_check_value(type, a));
2489
2490 res = a;
2491 if (type.sign) {
2492 if (arch_rounding_available(type)) {
2493 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2494 }
2495 else {
2496 struct lp_type inttype;
2497 struct lp_build_context intbld;
2498 LLVMValueRef trunc, itrunc, mask;
2499
2500 assert(type.floating);
2501 assert(lp_check_value(type, a));
2502
2503 inttype = type;
2504 inttype.floating = 0;
2505 lp_build_context_init(&intbld, bld->gallivm, inttype);
2506
2507 /* round by truncation */
2508 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2509 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2510
2511 /*
2512 * fix values if rounding is wrong (for non-special cases)
2513 * - this is the case if trunc > a
2514 * The results of doing this with NaNs, very large values etc.
2515 * are undefined but this seems to be the case anyway.
2516 */
2517 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2518 /* cheapie minus one with mask since the mask is minus one / zero */
2519 return lp_build_add(&intbld, itrunc, mask);
2520 }
2521 }
2522
2523 /* convert to int, rounding toward zero */
2524 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2525
2526 return res;
2527 }
2528
2529
2530 /**
2531 * Return ceiling of float (vector), returning int (vector).
2532 * Ex: iceil( 1.1) = 2
2533 * Ex: iceil(-1.1) = -1
2534 */
2535 LLVMValueRef
2536 lp_build_iceil(struct lp_build_context *bld,
2537 LLVMValueRef a)
2538 {
2539 LLVMBuilderRef builder = bld->gallivm->builder;
2540 const struct lp_type type = bld->type;
2541 LLVMTypeRef int_vec_type = bld->int_vec_type;
2542 LLVMValueRef res;
2543
2544 assert(type.floating);
2545 assert(lp_check_value(type, a));
2546
2547 if (arch_rounding_available(type)) {
2548 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2549 }
2550 else {
2551 struct lp_type inttype;
2552 struct lp_build_context intbld;
2553 LLVMValueRef trunc, itrunc, mask;
2554
2555 assert(type.floating);
2556 assert(lp_check_value(type, a));
2557
2558 inttype = type;
2559 inttype.floating = 0;
2560 lp_build_context_init(&intbld, bld->gallivm, inttype);
2561
2562 /* round by truncation */
2563 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2564 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2565
2566 /*
2567 * fix values if rounding is wrong (for non-special cases)
2568 * - this is the case if trunc < a
2569 * The results of doing this with NaNs, very large values etc.
2570 * are undefined but this seems to be the case anyway.
2571 */
2572 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2573 /* cheapie plus one with mask since the mask is minus one / zero */
2574 return lp_build_sub(&intbld, itrunc, mask);
2575 }
2576
2577 /* convert to int, rounding toward zero */
2578 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2579
2580 return res;
2581 }
2582
2583
2584 /**
2585 * Combined ifloor() & fract().
2586 *
2587 * Preferred to calling the functions separately, as it will ensure that the
2588 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2589 */
2590 void
2591 lp_build_ifloor_fract(struct lp_build_context *bld,
2592 LLVMValueRef a,
2593 LLVMValueRef *out_ipart,
2594 LLVMValueRef *out_fpart)
2595 {
2596 LLVMBuilderRef builder = bld->gallivm->builder;
2597 const struct lp_type type = bld->type;
2598 LLVMValueRef ipart;
2599
2600 assert(type.floating);
2601 assert(lp_check_value(type, a));
2602
2603 if (arch_rounding_available(type)) {
2604 /*
2605 * floor() is easier.
2606 */
2607
2608 ipart = lp_build_floor(bld, a);
2609 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2610 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2611 }
2612 else {
2613 /*
2614 * ifloor() is easier.
2615 */
2616
2617 *out_ipart = lp_build_ifloor(bld, a);
2618 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2619 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2620 }
2621 }
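
/*
 * Reference-only usage sketch ("coord" is a hypothetical float vector
 * value, e.g. a texture coordinate):
 *
 *    LLVMValueRef ipart, fpart;
 *    lp_build_ifloor_fract(bld, coord, &ipart, &fpart);
 *
 * ipart receives the integer part as an int vector, fpart the fractional
 * part as a float vector; use lp_build_ifloor_fract_safe() instead if the
 * fractional part must stay strictly below one.
 */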
2622
2623
2624 /**
2625 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2626 * always smaller than one.
2627 */
2628 void
2629 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2630 LLVMValueRef a,
2631 LLVMValueRef *out_ipart,
2632 LLVMValueRef *out_fpart)
2633 {
2634 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2635 *out_fpart = clamp_fract(bld, *out_fpart);
2636 }
2637
2638
2639 LLVMValueRef
2640 lp_build_sqrt(struct lp_build_context *bld,
2641 LLVMValueRef a)
2642 {
2643 LLVMBuilderRef builder = bld->gallivm->builder;
2644 const struct lp_type type = bld->type;
2645 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2646 char intrinsic[32];
2647
2648 assert(lp_check_value(type, a));
2649
2650 assert(type.floating);
2651 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2652
2653 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2654 }
2655
2656
2657 /**
2658 * Do one Newton-Raphson step to improve reciprocal precision:
2659 *
2660 * x_{i+1} = x_i * (2 - a * x_i)
2661 *
2662 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2663 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2664 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2665 * halo. It would be necessary to clamp the argument to prevent this.
2666 *
2667 * See also:
2668 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2669 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2670 */
2671 static inline LLVMValueRef
2672 lp_build_rcp_refine(struct lp_build_context *bld,
2673 LLVMValueRef a,
2674 LLVMValueRef rcp_a)
2675 {
2676 LLVMBuilderRef builder = bld->gallivm->builder;
2677 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2678 LLVMValueRef res;
2679
2680 res = LLVMBuildFMul(builder, a, rcp_a, "");
2681 res = LLVMBuildFSub(builder, two, res, "");
2682 res = LLVMBuildFMul(builder, rcp_a, res, "");
2683
2684 return res;
2685 }
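
/*
 * Reference-only scalar equivalent of the refinement step above (plain C
 * floats rather than builder values; not used by the code itself):
 *
 *    static inline float
 *    rcp_refine_scalar(float a, float rcp_a)
 *    {
 *       return rcp_a * (2.0f - a * rcp_a);
 *    }
 *
 * Each step roughly doubles the number of correct bits of the estimate.
 */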
2686
2687
2688 LLVMValueRef
2689 lp_build_rcp(struct lp_build_context *bld,
2690 LLVMValueRef a)
2691 {
2692 LLVMBuilderRef builder = bld->gallivm->builder;
2693 const struct lp_type type = bld->type;
2694
2695 assert(lp_check_value(type, a));
2696
2697 if(a == bld->zero)
2698 return bld->undef;
2699 if(a == bld->one)
2700 return bld->one;
2701 if(a == bld->undef)
2702 return bld->undef;
2703
2704 assert(type.floating);
2705
2706 if(LLVMIsConstant(a))
2707 return LLVMConstFDiv(bld->one, a);
2708
2709 /*
2710 * We don't use RCPPS because:
2711 * - it only has 10 bits of precision
2712 * - it doesn't even get the reciprocal of 1.0 exactly
2713 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2714 * - for recent processors the benefit over DIVPS is marginal and case
2715 * dependent
2716 *
2717 * We could still use it on certain processors if benchmarks show that the
2718 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2719 * particular uses that require fewer workarounds.
2720 */
2721
2722 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2723 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2724 const unsigned num_iterations = 0;
2725 LLVMValueRef res;
2726 unsigned i;
2727 const char *intrinsic = NULL;
2728
2729 if (type.length == 4) {
2730 intrinsic = "llvm.x86.sse.rcp.ps";
2731 }
2732 else {
2733 intrinsic = "llvm.x86.avx.rcp.ps.256";
2734 }
2735
2736 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2737
2738 for (i = 0; i < num_iterations; ++i) {
2739 res = lp_build_rcp_refine(bld, a, res);
2740 }
2741
2742 return res;
2743 }
2744
2745 return LLVMBuildFDiv(builder, bld->one, a, "");
2746 }
2747
2748
2749 /**
2750 * Do one Newton-Raphson step to improve rsqrt precision:
2751 *
2752 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2753 *
2754 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2755 */
2756 static inline LLVMValueRef
2757 lp_build_rsqrt_refine(struct lp_build_context *bld,
2758 LLVMValueRef a,
2759 LLVMValueRef rsqrt_a)
2760 {
2761 LLVMBuilderRef builder = bld->gallivm->builder;
2762 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2763 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2764 LLVMValueRef res;
2765
2766 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2767 res = LLVMBuildFMul(builder, a, res, "");
2768 res = LLVMBuildFSub(builder, three, res, "");
2769 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2770 res = LLVMBuildFMul(builder, half, res, "");
2771
2772 return res;
2773 }
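
/*
 * Reference-only scalar equivalent of the step above (plain C floats,
 * not used by the code itself):
 *
 *    static inline float
 *    rsqrt_refine_scalar(float a, float rsqrt_a)
 *    {
 *       return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
 *    }
 */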
2774
2775
2776 /**
2777 * Generate 1/sqrt(a).
2778 * Result is undefined for values < 0, infinity for +0.
2779 */
2780 LLVMValueRef
2781 lp_build_rsqrt(struct lp_build_context *bld,
2782 LLVMValueRef a)
2783 {
2784 const struct lp_type type = bld->type;
2785
2786 assert(lp_check_value(type, a));
2787
2788 assert(type.floating);
2789
2790 /*
2791 * This should be faster but all denormals will end up as infinity.
2792 */
2793 if (0 && lp_build_fast_rsqrt_available(type)) {
2794 const unsigned num_iterations = 1;
2795 LLVMValueRef res;
2796 unsigned i;
2797
2798 /* rsqrt(1.0) != 1.0 here */
2799 res = lp_build_fast_rsqrt(bld, a);
2800
2801 if (num_iterations) {
2802 /*
2803 * Newton-Raphson will result in NaN instead of infinity for zero,
2804 * and NaN instead of zero for infinity.
2805 * Also, need to ensure rsqrt(1.0) == 1.0.
2806 * All numbers smaller than FLT_MIN will result in +infinity
2807 * (rsqrtps treats all denormals as zero).
2808 */
2809 LLVMValueRef cmp;
2810 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2811 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2812
2813 for (i = 0; i < num_iterations; ++i) {
2814 res = lp_build_rsqrt_refine(bld, a, res);
2815 }
2816 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2817 res = lp_build_select(bld, cmp, inf, res);
2818 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2819 res = lp_build_select(bld, cmp, bld->zero, res);
2820 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2821 res = lp_build_select(bld, cmp, bld->one, res);
2822 }
2823
2824 return res;
2825 }
2826
2827 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2828 }
2829
2830 /**
2831 * If there's a fast (inaccurate) rsqrt instruction available
2832 * (the caller may want to avoid calling rsqrt_fast if it's not available;
2833 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
2834 * unavailable it would end up as sqrt/div/mul, so it's obviously much
2835 * better to just call sqrt, skipping both the div and the mul).
2836 */
2837 boolean
2838 lp_build_fast_rsqrt_available(struct lp_type type)
2839 {
2840 assert(type.floating);
2841
2842 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2843 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2844 return true;
2845 }
2846 return false;
2847 }
2848
2849
2850 /**
2851 * Generate 1/sqrt(a).
2852 * Result is undefined for values < 0, infinity for +0.
2853 * Precision is limited, only ~10 bits guaranteed
2854 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2855 */
2856 LLVMValueRef
2857 lp_build_fast_rsqrt(struct lp_build_context *bld,
2858 LLVMValueRef a)
2859 {
2860 LLVMBuilderRef builder = bld->gallivm->builder;
2861 const struct lp_type type = bld->type;
2862
2863 assert(lp_check_value(type, a));
2864
2865 if (lp_build_fast_rsqrt_available(type)) {
2866 const char *intrinsic = NULL;
2867
2868 if (type.length == 4) {
2869 intrinsic = "llvm.x86.sse.rsqrt.ps";
2870 }
2871 else {
2872 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2873 }
2874 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2875 }
2876 else {
2877 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2878 }
2879 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2880 }
2881
2882
2883 /**
2884 * Generate sin(a) or cos(a) using polynomial approximation.
2885 * TODO: it might be worth recognizing sin and cos with the same source
2886 * (i.e. the d3d10 sincos opcode). Obviously computing both at the same
2887 * time would be way cheaper than calculating (nearly) everything twice...
2888 * Not sure it's common enough to be worth bothering, however; the scs
2889 * opcode could also benefit from calculating both at once.
2890 */
2891 static LLVMValueRef
2892 lp_build_sin_or_cos(struct lp_build_context *bld,
2893 LLVMValueRef a,
2894 boolean cos)
2895 {
2896 struct gallivm_state *gallivm = bld->gallivm;
2897 LLVMBuilderRef b = gallivm->builder;
2898 struct lp_type int_type = lp_int_type(bld->type);
2899
2900 /*
2901 * take the absolute value,
2902 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2903 */
2904
2905 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2906 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2907
2908 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2909 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2910
2911 /*
2912 * scale by 4/Pi
2913 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2914 */
2915
2916 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2917 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2918
2919 /*
2920 * store the integer part of y in mm0
2921 * emm2 = _mm_cvttps_epi32(y);
2922 */
2923
2924 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2925
2926 /*
2927 * j=(j+1) & (~1) (see the cephes sources)
2928 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2929 */
2930
2931 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2932 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2933 /*
2934 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2935 */
2936 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2937 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2938
2939 /*
2940 * y = _mm_cvtepi32_ps(emm2);
2941 */
2942 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2943
2944 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2945 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2946 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2947 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2948
2949 /*
2950 * Argument used for poly selection and sign bit determination
2951 * is different for sin vs. cos.
2952 */
2953 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2954 emm2_and;
2955
2956 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2957 LLVMBuildNot(b, emm2_2, ""), ""),
2958 const_29, "sign_bit") :
2959 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2960 LLVMBuildShl(b, emm2_add,
2961 const_29, ""), ""),
2962 sign_mask, "sign_bit");
2963
2964 /*
2965 * get the polynomial selection mask
2966 * there is one polynomial for 0 <= x <= Pi/4
2967 * and another one for Pi/4 < x <= Pi/2
2968 * Both branches will be computed.
2969 *
2970 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2971 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2972 */
2973
2974 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2975 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2976 int_type, PIPE_FUNC_EQUAL,
2977 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2978
2979 /*
2980 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2981 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2982 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2983 */
2984 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2985 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2986 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2987
2988 /*
2989 * The magic pass: "Extended precision modular arithmetic"
2990 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2991 */
2992 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2993 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2994 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2995
2996 /*
2997 * Evaluate the first polynomial (0 <= x <= Pi/4)
2998 *
2999 * z = _mm_mul_ps(x,x);
3000 */
3001 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3002
3003 /*
3004 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3005 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3006 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3007 */
3008 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3009 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3010 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3011
3012 /*
3013 * y = *(v4sf*)_ps_coscof_p0;
3014 * y = _mm_mul_ps(y, z);
3015 */
3016 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3017 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3018 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3019 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3020
3021
3022 /*
3023 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3024 * y = _mm_sub_ps(y, tmp);
3025 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3026 */
3027 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3028 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3029 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3030 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3031 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3032
3033 /*
3034 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3035 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3036 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3037 */
3038 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3039 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3040 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3041
3042 /*
3043 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
3044 *
3045 * y2 = *(v4sf*)_ps_sincof_p0;
3046 * y2 = _mm_mul_ps(y2, z);
3047 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3048 * y2 = _mm_mul_ps(y2, z);
3049 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3050 * y2 = _mm_mul_ps(y2, z);
3051 * y2 = _mm_mul_ps(y2, x);
3052 * y2 = _mm_add_ps(y2, x);
3053 */
3054
3055 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3056 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3057 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3058 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3059
3060 /*
3061 * select the correct result from the two polynomials
3062 * xmm3 = poly_mask;
3063 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3064 * y = _mm_andnot_ps(xmm3, y);
3065 * y = _mm_or_ps(y,y2);
3066 */
3067 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3068 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3069 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3070 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3071 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3072 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3073
3074 /*
3075 * update the sign
3076 * y = _mm_xor_ps(y, sign_bit);
3077 */
3078 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3079 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3080
3081 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3082
3083 /* clamp output to be within [-1, 1] */
3084 y_result = lp_build_clamp(bld, y_result,
3085 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3086 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3087 /* If a is -inf, inf or NaN then return NaN */
3088 y_result = lp_build_select(bld, isfinite, y_result,
3089 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3090 return y_result;
3091 }
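
/*
 * Reference-only scalar outline of the Cephes-style range reduction used
 * above (no special-case handling; DP1/DP2/DP3 are the negative constants
 * defined in the function body):
 *
 *    x = fabsf(a);
 *    j = (int)(x * 4.0f / M_PI);
 *    j = (j + 1) & ~1;
 *    y = (float)j;
 *    x = ((x + y * DP1) + y * DP2) + y * DP3;
 *
 * j is the octant index, rounded up to an even value; the last line is
 * x - y*Pi/4 evaluated in extended precision. The octant index then selects
 * between the sine and cosine polynomials and determines the sign bit.
 */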
3092
3093
3094 /**
3095 * Generate sin(a)
3096 */
3097 LLVMValueRef
3098 lp_build_sin(struct lp_build_context *bld,
3099 LLVMValueRef a)
3100 {
3101 return lp_build_sin_or_cos(bld, a, FALSE);
3102 }
3103
3104
3105 /**
3106 * Generate cos(a)
3107 */
3108 LLVMValueRef
3109 lp_build_cos(struct lp_build_context *bld,
3110 LLVMValueRef a)
3111 {
3112 return lp_build_sin_or_cos(bld, a, TRUE);
3113 }
3114
3115
3116 /**
3117 * Generate pow(x, y)
3118 */
3119 LLVMValueRef
3120 lp_build_pow(struct lp_build_context *bld,
3121 LLVMValueRef x,
3122 LLVMValueRef y)
3123 {
3124 /* TODO: optimize the constant case */
3125 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3126 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3127 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3128 __FUNCTION__);
3129 }
3130
3131 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3132 }
3133
3134
3135 /**
3136 * Generate exp(x)
3137 */
3138 LLVMValueRef
3139 lp_build_exp(struct lp_build_context *bld,
3140 LLVMValueRef x)
3141 {
3142 /* log2(e) = 1/log(2) */
3143 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3144 1.4426950408889634);
3145
3146 assert(lp_check_value(bld->type, x));
3147
3148 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3149 }
3150
3151
3152 /**
3153 * Generate log(x)
3154 * Behavior is undefined with infs, 0s and nans
3155 */
3156 LLVMValueRef
3157 lp_build_log(struct lp_build_context *bld,
3158 LLVMValueRef x)
3159 {
3160 /* log(2) */
3161 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3162 0.69314718055994529);
3163
3164 assert(lp_check_value(bld->type, x));
3165
3166 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3167 }
3168
3169 /**
3170 * Generate log(x) that handles edge cases (infs, 0s and nans)
3171 */
3172 LLVMValueRef
3173 lp_build_log_safe(struct lp_build_context *bld,
3174 LLVMValueRef x)
3175 {
3176 /* log(2) */
3177 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3178 0.69314718055994529);
3179
3180 assert(lp_check_value(bld->type, x));
3181
3182 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3183 }
3184
3185
3186 /**
3187 * Generate polynomial.
3188 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3189 */
3190 LLVMValueRef
3191 lp_build_polynomial(struct lp_build_context *bld,
3192 LLVMValueRef x,
3193 const double *coeffs,
3194 unsigned num_coeffs)
3195 {
3196 const struct lp_type type = bld->type;
3197 LLVMValueRef even = NULL, odd = NULL;
3198 LLVMValueRef x2;
3199 unsigned i;
3200
3201 assert(lp_check_value(bld->type, x));
3202
3203 /* TODO: optimize the constant case */
3204 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3205 LLVMIsConstant(x)) {
3206 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3207 __FUNCTION__);
3208 }
3209
3210 /*
3211 * Calculate odd and even terms separately to decrease data dependency
3212 * Ex:
3213 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3214 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3215 */
3216 x2 = lp_build_mul(bld, x, x);
3217
3218 for (i = num_coeffs; i--; ) {
3219 LLVMValueRef coeff;
3220
3221 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3222
3223 if (i % 2 == 0) {
3224 if (even)
3225 even = lp_build_mad(bld, x2, even, coeff);
3226 else
3227 even = coeff;
3228 } else {
3229 if (odd)
3230 odd = lp_build_mad(bld, x2, odd, coeff);
3231 else
3232 odd = coeff;
3233 }
3234 }
3235
3236 if (odd)
3237 return lp_build_mad(bld, odd, x, even);
3238 else if (even)
3239 return even;
3240 else
3241 return bld->undef;
3242 }
3243
3244
3245 /**
3246 * Minimax polynomial fit of 2**x, in range [0, 1[
3247 */
3248 const double lp_build_exp2_polynomial[] = {
3249 #if EXP_POLY_DEGREE == 5
3250 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3251 0.693153073200168932794,
3252 0.240153617044375388211,
3253 0.0558263180532956664775,
3254 0.00898934009049466391101,
3255 0.00187757667519147912699
3256 #elif EXP_POLY_DEGREE == 4
3257 1.00000259337069434683,
3258 0.693003834469974940458,
3259 0.24144275689150793076,
3260 0.0520114606103070150235,
3261 0.0135341679161270268764
3262 #elif EXP_POLY_DEGREE == 3
3263 0.999925218562710312959,
3264 0.695833540494823811697,
3265 0.226067155427249155588,
3266 0.0780245226406372992967
3267 #elif EXP_POLY_DEGREE == 2
3268 1.00172476321474503578,
3269 0.657636275736077639316,
3270 0.33718943461968720704
3271 #else
3272 #error
3273 #endif
3274 };
3275
3276
3277 LLVMValueRef
3278 lp_build_exp2(struct lp_build_context *bld,
3279 LLVMValueRef x)
3280 {
3281 LLVMBuilderRef builder = bld->gallivm->builder;
3282 const struct lp_type type = bld->type;
3283 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3284 LLVMValueRef ipart = NULL;
3285 LLVMValueRef fpart = NULL;
3286 LLVMValueRef expipart = NULL;
3287 LLVMValueRef expfpart = NULL;
3288 LLVMValueRef res = NULL;
3289
3290 assert(lp_check_value(bld->type, x));
3291
3292 /* TODO: optimize the constant case */
3293 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3294 LLVMIsConstant(x)) {
3295 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3296 __FUNCTION__);
3297 }
3298
3299 assert(type.floating && type.width == 32);
3300
3301 /* We want to preserve NaN and make sure that for exp2, if x > 128,
3302 * the result is INF and if it's smaller than -126.9 the result is 0 */
3303 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3304 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3305 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3306 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3307
3308 /* ipart = floor(x) */
3309 /* fpart = x - ipart */
3310 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3311
3312 /* expipart = (float) (1 << ipart) */
3313 expipart = LLVMBuildAdd(builder, ipart,
3314 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3315 expipart = LLVMBuildShl(builder, expipart,
3316 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3317 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3318
3319 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3320 ARRAY_SIZE(lp_build_exp2_polynomial));
3321
3322 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3323
3324 return res;
3325 }
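
/*
 * Reference-only scalar sketch of the decomposition used above (assumes
 * <math.h>; ldexpf() stands in for the shift-into-exponent bit trick done
 * on the vector path):
 *
 *    static inline float
 *    exp2_scalar_sketch(float x)
 *    {
 *       const float ipart = floorf(x);
 *       const float fpart = x - ipart;
 *       float poly = 0.0f;
 *       for (int i = ARRAY_SIZE(lp_build_exp2_polynomial); i--; )
 *          poly = poly * fpart + (float)lp_build_exp2_polynomial[i];
 *       return ldexpf(poly, (int)ipart);
 *    }
 */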
3326
3327
3328
3329 /**
3330 * Extract the exponent of an IEEE-754 floating point value.
3331 *
3332 * Optionally apply an integer bias.
3333 *
3334 * Result is an integer value with
3335 *
3336 * ifloor(log2(x)) + bias
3337 */
3338 LLVMValueRef
3339 lp_build_extract_exponent(struct lp_build_context *bld,
3340 LLVMValueRef x,
3341 int bias)
3342 {
3343 LLVMBuilderRef builder = bld->gallivm->builder;
3344 const struct lp_type type = bld->type;
3345 unsigned mantissa = lp_mantissa(type);
3346 LLVMValueRef res;
3347
3348 assert(type.floating);
3349
3350 assert(lp_check_value(bld->type, x));
3351
3352 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3353
3354 res = LLVMBuildLShr(builder, x,
3355 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3356 res = LLVMBuildAnd(builder, res,
3357 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3358 res = LLVMBuildSub(builder, res,
3359 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3360
3361 return res;
3362 }
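
/*
 * Reference-only scalar equivalent for 32-bit floats (assumes <stdint.h>;
 * not used by the code itself):
 *
 *    static inline int
 *    extract_exponent_scalar(float x, int bias)
 *    {
 *       union { float f; uint32_t i; } u;
 *       u.f = x;
 *       return (int)((u.i >> 23) & 0xff) - 127 + bias;
 *    }
 */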
3363
3364
3365 /**
3366 * Extract the mantissa of a floating point value.
3367 *
3368 * Result is a floating point value with
3369 *
3370 * x / 2**floor(log2(x))
3371 */
3372 LLVMValueRef
3373 lp_build_extract_mantissa(struct lp_build_context *bld,
3374 LLVMValueRef x)
3375 {
3376 LLVMBuilderRef builder = bld->gallivm->builder;
3377 const struct lp_type type = bld->type;
3378 unsigned mantissa = lp_mantissa(type);
3379 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3380 (1ULL << mantissa) - 1);
3381 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3382 LLVMValueRef res;
3383
3384 assert(lp_check_value(bld->type, x));
3385
3386 assert(type.floating);
3387
3388 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3389
3390 /* res = x / 2**ipart */
3391 res = LLVMBuildAnd(builder, x, mantmask, "");
3392 res = LLVMBuildOr(builder, res, one, "");
3393 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3394
3395 return res;
3396 }
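
/*
 * Reference-only scalar equivalent for 32-bit floats (assumes <stdint.h>;
 * not used by the code itself):
 *
 *    static inline float
 *    extract_mantissa_scalar(float x)
 *    {
 *       union { float f; uint32_t i; } u;
 *       u.f = x;
 *       u.i = (u.i & 0x007fffff) | 0x3f800000;
 *       return u.f;
 *    }
 *
 * The result lies in [1.0, 2.0).
 */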
3397
3398
3399
3400 /**
3401 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3402 * These coefficients can be generated with
3403 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3404 */
3405 const double lp_build_log2_polynomial[] = {
3406 #if LOG_POLY_DEGREE == 5
3407 2.88539008148777786488L,
3408 0.961796878841293367824L,
3409 0.577058946784739859012L,
3410 0.412914355135828735411L,
3411 0.308591899232910175289L,
3412 0.352376952300281371868L,
3413 #elif LOG_POLY_DEGREE == 4
3414 2.88539009343309178325L,
3415 0.961791550404184197881L,
3416 0.577440339438736392009L,
3417 0.403343858251329912514L,
3418 0.406718052498846252698L,
3419 #elif LOG_POLY_DEGREE == 3
3420 2.88538959748872753838L,
3421 0.961932915889597772928L,
3422 0.571118517972136195241L,
3423 0.493997535084709500285L,
3424 #else
3425 #error
3426 #endif
3427 };
3428
3429 /**
3430 * See http://www.devmaster.net/forums/showthread.php?p=43580
3431 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3432 * http://www.nezumi.demon.co.uk/consult/logx.htm
3433 *
3434 * If handle_edge_cases is true the function will perform computations
3435 * to match the required D3D10+ behavior for each of the edge cases.
3436 * That means that if input is:
3437 * - less than zero (down to and including -inf), then NaN will be returned
3438 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3439 * - +infinity, then +infinity will be returned
3440 * - NaN, then NaN will be returned
3441 *
3442 * Those checks are fairly expensive so if you don't need them make sure
3443 * handle_edge_cases is false.
3444 */
3445 void
3446 lp_build_log2_approx(struct lp_build_context *bld,
3447 LLVMValueRef x,
3448 LLVMValueRef *p_exp,
3449 LLVMValueRef *p_floor_log2,
3450 LLVMValueRef *p_log2,
3451 boolean handle_edge_cases)
3452 {
3453 LLVMBuilderRef builder = bld->gallivm->builder;
3454 const struct lp_type type = bld->type;
3455 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3456 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3457
3458 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3459 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3460 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3461
3462 LLVMValueRef i = NULL;
3463 LLVMValueRef y = NULL;
3464 LLVMValueRef z = NULL;
3465 LLVMValueRef exp = NULL;
3466 LLVMValueRef mant = NULL;
3467 LLVMValueRef logexp = NULL;
3468 LLVMValueRef p_z = NULL;
3469 LLVMValueRef res = NULL;
3470
3471 assert(lp_check_value(bld->type, x));
3472
3473 if(p_exp || p_floor_log2 || p_log2) {
3474 /* TODO: optimize the constant case */
3475 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3476 LLVMIsConstant(x)) {
3477 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3478 __FUNCTION__);
3479 }
3480
3481 assert(type.floating && type.width == 32);
3482
3483 /*
3484 * We don't explicitly handle denormalized numbers. They will yield a
3485 * result in the neighbourhood of -127, which appears to be
3486 * adequate.
3487 */
3488
3489 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3490
3491 /* exp = (float) exponent(x) */
3492 exp = LLVMBuildAnd(builder, i, expmask, "");
3493 }
3494
3495 if(p_floor_log2 || p_log2) {
3496 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3497 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3498 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3499 }
3500
3501 if (p_log2) {
3502 /* mant = 1 + (float) mantissa(x) */
3503 mant = LLVMBuildAnd(builder, i, mantmask, "");
3504 mant = LLVMBuildOr(builder, mant, one, "");
3505 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3506
3507 /* y = (mant - 1) / (mant + 1) */
3508 y = lp_build_div(bld,
3509 lp_build_sub(bld, mant, bld->one),
3510 lp_build_add(bld, mant, bld->one)
3511 );
3512
3513 /* z = y^2 */
3514 z = lp_build_mul(bld, y, y);
3515
3516 /* compute P(z) */
3517 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3518 ARRAY_SIZE(lp_build_log2_polynomial));
3519
3520 /* y * P(z) + logexp */
3521 res = lp_build_mad(bld, y, p_z, logexp);
3522
3523 if (type.floating && handle_edge_cases) {
3524 LLVMValueRef negmask, infmask, zmask;
3525 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3526 lp_build_const_vec(bld->gallivm, type, 0.0f));
3527 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3528 lp_build_const_vec(bld->gallivm, type, 0.0f));
3529 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3530 lp_build_const_vec(bld->gallivm, type, INFINITY));
3531
3532 /* If x is equal to inf make sure we return inf */
3533 res = lp_build_select(bld, infmask,
3534 lp_build_const_vec(bld->gallivm, type, INFINITY),
3535 res);
3536 /* If x is equal to 0, return -inf */
3537 res = lp_build_select(bld, zmask,
3538 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3539 res);
3540 /* If x is NaN or less than 0, return NaN */
3541 res = lp_build_select(bld, negmask,
3542 lp_build_const_vec(bld->gallivm, type, NAN),
3543 res);
3544 }
3545 }
3546
3547 if (p_exp) {
3548 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3549 *p_exp = exp;
3550 }
3551
3552 if (p_floor_log2)
3553 *p_floor_log2 = logexp;
3554
3555 if (p_log2)
3556 *p_log2 = res;
3557 }
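
/*
 * For reference, the core identity behind the polynomial evaluation above,
 * for a mantissa m in [1, 2):
 *
 *    y       = (m - 1) / (m + 1)
 *    log2(m) = log2((1 + y) / (1 - y)) ~= y * P(y^2)
 *
 * so log2(x) ~= floor(log2(x)) + y * P(y^2), with P given by
 * lp_build_log2_polynomial (its constant term is 2/ln(2) ~= 2.885).
 */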
3558
3559
3560 /*
3561 * log2 implementation which doesn't have special code to
3562 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3563 * the results for those cases are undefined.
3564 */
3565 LLVMValueRef
3566 lp_build_log2(struct lp_build_context *bld,
3567 LLVMValueRef x)
3568 {
3569 LLVMValueRef res;
3570 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3571 return res;
3572 }
3573
3574 /*
3575 * Version of log2 which handles all edge cases.
3576 * Look at documentation of lp_build_log2_approx for
3577 * description of the behavior for each of the edge cases.
3578 */
3579 LLVMValueRef
3580 lp_build_log2_safe(struct lp_build_context *bld,
3581 LLVMValueRef x)
3582 {
3583 LLVMValueRef res;
3584 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3585 return res;
3586 }
3587
3588
3589 /**
3590 * Faster (and less accurate) log2.
3591 *
3592 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3593 *
3594 * Piece-wise linear approximation, with exact results when x is a
3595 * power of two.
3596 *
3597 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3598 */
3599 LLVMValueRef
3600 lp_build_fast_log2(struct lp_build_context *bld,
3601 LLVMValueRef x)
3602 {
3603 LLVMBuilderRef builder = bld->gallivm->builder;
3604 LLVMValueRef ipart;
3605 LLVMValueRef fpart;
3606
3607 assert(lp_check_value(bld->type, x));
3608
3609 assert(bld->type.floating);
3610
3611 /* ipart = floor(log2(x)) - 1 */
3612 ipart = lp_build_extract_exponent(bld, x, -1);
3613 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3614
3615 /* fpart = x / 2**ipart */
3616 fpart = lp_build_extract_mantissa(bld, x);
3617
3618 /* ipart + fpart */
3619 return LLVMBuildFAdd(builder, ipart, fpart, "");
3620 }
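
/*
 * Reference-only scalar equivalent for 32-bit floats (assumes <stdint.h>;
 * not used by the code itself):
 *
 *    static inline float
 *    fast_log2_scalar(float x)
 *    {
 *       union { float f; uint32_t i; } u;
 *       u.f = x;
 *       int ipart = (int)((u.i >> 23) & 0xff) - 128;
 *       u.i = (u.i & 0x007fffff) | 0x3f800000;
 *       return (float)ipart + u.f;
 *    }
 */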
3621
3622
3623 /**
3624 * Fast implementation of iround(log2(x)).
3625 *
3626 * Not an approximation -- it should give accurate results all the time.
3627 */
3628 LLVMValueRef
3629 lp_build_ilog2(struct lp_build_context *bld,
3630 LLVMValueRef x)
3631 {
3632 LLVMBuilderRef builder = bld->gallivm->builder;
3633 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3634 LLVMValueRef ipart;
3635
3636 assert(bld->type.floating);
3637
3638 assert(lp_check_value(bld->type, x));
3639
3640    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3641 x = LLVMBuildFMul(builder, x, sqrt2, "");
3642
3643 /* ipart = floor(log2(x) + 0.5) */
3644 ipart = lp_build_extract_exponent(bld, x, 0);
3645
3646 return ipart;
3647 }
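
/*
 * Note: multiplying by sqrt(2) adds 0.5 to log2(x) before the exponent is
 * extracted, so the function above effectively computes
 * floor(log2(x) + 0.5), i.e. roughly the scalar expression
 * (int)floorf(log2f(x) + 0.5f) for positive finite x (an illustrative
 * equivalence, not code taken from this file).
 */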
3648
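/**
 * Generate x % y.
 *
 * Maps to frem for floating point types (fmod-like: the result takes the
 * sign of x), to srem for signed integer types and to urem for unsigned
 * integer types.
 */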
3649 LLVMValueRef
3650 lp_build_mod(struct lp_build_context *bld,
3651 LLVMValueRef x,
3652 LLVMValueRef y)
3653 {
3654 LLVMBuilderRef builder = bld->gallivm->builder;
3655 LLVMValueRef res;
3656 const struct lp_type type = bld->type;
3657
3658 assert(lp_check_value(type, x));
3659 assert(lp_check_value(type, y));
3660
3661 if (type.floating)
3662 res = LLVMBuildFRem(builder, x, y, "");
3663 else if (type.sign)
3664 res = LLVMBuildSRem(builder, x, y, "");
3665 else
3666 res = LLVMBuildURem(builder, x, y, "");
3667 return res;
3668 }
3669
3670
3671 /*
3672  * For floating point inputs, creates and returns a per-channel mask
3673  * which is all 1's for channels whose value is NaN and all 0's for
3674  * channels which are not NaN.
3675 */
3676 LLVMValueRef
3677 lp_build_isnan(struct lp_build_context *bld,
3678 LLVMValueRef x)
3679 {
3680 LLVMValueRef mask;
3681 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3682
3683 assert(bld->type.floating);
3684 assert(lp_check_value(bld->type, x));
3685
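   /*
    * NaN is the only value which does not compare ordered-equal to itself,
    * so "x == x" being false identifies the NaN channels; the sign
    * extension below widens the i1 results into an all-1s / all-0s mask.
    */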
3686 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3687 "isnotnan");
3688 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3689 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3690 return mask;
3691 }
3692
3693 /* Returns all 1's for floating point values which are
3694  * finite, and all 0's for -inf, +inf
3695  * and NaN. */
3696 LLVMValueRef
3697 lp_build_isfinite(struct lp_build_context *bld,
3698 LLVMValueRef x)
3699 {
3700 LLVMBuilderRef builder = bld->gallivm->builder;
3701 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3702 struct lp_type int_type = lp_int_type(bld->type);
3703 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3704 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3705 0x7f800000);
3706
3707 if (!bld->type.floating) {
3708 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3709 }
3710 assert(bld->type.floating);
3711 assert(lp_check_value(bld->type, x));
3712 assert(bld->type.width == 32);
3713
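   /*
    * In IEEE single precision, a value is Inf or NaN exactly when all of
    * its exponent bits (0x7f800000) are set, so the value is finite when
    * the masked exponent field differs from 0x7f800000.
    */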
3714 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3715 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3716 intx, infornan32);
3717 }
3718
3719 /*
3720  * Returns a mask which is all 1's for channels which are NaN or +/-inf
3721  * and all 0's otherwise. The input has to be a floating point vector.
3722  */
3723 LLVMValueRef
3724 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3725 const struct lp_type type,
3726 LLVMValueRef x)
3727 {
3728 LLVMBuilderRef builder = gallivm->builder;
3729 struct lp_type int_type = lp_int_type(type);
3730 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3731 0x7f800000);
3732 LLVMValueRef ret;
3733
3734 assert(type.floating);
3735
3736 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3737 ret = LLVMBuildAnd(builder, ret, const0, "");
3738 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3739 ret, const0);
3740
3741 return ret;
3742 }
3743
3744
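/**
 * Save the current x86 MXCSR (SSE floating point control/status) register.
 *
 * Emits stmxcsr into a 32-bit stack slot and returns a pointer to it, or
 * NULL when SSE is not available. A typical save/restore pattern would be
 * (a sketch; actual callers may differ):
 *
 *    LLVMValueRef saved_state = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit code that should run with FTZ/DAZ enabled ...
 *    lp_build_fpstate_set(gallivm, saved_state);
 */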
3745 LLVMValueRef
3746 lp_build_fpstate_get(struct gallivm_state *gallivm)
3747 {
3748 if (util_cpu_caps.has_sse) {
3749 LLVMBuilderRef builder = gallivm->builder;
3750 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3751 gallivm,
3752 LLVMInt32TypeInContext(gallivm->context),
3753 "mxcsr_ptr");
3754 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3755 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3756 lp_build_intrinsic(builder,
3757 "llvm.x86.sse.stmxcsr",
3758 LLVMVoidTypeInContext(gallivm->context),
3759 &mxcsr_ptr8, 1, 0);
3760 return mxcsr_ptr;
3761 }
3762 return 0;
3763 }
3764
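/**
 * Enable or disable flushing of denormals in the x86 MXCSR register.
 *
 * When 'zero' is true this sets the flush-to-zero bit (denormal results are
 * flushed to zero) and, if the CPU supports it, the denormals-are-zero bit
 * (denormal inputs are treated as zero); when false it clears those bits
 * again. No-op when SSE is not available.
 */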
3765 void
3766 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3767 boolean zero)
3768 {
3769 if (util_cpu_caps.has_sse) {
3770       /* turn on FTZ (32768), plus DAZ (64) when the CPU supports it */
3771 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3772
3773 LLVMBuilderRef builder = gallivm->builder;
3774 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3775 LLVMValueRef mxcsr =
3776 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3777
3778 if (util_cpu_caps.has_daz) {
3779          /* Also enable denormals-are-zero mode */
3780 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3781 }
3782 if (zero) {
3783 mxcsr = LLVMBuildOr(builder, mxcsr,
3784 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3785 } else {
3786 mxcsr = LLVMBuildAnd(builder, mxcsr,
3787 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3788 }
3789
3790 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3791 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3792 }
3793 }
3794
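/**
 * Restore the x86 MXCSR register from the 32-bit slot pointed to by
 * mxcsr_ptr (as returned by lp_build_fpstate_get), via ldmxcsr.
 * No-op when SSE is not available.
 */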
3795 void
3796 lp_build_fpstate_set(struct gallivm_state *gallivm,
3797 LLVMValueRef mxcsr_ptr)
3798 {
3799 if (util_cpu_caps.has_sse) {
3800 LLVMBuilderRef builder = gallivm->builder;
3801 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3802 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3803 lp_build_intrinsic(builder,
3804 "llvm.x86.sse.ldmxcsr",
3805 LLVMVoidTypeInContext(gallivm->context),
3806 &mxcsr_ptr, 1, 0);
3807 }
3808 }