gallivm: fix [IU]MUL_HI regression
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
82
83 /**
84 * Generate min(a, b)
85 * No checks for special-case values of a or b (0 or 1) are done.
86 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212 /* We need to handle NaNs for floating point numbers. If one of the
213 * inputs is a NaN the other should be returned (required by both D3D10+
214 * and OpenCL).
215 * The SSE intrinsics return the second operand in case of a NaN by
216 * default, so we need special code to handle those cases.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
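
/*
 * A minimal scalar sketch (illustration only, not referenced by the code
 * here) of the NaN rule described above for GALLIVM_NAN_RETURN_OTHER:
 * when exactly one of the two inputs is a NaN, the other input is returned.
 */
static inline float
lp_ref_min_nan_return_other(float a, float b)
{
   if (a != a)                /* a is NaN -> return the other input */
      return b;
   if (b != b)                /* b is NaN -> return the other input */
      return a;
   return a < b ? a : b;
}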
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289 /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290 * not supported, and instead it falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303 * No checks for special-case values of a or b (0 or 1) are done.
304 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359 if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if(a == bld->zero)
545 return b;
546 if(b == bld->zero)
547 return a;
548 if(a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if(bld->type.norm) {
552 const char *intrinsic = NULL;
553
554 if(a == bld->one || b == bld->one)
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if(util_cpu_caps.has_sse2) {
560 if(type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if(type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if(type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if(type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if(util_cpu_caps.has_avx2) {
573 if(type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if(type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
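
/*
 * Scalar sketch (illustration only, not used by this file) of the clamping
 * idea used above for signed saturated add: clamp a so that a + b can never
 * leave the representable range, then add. Shown here for 16 bit values.
 */
static inline int
lp_ref_padds_i16(int a, int b)
{
   if (b > 0 && a > 32767 - b)      /* maximum a for positive b */
      a = 32767 - b;
   if (b < 0 && a < -32768 - b)     /* minimum a for negative b */
      a = -32768 - b;
   return a + b;
}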
619
620
621 /** Return the scalar sum of the elements of a.
622 * Should avoid this operation whenever possible.
623 */
624 LLVMValueRef
625 lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
644 /*
645 * for byte vectors we could do much better with psadbw.
646 * Using repeated shuffle/adds here. Note with multiple vectors
647 * this can be done more efficiently as outlined in the intel
648 * optimization manual.
649 * Note: could cause data rearrangement if used with smaller element
650 * sizes.
651 */
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
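
/*
 * Scalar sketch (illustration only, not used by this file) of the reduction
 * pattern above: the vector is repeatedly folded in half and the two halves
 * added, so a length-n sum needs log2(n) adds per lane. Assumes a power of
 * two length, as the builder above does.
 */
static inline float
lp_ref_horizontal_add(const float *a, unsigned length)
{
   float tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   for (i = 0; i < length; i++)
      tmp[i] = a[i];

   while (length > 1) {
      length /= 2;
      for (i = 0; i < length; i++)
         tmp[i] += tmp[i + length];    /* add upper half onto lower half */
   }
   return tmp[0];
}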
689
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
692 * This uses the technique outlined in the Intel Optimization Manual.
693 */
694 static LLVMValueRef
695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
743
744
745 /*
746 * partially horizontally add 2-4 float vectors with length nx4,
747 * i.e. only four adjacent values in each vector will be added,
748 * assuming values are really grouped in 4 which also determines
749 * output order.
750 *
751 * Return a vector of the same length as the initial vectors,
752 * with the excess elements (if any) being undefined.
753 * The element order is independent of number of input vectors.
754 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755 * the output order thus will be
756 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
757 */
758 LLVMValueRef
759 lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on cpu) and we always need two horizontal adds anyway,
774 * so a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
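
/*
 * Scalar sketch (illustration only, not used by this file) of the output
 * ordering produced above: result element 4*j + i holds the sum of the j-th
 * group of four adjacent values of input vector i, and lanes with no
 * corresponding input are undefined (zeroed here for simplicity).
 */
static inline void
lp_ref_hadd_partial4(const float *vecs[4], unsigned num_vecs,
                     unsigned length, float *out)
{
   unsigned i, j;

   for (j = 0; j < length / 4; j++) {
      for (i = 0; i < 4; i++) {
         const float *v = i < num_vecs ? vecs[i] : NULL;
         out[4 * j + i] = v ? v[4*j] + v[4*j+1] + v[4*j+2] + v[4*j+3] : 0.0f;
      }
   }
}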
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
834 lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if(b == bld->zero)
846 return a;
847 if(a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if(a == b)
850 return bld->zero;
851
852 if(bld->type.norm) {
853 const char *intrinsic = NULL;
854
855 if(b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if(type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if(type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if(type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if(type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if(type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if(type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches for (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
930 * a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria of
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case just the first two terms to fit in 16bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
946 * note that just by itself it doesn't satisfy the OpenGL criteria, as
947 * 255*255 = 254, so the special case b = 255 must be accounted for, or
948 * rounding must be used.
949 *
950 * - geometric series plus rounding
951 *
952 * when using the geometric series division, instead of truncating the
953 * result use rounding in the approximation (Jim Blinn)
954 *
955 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
956 *
957 * achieving exact results.
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
966 static LLVMValueRef
967 lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
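
/*
 * A minimal scalar sketch (illustration only, not used by this file) of the
 * unsigned 8 bit case of the approximation above, i.e. n = 8:
 *
 *   a*b / 255 ~= (a*b + (a*b >> 8) + 0x80) >> 8
 *
 * which is exact at the end points, e.g. 255*255 -> 255 and 0*b -> 0.
 */
static inline unsigned
lp_ref_mul_norm_u8(unsigned a, unsigned b)
{
   unsigned ab = a * b;              /* 16 bit intermediate product */
   ab = ab + (ab >> 8) + 0x80;       /* geometric series term plus rounding */
   return ab >> 8;                   /* divide by 256 instead of 255 */
}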
1017
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
1022 lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094 /*
1095 * Widening mul, valid for 32x32 bit -> 64bit only.
1096 * Result is low 32bits, high bits returned in res_hi.
1097 *
1098 * Emits code that is meant to be compiled for the host CPU.
1099 */
1100 LLVMValueRef
1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1102 LLVMValueRef a,
1103 LLVMValueRef b,
1104 LLVMValueRef *res_hi)
1105 {
1106 struct gallivm_state *gallivm = bld->gallivm;
1107 LLVMBuilderRef builder = gallivm->builder;
1108
1109 assert(bld->type.width == 32);
1110 assert(bld->type.floating == 0);
1111 assert(bld->type.fixed == 0);
1112 assert(bld->type.norm == 0);
1113
1114 /*
1115 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1116 * for x86 simd is atrocious (even if the high bits weren't required),
1117 * trying to handle real 64bit inputs (which of course can't happen due
1118 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1119 * apparently llvm does not recognize this widening mul). This includes 6
1120 * (instead of 2) pmuludq plus extra adds and shifts
1121 * The same story applies to signed mul, albeit fixing this requires sse41.
1122 * https://llvm.org/bugs/show_bug.cgi?id=30845
1123 * So, whip up our own code, albeit only for length 4 and 8 (which
1124 * should be good enough)...
1125 */
1126 if ((bld->type.length == 4 || bld->type.length == 8) &&
1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1128 util_cpu_caps.has_sse4_1)) {
1129 const char *intrinsic = NULL;
1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1132 struct lp_type type_wide = lp_wider_type(bld->type);
1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1134 unsigned i;
1135 for (i = 0; i < bld->type.length; i += 2) {
1136 shuf[i] = lp_build_const_int32(gallivm, i+1);
1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1138 }
1139 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140 aeven = a;
1141 beven = b;
1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1144
1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1146 if (bld->type.sign) {
1147 intrinsic = "llvm.x86.avx2.pmul.dq";
1148 } else {
1149 intrinsic = "llvm.x86.avx2.pmulu.dq";
1150 }
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 else {
1157 /* for consistent naming look elsewhere... */
1158 if (bld->type.sign) {
1159 intrinsic = "llvm.x86.sse41.pmuldq";
1160 } else {
1161 intrinsic = "llvm.x86.sse2.pmulu.dq";
1162 }
1163 /*
1164 * XXX If we only have AVX but not AVX2 this is a pain.
1165 * lp_build_intrinsic_binary_anylength() can't handle it
1166 * (due to src and dst type not being identical).
1167 */
1168 if (bld->type.length == 8) {
1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1171 LLVMValueRef muleven2[2], mulodd2[2];
1172 struct lp_type type_wide_half = type_wide;
1173 LLVMTypeRef wtype_half;
1174 type_wide_half.length = 2;
1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1185 wtype_half, aevenlo, bevenlo);
1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1187 wtype_half, aoddlo, boddlo);
1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1189 wtype_half, aevenhi, bevenhi);
1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1191 wtype_half, aoddhi, boddhi);
1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1194
1195 }
1196 else {
1197 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1198 wider_type, aeven, beven);
1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1200 wider_type, aodd, bodd);
1201 }
1202 }
1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1205
1206 for (i = 0; i < bld->type.length; i += 2) {
1207 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1209 }
1210 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1211 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1212
1213 for (i = 0; i < bld->type.length; i += 2) {
1214 shuf[i] = lp_build_const_int32(gallivm, i);
1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1216 }
1217 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1219 }
1220 else {
1221 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1222 }
1223 }
1224
1225
1226 /*
1227 * Widening mul, valid for 32x32 bit -> 64bit only.
1228 * Result is low 32bits, high bits returned in res_hi.
1229 *
1230 * Emits generic code.
1231 */
1232 LLVMValueRef
1233 lp_build_mul_32_lohi(struct lp_build_context *bld,
1234 LLVMValueRef a,
1235 LLVMValueRef b,
1236 LLVMValueRef *res_hi)
1237 {
1238 struct gallivm_state *gallivm = bld->gallivm;
1239 LLVMBuilderRef builder = gallivm->builder;
1240 LLVMValueRef tmp;
1241 struct lp_type type_tmp;
1242 LLVMTypeRef wide_type, cast_type;
1243
1244 type_tmp = bld->type;
1245 type_tmp.width *= 2;
1246 wide_type = lp_build_vec_type(gallivm, type_tmp);
1247 type_tmp = bld->type;
1248 type_tmp.length *= 2;
1249 cast_type = lp_build_vec_type(gallivm, type_tmp);
1250
1251 if (bld->type.sign) {
1252 a = LLVMBuildSExt(builder, a, wide_type, "");
1253 b = LLVMBuildSExt(builder, b, wide_type, "");
1254 } else {
1255 a = LLVMBuildZExt(builder, a, wide_type, "");
1256 b = LLVMBuildZExt(builder, b, wide_type, "");
1257 }
1258 tmp = LLVMBuildMul(builder, a, b, "");
1259 tmp = LLVMBuildBitCast(builder, tmp, cast_type, "");
1260 *res_hi = lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 1);
1261 return lp_build_uninterleave1(gallivm, bld->type.length * 2, tmp, 0);
1262 }
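
/*
 * Scalar sketch (illustration only, not used by this file) of what both
 * lohi variants above compute per element, for the unsigned case and
 * assuming a 32 bit unsigned type: the full 64 bit product split in halves.
 */
static inline unsigned
lp_ref_mul_32_lohi_u32(unsigned a, unsigned b, unsigned *res_hi)
{
   unsigned long long ab = (unsigned long long)a * b;   /* widening multiply */
   *res_hi = (unsigned)(ab >> 32);                      /* high 32 bits */
   return (unsigned)ab;                                 /* low 32 bits */
}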
1263
1264
1265 /* a * b + c */
1266 LLVMValueRef
1267 lp_build_mad(struct lp_build_context *bld,
1268 LLVMValueRef a,
1269 LLVMValueRef b,
1270 LLVMValueRef c)
1271 {
1272 const struct lp_type type = bld->type;
1273 if (type.floating) {
1274 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1275 } else {
1276 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1277 }
1278 }
1279
1280
1281 /**
1282 * Small vector x scale multiplication optimization.
1283 */
1284 LLVMValueRef
1285 lp_build_mul_imm(struct lp_build_context *bld,
1286 LLVMValueRef a,
1287 int b)
1288 {
1289 LLVMBuilderRef builder = bld->gallivm->builder;
1290 LLVMValueRef factor;
1291
1292 assert(lp_check_value(bld->type, a));
1293
1294 if(b == 0)
1295 return bld->zero;
1296
1297 if(b == 1)
1298 return a;
1299
1300 if(b == -1)
1301 return lp_build_negate(bld, a);
1302
1303 if(b == 2 && bld->type.floating)
1304 return lp_build_add(bld, a, a);
1305
1306 if(util_is_power_of_two(b)) {
1307 unsigned shift = ffs(b) - 1;
1308
1309 if(bld->type.floating) {
1310 #if 0
1311 /*
1312 * Power of two multiplication by directly manipulating the exponent.
1313 *
1314 * XXX: This might not be always faster, it will introduce a small error
1315 * for multiplication by zero, and it will produce wrong results
1316 * for Inf and NaN.
1317 */
1318 unsigned mantissa = lp_mantissa(bld->type);
1319 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1320 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1321 a = LLVMBuildAdd(builder, a, factor, "");
1322 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1323 return a;
1324 #endif
1325 }
1326 else {
1327 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1328 return LLVMBuildShl(builder, a, factor, "");
1329 }
1330 }
1331
1332 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1333 return lp_build_mul(bld, a, factor);
1334 }
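
/*
 * Scalar sketch (illustration only, not used by this file) of the disabled
 * exponent trick above: for a finite, non-zero, non-denormal float,
 * multiplying by 2**shift is the same as adding shift to the biased
 * exponent field of the IEEE bit pattern (assumes 32 bit unsigned).
 */
static inline float
lp_ref_mul_pot_float(float a, unsigned shift)
{
   union { float f; unsigned u; } v;

   v.f = a;
   v.u += shift << 23;     /* 23 mantissa bits in a single precision float */
   return v.f;
}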
1335
1336
1337 /**
1338 * Generate a / b
1339 */
1340 LLVMValueRef
1341 lp_build_div(struct lp_build_context *bld,
1342 LLVMValueRef a,
1343 LLVMValueRef b)
1344 {
1345 LLVMBuilderRef builder = bld->gallivm->builder;
1346 const struct lp_type type = bld->type;
1347
1348 assert(lp_check_value(type, a));
1349 assert(lp_check_value(type, b));
1350
1351 if(a == bld->zero)
1352 return bld->zero;
1353 if(a == bld->one && type.floating)
1354 return lp_build_rcp(bld, b);
1355 if(b == bld->zero)
1356 return bld->undef;
1357 if(b == bld->one)
1358 return a;
1359 if(a == bld->undef || b == bld->undef)
1360 return bld->undef;
1361
1362 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1363 if (type.floating)
1364 return LLVMConstFDiv(a, b);
1365 else if (type.sign)
1366 return LLVMConstSDiv(a, b);
1367 else
1368 return LLVMConstUDiv(a, b);
1369 }
1370
1371 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1372 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1373 type.floating)
1374 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1375
1376 if (type.floating)
1377 return LLVMBuildFDiv(builder, a, b, "");
1378 else if (type.sign)
1379 return LLVMBuildSDiv(builder, a, b, "");
1380 else
1381 return LLVMBuildUDiv(builder, a, b, "");
1382 }
1383
1384
1385 /**
1386 * Linear interpolation helper.
1387 *
1388 * @param flags LP_BLD_LERP_* flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
1389 * interpolating normalized values encoded in integers twice as wide.
1390 *
1391 * @sa http://www.stereopsis.com/doubleblend.html
1392 */
1393 static inline LLVMValueRef
1394 lp_build_lerp_simple(struct lp_build_context *bld,
1395 LLVMValueRef x,
1396 LLVMValueRef v0,
1397 LLVMValueRef v1,
1398 unsigned flags)
1399 {
1400 unsigned half_width = bld->type.width/2;
1401 LLVMBuilderRef builder = bld->gallivm->builder;
1402 LLVMValueRef delta;
1403 LLVMValueRef res;
1404
1405 assert(lp_check_value(bld->type, x));
1406 assert(lp_check_value(bld->type, v0));
1407 assert(lp_check_value(bld->type, v1));
1408
1409 delta = lp_build_sub(bld, v1, v0);
1410
1411 if (bld->type.floating) {
1412 assert(flags == 0);
1413 return lp_build_mad(bld, x, delta, v0);
1414 }
1415
1416 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1417 if (!bld->type.sign) {
1418 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1419 /*
1420 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1421 * most-significant-bit to the lowest-significant-bit, so that
1422 * later we can just divide by 2**n instead of 2**n - 1.
1423 */
1424
1425 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1426 }
1427
1428 /* (x * delta) >> n */
1429 res = lp_build_mul(bld, x, delta);
1430 res = lp_build_shr_imm(bld, res, half_width);
1431 } else {
1432 /*
1433 * The rescaling trick above doesn't work for signed numbers, so
1434 * use the 2**n - 1 division approximation in lp_build_mul_norm
1435 * instead.
1436 */
1437 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1438 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1439 }
1440 } else {
1441 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1442 res = lp_build_mul(bld, x, delta);
1443 }
1444
1445 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1446 /*
1447 * At this point both res and v0 only use the lower half of the bits,
1448 * the rest is zero. Instead of add / mask, do add with half wide type.
1449 */
1450 struct lp_type narrow_type;
1451 struct lp_build_context narrow_bld;
1452
1453 memset(&narrow_type, 0, sizeof narrow_type);
1454 narrow_type.sign = bld->type.sign;
1455 narrow_type.width = bld->type.width/2;
1456 narrow_type.length = bld->type.length*2;
1457
1458 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1459 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1460 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1461 res = lp_build_add(&narrow_bld, v0, res);
1462 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1463 } else {
1464 res = lp_build_add(bld, v0, res);
1465
1466 if (bld->type.fixed) {
1467 /*
1468 * We need to mask out the high order bits when lerping 8bit
1469 * normalized colors stored in 16 bits
1470 */
1471 /* XXX: This step is necessary for lerping 8bit colors stored in
1472 * 16 bits, but it will be wrong for true fixed point use cases.
1473 * Basically we need a more powerful lp_type, capable of further
1474 * distinguishing the values interpretation from the value storage.
1475 */
1476 LLVMValueRef low_bits;
1477 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1478 res = LLVMBuildAnd(builder, res, low_bits, "");
1479 }
1480 }
1481
1482 return res;
1483 }
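
/*
 * A minimal scalar sketch (illustration only, not used by this file) of the
 * unsigned LP_BLD_LERP_WIDE_NORMALIZED path above, for 8 bit values lerped
 * in 16 bit arithmetic: the weight is rescaled from [0, 255] to [0, 256] so
 * that the division by 255 becomes a shift by 8.
 */
static inline unsigned
lp_ref_lerp_norm_u8(unsigned x, unsigned v0, unsigned v1)
{
   unsigned delta = (v1 - v0) & 0xffff;     /* two's complement delta in the wide type */
   x = x + (x >> 7);                        /* [0, 255] -> [0, 256] */
   return (v0 + ((x * delta) >> 8)) & 0xff; /* v0 + x * (v1 - v0), 8 bit result */
}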
1484
1485
1486 /**
1487 * Linear interpolation.
1488 */
1489 LLVMValueRef
1490 lp_build_lerp(struct lp_build_context *bld,
1491 LLVMValueRef x,
1492 LLVMValueRef v0,
1493 LLVMValueRef v1,
1494 unsigned flags)
1495 {
1496 const struct lp_type type = bld->type;
1497 LLVMValueRef res;
1498
1499 assert(lp_check_value(type, x));
1500 assert(lp_check_value(type, v0));
1501 assert(lp_check_value(type, v1));
1502
1503 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1504
1505 if (type.norm) {
1506 struct lp_type wide_type;
1507 struct lp_build_context wide_bld;
1508 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1509
1510 assert(type.length >= 2);
1511
1512 /*
1513 * Create a wider integer type, enough to hold the
1514 * intermediate result of the multiplication.
1515 */
1516 memset(&wide_type, 0, sizeof wide_type);
1517 wide_type.sign = type.sign;
1518 wide_type.width = type.width*2;
1519 wide_type.length = type.length/2;
1520
1521 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1522
1523 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1524 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1525 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1526
1527 /*
1528 * Lerp both halves.
1529 */
1530
1531 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1532
1533 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1534 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1535
1536 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1537 } else {
1538 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1539 }
1540
1541 return res;
1542 }
1543
1544
1545 /**
1546 * Bilinear interpolation.
1547 *
1548 * Value indices are in v_{yx}.
1549 */
1550 LLVMValueRef
1551 lp_build_lerp_2d(struct lp_build_context *bld,
1552 LLVMValueRef x,
1553 LLVMValueRef y,
1554 LLVMValueRef v00,
1555 LLVMValueRef v01,
1556 LLVMValueRef v10,
1557 LLVMValueRef v11,
1558 unsigned flags)
1559 {
1560 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1561 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1562 return lp_build_lerp(bld, y, v0, v1, flags);
1563 }
1564
1565
1566 LLVMValueRef
1567 lp_build_lerp_3d(struct lp_build_context *bld,
1568 LLVMValueRef x,
1569 LLVMValueRef y,
1570 LLVMValueRef z,
1571 LLVMValueRef v000,
1572 LLVMValueRef v001,
1573 LLVMValueRef v010,
1574 LLVMValueRef v011,
1575 LLVMValueRef v100,
1576 LLVMValueRef v101,
1577 LLVMValueRef v110,
1578 LLVMValueRef v111,
1579 unsigned flags)
1580 {
1581 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1582 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1583 return lp_build_lerp(bld, z, v0, v1, flags);
1584 }
1585
1586
1587 /**
1588 * Generate min(a, b)
1589 * Do checks for special cases but not for nans.
1590 */
1591 LLVMValueRef
1592 lp_build_min(struct lp_build_context *bld,
1593 LLVMValueRef a,
1594 LLVMValueRef b)
1595 {
1596 assert(lp_check_value(bld->type, a));
1597 assert(lp_check_value(bld->type, b));
1598
1599 if(a == bld->undef || b == bld->undef)
1600 return bld->undef;
1601
1602 if(a == b)
1603 return a;
1604
1605 if (bld->type.norm) {
1606 if (!bld->type.sign) {
1607 if (a == bld->zero || b == bld->zero) {
1608 return bld->zero;
1609 }
1610 }
1611 if(a == bld->one)
1612 return b;
1613 if(b == bld->one)
1614 return a;
1615 }
1616
1617 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1618 }
1619
1620
1621 /**
1622 * Generate min(a, b)
1623 * NaN's are handled according to the behavior specified by the
1624 * nan_behavior argument.
1625 */
1626 LLVMValueRef
1627 lp_build_min_ext(struct lp_build_context *bld,
1628 LLVMValueRef a,
1629 LLVMValueRef b,
1630 enum gallivm_nan_behavior nan_behavior)
1631 {
1632 assert(lp_check_value(bld->type, a));
1633 assert(lp_check_value(bld->type, b));
1634
1635 if(a == bld->undef || b == bld->undef)
1636 return bld->undef;
1637
1638 if(a == b)
1639 return a;
1640
1641 if (bld->type.norm) {
1642 if (!bld->type.sign) {
1643 if (a == bld->zero || b == bld->zero) {
1644 return bld->zero;
1645 }
1646 }
1647 if(a == bld->one)
1648 return b;
1649 if(b == bld->one)
1650 return a;
1651 }
1652
1653 return lp_build_min_simple(bld, a, b, nan_behavior);
1654 }
1655
1656 /**
1657 * Generate max(a, b)
1658 * Do checks for special cases, but NaN behavior is undefined.
1659 */
1660 LLVMValueRef
1661 lp_build_max(struct lp_build_context *bld,
1662 LLVMValueRef a,
1663 LLVMValueRef b)
1664 {
1665 assert(lp_check_value(bld->type, a));
1666 assert(lp_check_value(bld->type, b));
1667
1668 if(a == bld->undef || b == bld->undef)
1669 return bld->undef;
1670
1671 if(a == b)
1672 return a;
1673
1674 if(bld->type.norm) {
1675 if(a == bld->one || b == bld->one)
1676 return bld->one;
1677 if (!bld->type.sign) {
1678 if (a == bld->zero) {
1679 return b;
1680 }
1681 if (b == bld->zero) {
1682 return a;
1683 }
1684 }
1685 }
1686
1687 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1688 }
1689
1690
1691 /**
1692 * Generate max(a, b)
1693 * Checks for special cases.
1694 * NaN's are handled according to the behavior specified by the
1695 * nan_behavior argument.
1696 */
1697 LLVMValueRef
1698 lp_build_max_ext(struct lp_build_context *bld,
1699 LLVMValueRef a,
1700 LLVMValueRef b,
1701 enum gallivm_nan_behavior nan_behavior)
1702 {
1703 assert(lp_check_value(bld->type, a));
1704 assert(lp_check_value(bld->type, b));
1705
1706 if(a == bld->undef || b == bld->undef)
1707 return bld->undef;
1708
1709 if(a == b)
1710 return a;
1711
1712 if(bld->type.norm) {
1713 if(a == bld->one || b == bld->one)
1714 return bld->one;
1715 if (!bld->type.sign) {
1716 if (a == bld->zero) {
1717 return b;
1718 }
1719 if (b == bld->zero) {
1720 return a;
1721 }
1722 }
1723 }
1724
1725 return lp_build_max_simple(bld, a, b, nan_behavior);
1726 }
1727
1728 /**
1729 * Generate clamp(a, min, max)
1730 * NaN behavior (for any of a, min, max) is undefined.
1731 * Do checks for special cases.
1732 */
1733 LLVMValueRef
1734 lp_build_clamp(struct lp_build_context *bld,
1735 LLVMValueRef a,
1736 LLVMValueRef min,
1737 LLVMValueRef max)
1738 {
1739 assert(lp_check_value(bld->type, a));
1740 assert(lp_check_value(bld->type, min));
1741 assert(lp_check_value(bld->type, max));
1742
1743 a = lp_build_min(bld, a, max);
1744 a = lp_build_max(bld, a, min);
1745 return a;
1746 }
1747
1748
1749 /**
1750 * Generate clamp(a, 0, 1)
1751 * A NaN will get converted to zero.
1752 */
1753 LLVMValueRef
1754 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1755 LLVMValueRef a)
1756 {
1757 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1758 a = lp_build_min(bld, a, bld->one);
1759 return a;
1760 }
1761
1762
1763 /**
1764 * Generate abs(a)
1765 */
1766 LLVMValueRef
1767 lp_build_abs(struct lp_build_context *bld,
1768 LLVMValueRef a)
1769 {
1770 LLVMBuilderRef builder = bld->gallivm->builder;
1771 const struct lp_type type = bld->type;
1772 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1773
1774 assert(lp_check_value(type, a));
1775
1776 if(!type.sign)
1777 return a;
1778
1779 if(type.floating) {
1780 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1781 /* Workaround llvm.org/PR27332 */
1782 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1783 unsigned long long absMask = ~(1ULL << (type.width - 1));
1784 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1785 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1786 a = LLVMBuildAnd(builder, a, mask, "");
1787 a = LLVMBuildBitCast(builder, a, vec_type, "");
1788 return a;
1789 } else {
1790 char intrinsic[32];
1791 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1792 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1793 }
1794 }
1795
1796 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1797 switch(type.width) {
1798 case 8:
1799 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1800 case 16:
1801 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1802 case 32:
1803 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1804 }
1805 }
1806 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
1807 switch(type.width) {
1808 case 8:
1809 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1810 case 16:
1811 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1812 case 32:
1813 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1814 }
1815 }
1816 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1817 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1818 (type.width == 8 || type.width == 16 || type.width == 32)) {
1819 debug_printf("%s: inefficient code, should split vectors manually\n",
1820 __FUNCTION__);
1821 }
1822
1823 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1824 }
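
/*
 * Scalar sketch (illustration only, not used by this file) of the sign mask
 * workaround above: clearing the top bit of the IEEE bit pattern yields
 * fabs() for every input, including Inf and NaN (assumes 32 bit unsigned).
 */
static inline float
lp_ref_abs_float(float a)
{
   union { float f; unsigned u; } v;

   v.f = a;
   v.u &= ~(1u << 31);     /* clear the sign bit */
   return v.f;
}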
1825
1826
1827 LLVMValueRef
1828 lp_build_negate(struct lp_build_context *bld,
1829 LLVMValueRef a)
1830 {
1831 LLVMBuilderRef builder = bld->gallivm->builder;
1832
1833 assert(lp_check_value(bld->type, a));
1834
1835 if (bld->type.floating)
1836 a = LLVMBuildFNeg(builder, a, "");
1837 else
1838 a = LLVMBuildNeg(builder, a, "");
1839
1840 return a;
1841 }
1842
1843
1844 /** Return -1, 0 or +1 depending on the sign of a */
1845 LLVMValueRef
1846 lp_build_sgn(struct lp_build_context *bld,
1847 LLVMValueRef a)
1848 {
1849 LLVMBuilderRef builder = bld->gallivm->builder;
1850 const struct lp_type type = bld->type;
1851 LLVMValueRef cond;
1852 LLVMValueRef res;
1853
1854 assert(lp_check_value(type, a));
1855
1856 /* Handle non-zero case */
1857 if(!type.sign) {
1858 /* if not zero then sign must be positive */
1859 res = bld->one;
1860 }
1861 else if(type.floating) {
1862 LLVMTypeRef vec_type;
1863 LLVMTypeRef int_type;
1864 LLVMValueRef mask;
1865 LLVMValueRef sign;
1866 LLVMValueRef one;
1867 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1868
1869 int_type = lp_build_int_vec_type(bld->gallivm, type);
1870 vec_type = lp_build_vec_type(bld->gallivm, type);
1871 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1872
1873 /* Take the sign bit and or it into the 1.0 constant */
1874 sign = LLVMBuildBitCast(builder, a, int_type, "");
1875 sign = LLVMBuildAnd(builder, sign, mask, "");
1876 one = LLVMConstBitCast(bld->one, int_type);
1877 res = LLVMBuildOr(builder, sign, one, "");
1878 res = LLVMBuildBitCast(builder, res, vec_type, "");
1879 }
1880 else
1881 {
1882 /* signed int/norm/fixed point */
1883 /* could use psign with sse3 and appropriate vectors here */
1884 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1885 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1886 res = lp_build_select(bld, cond, bld->one, minus_one);
1887 }
1888
1889 /* Handle zero */
1890 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1891 res = lp_build_select(bld, cond, bld->zero, res);
1892
1893 return res;
1894 }
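
/*
 * Illustrative scalar sketch (not part of gallivm): the same sign-bit trick
 * the floating-point branch above emits, written for a single float. The
 * helper name is hypothetical; assumes <stdint.h> and <string.h>.
 */
static inline float
sgn_float_example(float a)
{
   uint32_t bits, res_bits;
   float one = 1.0f, res;

   memcpy(&bits, &a, sizeof bits);          /* reinterpret float as int */
   memcpy(&res_bits, &one, sizeof res_bits);
   res_bits |= bits & 0x80000000u;          /* copy a's sign bit onto 1.0 */
   memcpy(&res, &res_bits, sizeof res);
   return a == 0.0f ? 0.0f : res;           /* zero handled by the select */
}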
1895
1896
1897 /**
1898 * Set the sign of float vector 'a' according to 'sign'.
1899 * If sign==0, return abs(a).
1900 * If sign==1, return -abs(a);
1901 * Other values for sign produce undefined results.
1902 */
1903 LLVMValueRef
1904 lp_build_set_sign(struct lp_build_context *bld,
1905 LLVMValueRef a, LLVMValueRef sign)
1906 {
1907 LLVMBuilderRef builder = bld->gallivm->builder;
1908 const struct lp_type type = bld->type;
1909 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1910 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1911 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1912 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1913 ~((unsigned long long) 1 << (type.width - 1)));
1914 LLVMValueRef val, res;
1915
1916 assert(type.floating);
1917 assert(lp_check_value(type, a));
1918
1919 /* val = reinterpret_cast<int>(a) */
1920 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1921 /* val = val & mask */
1922 val = LLVMBuildAnd(builder, val, mask, "");
1923 /* sign = sign << shift */
1924 sign = LLVMBuildShl(builder, sign, shift, "");
1925 /* res = val | sign */
1926 res = LLVMBuildOr(builder, val, sign, "");
1927 /* res = reinterpret_cast<float>(res) */
1928 res = LLVMBuildBitCast(builder, res, vec_type, "");
1929
1930 return res;
1931 }
1932
1933
1934 /**
1935 * Convert vector of (or scalar) int to vector of (or scalar) float.
1936 */
1937 LLVMValueRef
1938 lp_build_int_to_float(struct lp_build_context *bld,
1939 LLVMValueRef a)
1940 {
1941 LLVMBuilderRef builder = bld->gallivm->builder;
1942 const struct lp_type type = bld->type;
1943 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1944
1945 assert(type.floating);
1946
1947 return LLVMBuildSIToFP(builder, a, vec_type, "");
1948 }
1949
1950 static boolean
1951 arch_rounding_available(const struct lp_type type)
1952 {
1953 if ((util_cpu_caps.has_sse4_1 &&
1954 (type.length == 1 || type.width*type.length == 128)) ||
1955 (util_cpu_caps.has_avx && type.width*type.length == 256))
1956 return TRUE;
1957 else if ((util_cpu_caps.has_altivec &&
1958 (type.width == 32 && type.length == 4)))
1959 return TRUE;
1960
1961 return FALSE;
1962 }
1963
1964 enum lp_build_round_mode
1965 {
1966 LP_BUILD_ROUND_NEAREST = 0,
1967 LP_BUILD_ROUND_FLOOR = 1,
1968 LP_BUILD_ROUND_CEIL = 2,
1969 LP_BUILD_ROUND_TRUNCATE = 3
1970 };
1971
1972 static inline LLVMValueRef
1973 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1974 LLVMValueRef a)
1975 {
1976 LLVMBuilderRef builder = bld->gallivm->builder;
1977 const struct lp_type type = bld->type;
1978 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1979 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1980 const char *intrinsic;
1981 LLVMValueRef res;
1982
1983 assert(type.floating);
1984 /* using the double precision conversions is a bit more complicated */
1985 assert(type.width == 32);
1986
1987 assert(lp_check_value(type, a));
1988 assert(util_cpu_caps.has_sse2);
1989
1990 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1991 if (type.length == 1) {
1992 LLVMTypeRef vec_type;
1993 LLVMValueRef undef;
1994 LLVMValueRef arg;
1995 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1996
1997 vec_type = LLVMVectorType(bld->elem_type, 4);
1998
1999 intrinsic = "llvm.x86.sse.cvtss2si";
2000
2001 undef = LLVMGetUndef(vec_type);
2002
2003 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2004
2005 res = lp_build_intrinsic_unary(builder, intrinsic,
2006 ret_type, arg);
2007 }
2008 else {
2009 if (type.width* type.length == 128) {
2010 intrinsic = "llvm.x86.sse2.cvtps2dq";
2011 }
2012 else {
2013 assert(type.width*type.length == 256);
2014 assert(util_cpu_caps.has_avx);
2015
2016 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2017 }
2018 res = lp_build_intrinsic_unary(builder, intrinsic,
2019 ret_type, a);
2020 }
2021
2022 return res;
2023 }
2024
2025
2026 /* Round a (vector) float with the AltiVec vrfi* instruction
2027  * matching the requested rounding mode. */
2028 static inline LLVMValueRef
2029 lp_build_round_altivec(struct lp_build_context *bld,
2030 LLVMValueRef a,
2031 enum lp_build_round_mode mode)
2032 {
2033 LLVMBuilderRef builder = bld->gallivm->builder;
2034 const struct lp_type type = bld->type;
2035 const char *intrinsic = NULL;
2036
2037 assert(type.floating);
2038
2039 assert(lp_check_value(type, a));
2040 assert(util_cpu_caps.has_altivec);
2041
2042 (void)type;
2043
2044 switch (mode) {
2045 case LP_BUILD_ROUND_NEAREST:
2046 intrinsic = "llvm.ppc.altivec.vrfin";
2047 break;
2048 case LP_BUILD_ROUND_FLOOR:
2049 intrinsic = "llvm.ppc.altivec.vrfim";
2050 break;
2051 case LP_BUILD_ROUND_CEIL:
2052 intrinsic = "llvm.ppc.altivec.vrfip";
2053 break;
2054 case LP_BUILD_ROUND_TRUNCATE:
2055 intrinsic = "llvm.ppc.altivec.vrfiz";
2056 break;
2057 }
2058
2059 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2060 }
2061
2062 static inline LLVMValueRef
2063 lp_build_round_arch(struct lp_build_context *bld,
2064 LLVMValueRef a,
2065 enum lp_build_round_mode mode)
2066 {
2067 if (util_cpu_caps.has_sse4_1) {
2068 LLVMBuilderRef builder = bld->gallivm->builder;
2069 const struct lp_type type = bld->type;
2070 const char *intrinsic_root;
2071 char intrinsic[32];
2072
2073 assert(type.floating);
2074 assert(lp_check_value(type, a));
2075 (void)type;
2076
2077 switch (mode) {
2078 case LP_BUILD_ROUND_NEAREST:
2079 intrinsic_root = "llvm.nearbyint";
2080 break;
2081 case LP_BUILD_ROUND_FLOOR:
2082 intrinsic_root = "llvm.floor";
2083 break;
2084 case LP_BUILD_ROUND_CEIL:
2085 intrinsic_root = "llvm.ceil";
2086 break;
2087 case LP_BUILD_ROUND_TRUNCATE:
2088 intrinsic_root = "llvm.trunc";
2089 break;
2090 }
2091
2092 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2093 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2094 }
2095 else /* (util_cpu_caps.has_altivec) */
2096 return lp_build_round_altivec(bld, a, mode);
2097 }
2098
2099 /**
2100 * Return the integer part of a float (vector) value (== round toward zero).
2101 * The returned value is a float (vector).
2102 * Ex: trunc(-1.5) = -1.0
2103 */
2104 LLVMValueRef
2105 lp_build_trunc(struct lp_build_context *bld,
2106 LLVMValueRef a)
2107 {
2108 LLVMBuilderRef builder = bld->gallivm->builder;
2109 const struct lp_type type = bld->type;
2110
2111 assert(type.floating);
2112 assert(lp_check_value(type, a));
2113
2114 if (arch_rounding_available(type)) {
2115 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2116 }
2117 else {
2118 const struct lp_type type = bld->type;
2119 struct lp_type inttype;
2120 struct lp_build_context intbld;
2121 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2122 LLVMValueRef trunc, res, anosign, mask;
2123 LLVMTypeRef int_vec_type = bld->int_vec_type;
2124 LLVMTypeRef vec_type = bld->vec_type;
2125
2126 assert(type.width == 32); /* might want to handle doubles at some point */
2127
2128 inttype = type;
2129 inttype.floating = 0;
2130 lp_build_context_init(&intbld, bld->gallivm, inttype);
2131
2132 /* round by truncation */
2133 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2134 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2135
2136 /* mask out sign bit */
2137 anosign = lp_build_abs(bld, a);
2138 /*
2139 * mask out all values if anosign > 2^24
2140 * This should work both for large ints (rounding is a no-op for them
2141 * because such floats are always exact) as well as special cases like
2142 * NaNs, Infs (taking advantage of the fact they use max exponent).
2143 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2144 */
2145 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2146 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2147 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2148 return lp_build_select(bld, mask, a, res);
2149 }
2150 }
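
/*
 * Illustrative scalar equivalent (not part of gallivm) of the fallback path
 * above: truncate via an int round-trip, but keep the original value when
 * |a| is not below 2^24, i.e. for already-exact large values, NaNs and Infs.
 * The helper name is hypothetical; assumes <math.h>.
 */
static inline float
trunc_emulated_example(float a)
{
   if (!(fabsf(a) < 16777216.0f))
      return a;                  /* large, Inf or NaN: rounding is a no-op */
   return (float)(int)a;         /* FPToSI followed by SIToFP, as in the IR */
}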
2151
2152
2153 /**
2154 * Return float (vector) rounded to nearest integer (vector). The returned
2155 * value is a float (vector).
2156 * Ex: round(0.9) = 1.0
2157 * Ex: round(-1.5) = -2.0
2158 */
2159 LLVMValueRef
2160 lp_build_round(struct lp_build_context *bld,
2161 LLVMValueRef a)
2162 {
2163 LLVMBuilderRef builder = bld->gallivm->builder;
2164 const struct lp_type type = bld->type;
2165
2166 assert(type.floating);
2167 assert(lp_check_value(type, a));
2168
2169 if (arch_rounding_available(type)) {
2170 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2171 }
2172 else {
2173 const struct lp_type type = bld->type;
2174 struct lp_type inttype;
2175 struct lp_build_context intbld;
2176 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2177 LLVMValueRef res, anosign, mask;
2178 LLVMTypeRef int_vec_type = bld->int_vec_type;
2179 LLVMTypeRef vec_type = bld->vec_type;
2180
2181 assert(type.width == 32); /* might want to handle doubles at some point */
2182
2183 inttype = type;
2184 inttype.floating = 0;
2185 lp_build_context_init(&intbld, bld->gallivm, inttype);
2186
2187 res = lp_build_iround(bld, a);
2188 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2189
2190 /* mask out sign bit */
2191 anosign = lp_build_abs(bld, a);
2192 /*
2193 * mask out all values if anosign > 2^24
2194 * This should work both for large ints (rounding is a no-op for them
2195 * because such floats are always exact) as well as special cases like
2196 * NaNs, Infs (taking advantage of the fact they use max exponent).
2197 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2198 */
2199 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2200 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2201 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2202 return lp_build_select(bld, mask, a, res);
2203 }
2204 }
2205
2206
2207 /**
2208 * Return floor of float (vector), result is a float (vector)
2209 * Ex: floor(1.1) = 1.0
2210 * Ex: floor(-1.1) = -2.0
2211 */
2212 LLVMValueRef
2213 lp_build_floor(struct lp_build_context *bld,
2214 LLVMValueRef a)
2215 {
2216 LLVMBuilderRef builder = bld->gallivm->builder;
2217 const struct lp_type type = bld->type;
2218
2219 assert(type.floating);
2220 assert(lp_check_value(type, a));
2221
2222 if (arch_rounding_available(type)) {
2223 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2224 }
2225 else {
2226 const struct lp_type type = bld->type;
2227 struct lp_type inttype;
2228 struct lp_build_context intbld;
2229 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2230 LLVMValueRef trunc, res, anosign, mask;
2231 LLVMTypeRef int_vec_type = bld->int_vec_type;
2232 LLVMTypeRef vec_type = bld->vec_type;
2233
2234 if (type.width != 32) {
2235 char intrinsic[32];
2236 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2237 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2238 }
2239
2240 assert(type.width == 32); /* might want to handle doubles at some point */
2241
2242 inttype = type;
2243 inttype.floating = 0;
2244 lp_build_context_init(&intbld, bld->gallivm, inttype);
2245
2246 /* round by truncation */
2247 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2248 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2249
2250 if (type.sign) {
2251 LLVMValueRef tmp;
2252
2253 /*
2254 * fix values if rounding is wrong (for non-special cases)
2255 * - this is the case if trunc > a
2256 */
2257 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2258 /* tmp = trunc > a ? 1.0 : 0.0 */
2259 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2260 tmp = lp_build_and(&intbld, mask, tmp);
2261 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2262 res = lp_build_sub(bld, res, tmp);
2263 }
2264
2265 /* mask out sign bit */
2266 anosign = lp_build_abs(bld, a);
2267 /*
2268 * mask out all values if anosign > 2^24
2269 * This should work both for large ints (rounding is a no-op for them
2270 * because such floats are always exact) as well as special cases like
2271 * NaNs, Infs (taking advantage of the fact they use max exponent).
2272 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2273 */
2274 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2275 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2276 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2277 return lp_build_select(bld, mask, a, res);
2278 }
2279 }
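
/*
 * Illustrative scalar sketch (not part of gallivm) of the signed fix-up
 * above: truncation rounds toward zero, so for negative non-integral inputs
 * the result is one too large and 1.0 has to be subtracted. The helper name
 * is hypothetical; assumes <math.h> and uses the same 2^24 guard as above.
 */
static inline float
floor_emulated_example(float a)
{
   float res;

   if (!(fabsf(a) < 16777216.0f))
      return a;                  /* large, Inf or NaN: keep as is */
   res = (float)(int)a;          /* round by truncation */
   if (res > a)
      res -= 1.0f;               /* e.g. floor(-1.1): trunc gives -1.0 -> -2.0 */
   return res;
}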
2280
2281
2282 /**
2283 * Return ceiling of float (vector), returning float (vector).
2284 * Ex: ceil( 1.1) = 2.0
2285 * Ex: ceil(-1.1) = -1.0
2286 */
2287 LLVMValueRef
2288 lp_build_ceil(struct lp_build_context *bld,
2289 LLVMValueRef a)
2290 {
2291 LLVMBuilderRef builder = bld->gallivm->builder;
2292 const struct lp_type type = bld->type;
2293
2294 assert(type.floating);
2295 assert(lp_check_value(type, a));
2296
2297 if (arch_rounding_available(type)) {
2298 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2299 }
2300 else {
2301 const struct lp_type type = bld->type;
2302 struct lp_type inttype;
2303 struct lp_build_context intbld;
2304 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2305 LLVMValueRef trunc, res, anosign, mask, tmp;
2306 LLVMTypeRef int_vec_type = bld->int_vec_type;
2307 LLVMTypeRef vec_type = bld->vec_type;
2308
2309 if (type.width != 32) {
2310 char intrinsic[32];
2311 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2312 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2313 }
2314
2315 assert(type.width == 32); /* might want to handle doubles at some point */
2316
2317 inttype = type;
2318 inttype.floating = 0;
2319 lp_build_context_init(&intbld, bld->gallivm, inttype);
2320
2321 /* round by truncation */
2322 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2323 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2324
2325 /*
2326 * fix values if rounding is wrong (for non-special cases)
2327 * - this is the case if trunc < a
2328 */
2329 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2330 /* tmp = trunc < a ? 1.0 : 0.0 */
2331 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2332 tmp = lp_build_and(&intbld, mask, tmp);
2333 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2334 res = lp_build_add(bld, trunc, tmp);
2335
2336 /* mask out sign bit */
2337 anosign = lp_build_abs(bld, a);
2338 /*
2339 * mask out all values if anosign > 2^24
2340 * This should work both for large ints (rounding is a no-op for them
2341 * because such floats are always exact) as well as special cases like
2342 * NaNs, Infs (taking advantage of the fact they use max exponent).
2343 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2344 */
2345 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2346 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2347 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2348 return lp_build_select(bld, mask, a, res);
2349 }
2350 }
2351
2352
2353 /**
2354 * Return fractional part of 'a' computed as a - floor(a)
2355 * Typically used in texture coord arithmetic.
2356 */
2357 LLVMValueRef
2358 lp_build_fract(struct lp_build_context *bld,
2359 LLVMValueRef a)
2360 {
2361 assert(bld->type.floating);
2362 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2363 }
2364
2365
2366 /**
2367 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2368 * against 0.99999(9). (Will also return that value for NaNs.)
2369 */
2370 static inline LLVMValueRef
2371 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2372 {
2373 LLVMValueRef max;
2374
2375 /* this is the largest number smaller than 1.0 representable as float */
2376 max = lp_build_const_vec(bld->gallivm, bld->type,
2377 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2378 return lp_build_min_ext(bld, fract, max,
2379 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2380 }
2381
2382
2383 /**
2384 * Same as lp_build_fract, but guarantees that the result is always smaller
2385 * than one. Will also return the smaller-than-one value for infs, NaNs.
2386 */
2387 LLVMValueRef
2388 lp_build_fract_safe(struct lp_build_context *bld,
2389 LLVMValueRef a)
2390 {
2391 return clamp_fract(bld, lp_build_fract(bld, a));
2392 }
2393
2394
2395 /**
2396 * Return the integer part of a float (vector) value (== round toward zero).
2397 * The returned value is an integer (vector).
2398 * Ex: itrunc(-1.5) = -1
2399 */
2400 LLVMValueRef
2401 lp_build_itrunc(struct lp_build_context *bld,
2402 LLVMValueRef a)
2403 {
2404 LLVMBuilderRef builder = bld->gallivm->builder;
2405 const struct lp_type type = bld->type;
2406 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2407
2408 assert(type.floating);
2409 assert(lp_check_value(type, a));
2410
2411 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2412 }
2413
2414
2415 /**
2416 * Return float (vector) rounded to nearest integer (vector). The returned
2417 * value is an integer (vector).
2418 * Ex: iround(0.9) = 1
2419 * Ex: iround(-1.5) = -2
2420 */
2421 LLVMValueRef
2422 lp_build_iround(struct lp_build_context *bld,
2423 LLVMValueRef a)
2424 {
2425 LLVMBuilderRef builder = bld->gallivm->builder;
2426 const struct lp_type type = bld->type;
2427 LLVMTypeRef int_vec_type = bld->int_vec_type;
2428 LLVMValueRef res;
2429
2430 assert(type.floating);
2431
2432 assert(lp_check_value(type, a));
2433
2434 if ((util_cpu_caps.has_sse2 &&
2435 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2436 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2437 return lp_build_iround_nearest_sse2(bld, a);
2438 }
2439 if (arch_rounding_available(type)) {
2440 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2441 }
2442 else {
2443 LLVMValueRef half;
2444
2445 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2446
2447 if (type.sign) {
2448 LLVMTypeRef vec_type = bld->vec_type;
2449 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2450 (unsigned long long)1 << (type.width - 1));
2451 LLVMValueRef sign;
2452
2453 /* get sign bit */
2454 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2455 sign = LLVMBuildAnd(builder, sign, mask, "");
2456
2457 /* sign * 0.5 */
2458 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2459 half = LLVMBuildOr(builder, sign, half, "");
2460 half = LLVMBuildBitCast(builder, half, vec_type, "");
2461 }
2462
2463 res = LLVMBuildFAdd(builder, a, half, "");
2464 }
2465
2466 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2467
2468 return res;
2469 }
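
/*
 * Illustrative scalar sketch (not part of gallivm) of the non-SSE fallback
 * above: add 0.5 carrying the sign of 'a', then truncate. The helper name is
 * hypothetical; assumes <math.h> for copysignf.
 */
static inline int
iround_emulated_example(float a)
{
   /* iround(0.9) -> (int)1.4 == 1, iround(-1.5) -> (int)-2.0 == -2 */
   return (int)(a + copysignf(0.5f, a));
}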
2470
2471
2472 /**
2473 * Return floor of float (vector), result is an int (vector)
2474 * Ex: ifloor(1.1) = 1
2475 * Ex: ifloor(-1.1) = -2
2476 */
2477 LLVMValueRef
2478 lp_build_ifloor(struct lp_build_context *bld,
2479 LLVMValueRef a)
2480 {
2481 LLVMBuilderRef builder = bld->gallivm->builder;
2482 const struct lp_type type = bld->type;
2483 LLVMTypeRef int_vec_type = bld->int_vec_type;
2484 LLVMValueRef res;
2485
2486 assert(type.floating);
2487 assert(lp_check_value(type, a));
2488
2489 res = a;
2490 if (type.sign) {
2491 if (arch_rounding_available(type)) {
2492 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2493 }
2494 else {
2495 struct lp_type inttype;
2496 struct lp_build_context intbld;
2497 LLVMValueRef trunc, itrunc, mask;
2498
2499 assert(type.floating);
2500 assert(lp_check_value(type, a));
2501
2502 inttype = type;
2503 inttype.floating = 0;
2504 lp_build_context_init(&intbld, bld->gallivm, inttype);
2505
2506 /* round by truncation */
2507 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2508 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2509
2510 /*
2511 * fix values if rounding is wrong (for non-special cases)
2512 * - this is the case if trunc > a
2513 * The results of doing this with NaNs, very large values etc.
2514 * are undefined but this seems to be the case anyway.
2515 */
2516 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2517 /* cheapie minus one with mask since the mask is minus one / zero */
2518 return lp_build_add(&intbld, itrunc, mask);
2519 }
2520 }
2521
2522 /* convert to int, rounding toward zero */
2523 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2524
2525 return res;
2526 }
2527
2528
2529 /**
2530 * Return ceiling of float (vector), returning int (vector).
2531 * Ex: iceil( 1.1) = 2
2532 * Ex: iceil(-1.1) = -1
2533 */
2534 LLVMValueRef
2535 lp_build_iceil(struct lp_build_context *bld,
2536 LLVMValueRef a)
2537 {
2538 LLVMBuilderRef builder = bld->gallivm->builder;
2539 const struct lp_type type = bld->type;
2540 LLVMTypeRef int_vec_type = bld->int_vec_type;
2541 LLVMValueRef res;
2542
2543 assert(type.floating);
2544 assert(lp_check_value(type, a));
2545
2546 if (arch_rounding_available(type)) {
2547 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2548 }
2549 else {
2550 struct lp_type inttype;
2551 struct lp_build_context intbld;
2552 LLVMValueRef trunc, itrunc, mask;
2553
2554 assert(type.floating);
2555 assert(lp_check_value(type, a));
2556
2557 inttype = type;
2558 inttype.floating = 0;
2559 lp_build_context_init(&intbld, bld->gallivm, inttype);
2560
2561 /* round by truncation */
2562 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2563 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2564
2565 /*
2566 * fix values if rounding is wrong (for non-special cases)
2567 * - this is the case if trunc < a
2568 * The results of doing this with NaNs, very large values etc.
2569 * are undefined but this seems to be the case anyway.
2570 */
2571 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2572 /* cheapie plus one with mask since the mask is minus one / zero */
2573 return lp_build_sub(&intbld, itrunc, mask);
2574 }
2575
2576 /* convert to int, rounding toward zero */
2577 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2578
2579 return res;
2580 }
2581
2582
2583 /**
2584 * Combined ifloor() & fract().
2585 *
2586 * Preferred to calling the functions separately, as it will ensure that the
2587 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2588 */
2589 void
2590 lp_build_ifloor_fract(struct lp_build_context *bld,
2591 LLVMValueRef a,
2592 LLVMValueRef *out_ipart,
2593 LLVMValueRef *out_fpart)
2594 {
2595 LLVMBuilderRef builder = bld->gallivm->builder;
2596 const struct lp_type type = bld->type;
2597 LLVMValueRef ipart;
2598
2599 assert(type.floating);
2600 assert(lp_check_value(type, a));
2601
2602 if (arch_rounding_available(type)) {
2603 /*
2604 * floor() is easier.
2605 */
2606
2607 ipart = lp_build_floor(bld, a);
2608 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2609 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2610 }
2611 else {
2612 /*
2613 * ifloor() is easier.
2614 */
2615
2616 *out_ipart = lp_build_ifloor(bld, a);
2617 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2618 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2619 }
2620 }
2621
2622
2623 /**
2624 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2625 * always smaller than one.
2626 */
2627 void
2628 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2629 LLVMValueRef a,
2630 LLVMValueRef *out_ipart,
2631 LLVMValueRef *out_fpart)
2632 {
2633 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2634 *out_fpart = clamp_fract(bld, *out_fpart);
2635 }
2636
2637
2638 LLVMValueRef
2639 lp_build_sqrt(struct lp_build_context *bld,
2640 LLVMValueRef a)
2641 {
2642 LLVMBuilderRef builder = bld->gallivm->builder;
2643 const struct lp_type type = bld->type;
2644 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2645 char intrinsic[32];
2646
2647 assert(lp_check_value(type, a));
2648
2649 assert(type.floating);
2650 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2651
2652 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2653 }
2654
2655
2656 /**
2657 * Do one Newton-Raphson step to improve reciprocal precision:
2658 *
2659 * x_{i+1} = x_i * (2 - a * x_i)
2660 *
2661 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2662 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2663 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2664 * halo. It would be necessary to clamp the argument to prevent this.
2665 *
2666 * See also:
2667 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2668 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2669 */
2670 static inline LLVMValueRef
2671 lp_build_rcp_refine(struct lp_build_context *bld,
2672 LLVMValueRef a,
2673 LLVMValueRef rcp_a)
2674 {
2675 LLVMBuilderRef builder = bld->gallivm->builder;
2676 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2677 LLVMValueRef res;
2678
2679 res = LLVMBuildFMul(builder, a, rcp_a, "");
2680 res = LLVMBuildFSub(builder, two, res, "");
2681 res = LLVMBuildFMul(builder, rcp_a, res, "");
2682
2683 return res;
2684 }
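
/*
 * Illustrative scalar form (not part of gallivm) of the refinement step
 * built above; the helper name is hypothetical. Starting from a rough
 * estimate x0 ~ 1/a, each step roughly doubles the number of correct bits,
 * e.g. a = 3, x0 = 0.33 -> x1 = 0.33 * (2 - 3 * 0.33) = 0.3333.
 */
static inline float
rcp_refine_example(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);   /* x_{i+1} = x_i * (2 - a * x_i) */
}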
2685
2686
2687 LLVMValueRef
2688 lp_build_rcp(struct lp_build_context *bld,
2689 LLVMValueRef a)
2690 {
2691 LLVMBuilderRef builder = bld->gallivm->builder;
2692 const struct lp_type type = bld->type;
2693
2694 assert(lp_check_value(type, a));
2695
2696 if(a == bld->zero)
2697 return bld->undef;
2698 if(a == bld->one)
2699 return bld->one;
2700 if(a == bld->undef)
2701 return bld->undef;
2702
2703 assert(type.floating);
2704
2705 if(LLVMIsConstant(a))
2706 return LLVMConstFDiv(bld->one, a);
2707
2708 /*
2709 * We don't use RCPPS because:
2710 * - it only has 10 bits of precision
2711 * - it doesn't even get the reciprocal of 1.0 exactly
2712 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2713 * - for recent processors the benefit over DIVPS is marginal and case
2714 * dependent
2715 *
2716 * We could still use it on certain processors if benchmarks show that the
2717 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2718 * particular uses that require fewer workarounds.
2719 */
2720
2721 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2722 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2723 const unsigned num_iterations = 0;
2724 LLVMValueRef res;
2725 unsigned i;
2726 const char *intrinsic = NULL;
2727
2728 if (type.length == 4) {
2729 intrinsic = "llvm.x86.sse.rcp.ps";
2730 }
2731 else {
2732 intrinsic = "llvm.x86.avx.rcp.ps.256";
2733 }
2734
2735 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2736
2737 for (i = 0; i < num_iterations; ++i) {
2738 res = lp_build_rcp_refine(bld, a, res);
2739 }
2740
2741 return res;
2742 }
2743
2744 return LLVMBuildFDiv(builder, bld->one, a, "");
2745 }
2746
2747
2748 /**
2749 * Do one Newton-Raphson step to improve rsqrt precision:
2750 *
2751 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2752 *
2753 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2754 */
2755 static inline LLVMValueRef
2756 lp_build_rsqrt_refine(struct lp_build_context *bld,
2757 LLVMValueRef a,
2758 LLVMValueRef rsqrt_a)
2759 {
2760 LLVMBuilderRef builder = bld->gallivm->builder;
2761 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2762 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2763 LLVMValueRef res;
2764
2765 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2766 res = LLVMBuildFMul(builder, a, res, "");
2767 res = LLVMBuildFSub(builder, three, res, "");
2768 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2769 res = LLVMBuildFMul(builder, half, res, "");
2770
2771 return res;
2772 }
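
/*
 * Illustrative scalar form (not part of gallivm) of the refinement step
 * built above; the helper name is hypothetical.
 */
static inline float
rsqrt_refine_example(float a, float rsqrt_a)
{
   /* x_{i+1} = 0.5 * x_i * (3 - a * x_i * x_i) */
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}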
2773
2774
2775 /**
2776 * Generate 1/sqrt(a).
2777 * Result is undefined for values < 0, infinity for +0.
2778 */
2779 LLVMValueRef
2780 lp_build_rsqrt(struct lp_build_context *bld,
2781 LLVMValueRef a)
2782 {
2783 const struct lp_type type = bld->type;
2784
2785 assert(lp_check_value(type, a));
2786
2787 assert(type.floating);
2788
2789 /*
2790 * This should be faster but all denormals will end up as infinity.
2791 */
2792 if (0 && lp_build_fast_rsqrt_available(type)) {
2793 const unsigned num_iterations = 1;
2794 LLVMValueRef res;
2795 unsigned i;
2796
2797 /* rsqrt(1.0) != 1.0 here */
2798 res = lp_build_fast_rsqrt(bld, a);
2799
2800 if (num_iterations) {
2801 /*
2802 * Newton-Raphson will result in NaN instead of infinity for zero,
2803 * and NaN instead of zero for infinity.
2804 * Also, need to ensure rsqrt(1.0) == 1.0.
2805 * All numbers smaller than FLT_MIN will result in +infinity
2806 * (rsqrtps treats all denormals as zero).
2807 */
2808 LLVMValueRef cmp;
2809 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2810 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2811
2812 for (i = 0; i < num_iterations; ++i) {
2813 res = lp_build_rsqrt_refine(bld, a, res);
2814 }
2815 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2816 res = lp_build_select(bld, cmp, inf, res);
2817 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2818 res = lp_build_select(bld, cmp, bld->zero, res);
2819 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2820 res = lp_build_select(bld, cmp, bld->one, res);
2821 }
2822
2823 return res;
2824 }
2825
2826 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2827 }
2828
2829 /**
2830 * Return whether a fast (but inaccurate) rsqrt instruction is available.
2831 * (The caller may want to avoid calling rsqrt_fast if it's not available:
2832 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that's
2833 * unavailable it would result in sqrt/div/mul, so it's obviously
2834 * much better to just call sqrt, skipping both div and mul.)
2835 */
2836 boolean
2837 lp_build_fast_rsqrt_available(struct lp_type type)
2838 {
2839 assert(type.floating);
2840
2841 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2842 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2843 return true;
2844 }
2845 return false;
2846 }
2847
2848
2849 /**
2850 * Generate 1/sqrt(a).
2851 * Result is undefined for values < 0, infinity for +0.
2852 * Precision is limited, only ~10 bits guaranteed
2853 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2854 */
2855 LLVMValueRef
2856 lp_build_fast_rsqrt(struct lp_build_context *bld,
2857 LLVMValueRef a)
2858 {
2859 LLVMBuilderRef builder = bld->gallivm->builder;
2860 const struct lp_type type = bld->type;
2861
2862 assert(lp_check_value(type, a));
2863
2864 if (lp_build_fast_rsqrt_available(type)) {
2865 const char *intrinsic = NULL;
2866
2867 if (type.length == 4) {
2868 intrinsic = "llvm.x86.sse.rsqrt.ps";
2869 }
2870 else {
2871 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2872 }
2873 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2874 }
2875 else {
2876 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2877 }
2878 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2879 }
2880
2881
2882 /**
2883 * Generate sin(a) or cos(a) using polynomial approximation.
2884 * TODO: it might be worth recognizing sin and cos with the same source
2885 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
2886 * would be way cheaper than calculating (nearly) everything twice...
2887 * Not sure it's common enough to be worth bothering, however; the scs
2888 * opcode could also benefit from calculating both.
2889 */
2890 static LLVMValueRef
2891 lp_build_sin_or_cos(struct lp_build_context *bld,
2892 LLVMValueRef a,
2893 boolean cos)
2894 {
2895 struct gallivm_state *gallivm = bld->gallivm;
2896 LLVMBuilderRef b = gallivm->builder;
2897 struct lp_type int_type = lp_int_type(bld->type);
2898
2899 /*
2900 * take the absolute value,
2901 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2902 */
2903
2904 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2905 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2906
2907 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2908 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2909
2910 /*
2911 * scale by 4/Pi
2912 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2913 */
2914
2915 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2916 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2917
2918 /*
2919 * store the integer part of y in mm0
2920 * emm2 = _mm_cvttps_epi32(y);
2921 */
2922
2923 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2924
2925 /*
2926 * j=(j+1) & (~1) (see the cephes sources)
2927 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2928 */
2929
2930 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2931 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2932 /*
2933 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2934 */
2935 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2936 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2937
2938 /*
2939 * y = _mm_cvtepi32_ps(emm2);
2940 */
2941 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2942
2943 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2944 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2945 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2946 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2947
2948 /*
2949 * Argument used for poly selection and sign bit determination
2950 * is different for sin vs. cos.
2951 */
2952 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2953 emm2_and;
2954
2955 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2956 LLVMBuildNot(b, emm2_2, ""), ""),
2957 const_29, "sign_bit") :
2958 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2959 LLVMBuildShl(b, emm2_add,
2960 const_29, ""), ""),
2961 sign_mask, "sign_bit");
2962
2963 /*
2964 * get the polynom selection mask
2965 * there is one polynom for 0 <= x <= Pi/4
2966 * and another one for Pi/4<x<=Pi/2
2967 * Both branches will be computed.
2968 *
2969 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2970 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2971 */
2972
2973 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2974 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2975 int_type, PIPE_FUNC_EQUAL,
2976 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2977
2978 /*
2979 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2980 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2981 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2982 */
2983 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2984 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2985 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2986
2987 /*
2988 * The magic pass: "Extended precision modular arithmetic"
2989 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2990 */
2991 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2992 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2993 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
2994
2995 /*
2996 * Evaluate the first polynom (0 <= x <= Pi/4)
2997 *
2998 * z = _mm_mul_ps(x,x);
2999 */
3000 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3001
3002 /*
3003 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3004 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3005 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3006 */
3007 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3008 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3009 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3010
3011 /*
3012 * y = *(v4sf*)_ps_coscof_p0;
3013 * y = _mm_mul_ps(y, z);
3014 */
3015 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3016 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3017 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3018 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3019
3020
3021 /*
3022 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3023 * y = _mm_sub_ps(y, tmp);
3024 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3025 */
3026 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3027 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3028 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3029 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3030 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3031
3032 /*
3033 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3034 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3035 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3036 */
3037 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3038 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3039 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3040
3041 /*
3042 * Evaluate the second polynom (Pi/4 <= x <= Pi/2)
3043 *
3044 * y2 = *(v4sf*)_ps_sincof_p0;
3045 * y2 = _mm_mul_ps(y2, z);
3046 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3047 * y2 = _mm_mul_ps(y2, z);
3048 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3049 * y2 = _mm_mul_ps(y2, z);
3050 * y2 = _mm_mul_ps(y2, x);
3051 * y2 = _mm_add_ps(y2, x);
3052 */
3053
3054 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3055 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3056 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3057 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3058
3059 /*
3060 * select the correct result from the two polynoms
3061 * xmm3 = poly_mask;
3062 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3063 * y = _mm_andnot_ps(xmm3, y);
3064 * y = _mm_or_ps(y,y2);
3065 */
3066 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3067 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3068 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3069 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3070 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3071 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3072
3073 /*
3074 * update the sign
3075 * y = _mm_xor_ps(y, sign_bit);
3076 */
3077 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3078 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3079
3080 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3081
3082 /* clamp output to be within [-1, 1] */
3083 y_result = lp_build_clamp(bld, y_result,
3084 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3085 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3086 /* If a is -inf, inf or NaN then return NaN */
3087 y_result = lp_build_select(bld, isfinite, y_result,
3088 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3089 return y_result;
3090 }
3091
3092
3093 /**
3094 * Generate sin(a)
3095 */
3096 LLVMValueRef
3097 lp_build_sin(struct lp_build_context *bld,
3098 LLVMValueRef a)
3099 {
3100 return lp_build_sin_or_cos(bld, a, FALSE);
3101 }
3102
3103
3104 /**
3105 * Generate cos(a)
3106 */
3107 LLVMValueRef
3108 lp_build_cos(struct lp_build_context *bld,
3109 LLVMValueRef a)
3110 {
3111 return lp_build_sin_or_cos(bld, a, TRUE);
3112 }
3113
3114
3115 /**
3116 * Generate pow(x, y)
3117 */
3118 LLVMValueRef
3119 lp_build_pow(struct lp_build_context *bld,
3120 LLVMValueRef x,
3121 LLVMValueRef y)
3122 {
3123 /* TODO: optimize the constant case */
3124 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3125 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3126 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3127 __FUNCTION__);
3128 }
3129
3130 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3131 }
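
/*
 * Scalar sketch (not part of gallivm) of the identity used above,
 * pow(x, y) == exp2(log2(x) * y), which like the IR version is only
 * meaningful for x > 0. The helper name is hypothetical; assumes <math.h>.
 */
static inline float
pow_via_exp2_example(float x, float y)
{
   return exp2f(log2f(x) * y);
}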
3132
3133
3134 /**
3135 * Generate exp(x)
3136 */
3137 LLVMValueRef
3138 lp_build_exp(struct lp_build_context *bld,
3139 LLVMValueRef x)
3140 {
3141 /* log2(e) = 1/log(2) */
3142 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3143 1.4426950408889634);
3144
3145 assert(lp_check_value(bld->type, x));
3146
3147 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3148 }
3149
3150
3151 /**
3152 * Generate log(x)
3153 * Behavior is undefined with infs, 0s and nans
3154 */
3155 LLVMValueRef
3156 lp_build_log(struct lp_build_context *bld,
3157 LLVMValueRef x)
3158 {
3159 /* log(2) */
3160 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3161 0.69314718055994529);
3162
3163 assert(lp_check_value(bld->type, x));
3164
3165 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3166 }
3167
3168 /**
3169 * Generate log(x) that handles edge cases (infs, 0s and nans)
3170 */
3171 LLVMValueRef
3172 lp_build_log_safe(struct lp_build_context *bld,
3173 LLVMValueRef x)
3174 {
3175 /* log(2) */
3176 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3177 0.69314718055994529);
3178
3179 assert(lp_check_value(bld->type, x));
3180
3181 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3182 }
3183
3184
3185 /**
3186 * Generate polynomial.
3187 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3188 */
3189 LLVMValueRef
3190 lp_build_polynomial(struct lp_build_context *bld,
3191 LLVMValueRef x,
3192 const double *coeffs,
3193 unsigned num_coeffs)
3194 {
3195 const struct lp_type type = bld->type;
3196 LLVMValueRef even = NULL, odd = NULL;
3197 LLVMValueRef x2;
3198 unsigned i;
3199
3200 assert(lp_check_value(bld->type, x));
3201
3202 /* TODO: optimize the constant case */
3203 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3204 LLVMIsConstant(x)) {
3205 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3206 __FUNCTION__);
3207 }
3208
3209 /*
3210 * Calculate odd and even terms separately to decrease data dependency
3211 * Ex:
3212 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3213 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3214 */
3215 x2 = lp_build_mul(bld, x, x);
3216
3217 for (i = num_coeffs; i--; ) {
3218 LLVMValueRef coeff;
3219
3220 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3221
3222 if (i % 2 == 0) {
3223 if (even)
3224 even = lp_build_mad(bld, x2, even, coeff);
3225 else
3226 even = coeff;
3227 } else {
3228 if (odd)
3229 odd = lp_build_mad(bld, x2, odd, coeff);
3230 else
3231 odd = coeff;
3232 }
3233 }
3234
3235 if (odd)
3236 return lp_build_mad(bld, odd, x, even);
3237 else if (even)
3238 return even;
3239 else
3240 return bld->undef;
3241 }
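
/*
 * Illustrative scalar version (not part of gallivm) of the even/odd split
 * above: both halves are Horner evaluations in x^2 that can proceed in
 * parallel and are only combined at the end, shortening the dependency
 * chain compared to a single Horner run. The helper name is hypothetical.
 */
static inline float
polynomial_even_odd_example(const double *coeffs, unsigned num_coeffs, float x)
{
   float x2 = x * x;
   float even = 0.0f, odd = 0.0f;
   int i;

   for (i = (int)num_coeffs - 1; i >= 0; i--) {
      if (i % 2 == 0)
         even = even * x2 + (float)coeffs[i];
      else
         odd = odd * x2 + (float)coeffs[i];
   }
   return odd * x + even;
}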
3242
3243
3244 /**
3245 * Minimax polynomial fit of 2**x, in range [0, 1[
3246 */
3247 const double lp_build_exp2_polynomial[] = {
3248 #if EXP_POLY_DEGREE == 5
3249 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3250 0.693153073200168932794,
3251 0.240153617044375388211,
3252 0.0558263180532956664775,
3253 0.00898934009049466391101,
3254 0.00187757667519147912699
3255 #elif EXP_POLY_DEGREE == 4
3256 1.00000259337069434683,
3257 0.693003834469974940458,
3258 0.24144275689150793076,
3259 0.0520114606103070150235,
3260 0.0135341679161270268764
3261 #elif EXP_POLY_DEGREE == 3
3262 0.999925218562710312959,
3263 0.695833540494823811697,
3264 0.226067155427249155588,
3265 0.0780245226406372992967
3266 #elif EXP_POLY_DEGREE == 2
3267 1.00172476321474503578,
3268 0.657636275736077639316,
3269 0.33718943461968720704
3270 #else
3271 #error
3272 #endif
3273 };
3274
3275
3276 LLVMValueRef
3277 lp_build_exp2(struct lp_build_context *bld,
3278 LLVMValueRef x)
3279 {
3280 LLVMBuilderRef builder = bld->gallivm->builder;
3281 const struct lp_type type = bld->type;
3282 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3283 LLVMValueRef ipart = NULL;
3284 LLVMValueRef fpart = NULL;
3285 LLVMValueRef expipart = NULL;
3286 LLVMValueRef expfpart = NULL;
3287 LLVMValueRef res = NULL;
3288
3289 assert(lp_check_value(bld->type, x));
3290
3291 /* TODO: optimize the constant case */
3292 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3293 LLVMIsConstant(x)) {
3294 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3295 __FUNCTION__);
3296 }
3297
3298 assert(type.floating && type.width == 32);
3299
3300 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3301 * the result is INF and if it's smaller than -126.9 the result is 0 */
3302 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3303 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3304 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3305 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3306
3307 /* ipart = floor(x) */
3308 /* fpart = x - ipart */
3309 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3310
3311 /* expipart = (float) (1 << ipart) */
3312 expipart = LLVMBuildAdd(builder, ipart,
3313 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3314 expipart = LLVMBuildShl(builder, expipart,
3315 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3316 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3317
3318 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3319 ARRAY_SIZE(lp_build_exp2_polynomial));
3320
3321 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3322
3323 return res;
3324 }
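
/*
 * Illustrative scalar sketch (not part of gallivm) of the decomposition
 * above: 2^x = 2^ipart * 2^fpart, where 2^ipart is built directly by
 * placing (ipart + 127) into the exponent field of an IEEE-754 float and
 * 2^fpart comes from the minimax polynomial. The helper name is
 * hypothetical; assumes <math.h>, <stdint.h>, <string.h>, ignores the
 * clamping done above and so is only valid for x in roughly [-126, 128].
 */
static inline float
exp2_split_example(float x)
{
   float ipart = floorf(x);
   float fpart = x - ipart;                         /* fpart in [0, 1[ */
   uint32_t bits = (uint32_t)((int)ipart + 127) << 23;
   float expipart, expfpart = 0.0f;
   int i;

   memcpy(&expipart, &bits, sizeof expipart);       /* expipart = 2^ipart */

   /* Horner evaluation of the same minimax polynomial used above */
   for (i = ARRAY_SIZE(lp_build_exp2_polynomial); i--; )
      expfpart = expfpart * fpart + (float)lp_build_exp2_polynomial[i];

   return expipart * expfpart;
}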
3325
3326
3327
3328 /**
3329 * Extract the exponent of a IEEE-754 floating point value.
3330 *
3331 * Optionally apply an integer bias.
3332 *
3333 * Result is an integer value with
3334 *
3335 * ifloor(log2(x)) + bias
3336 */
3337 LLVMValueRef
3338 lp_build_extract_exponent(struct lp_build_context *bld,
3339 LLVMValueRef x,
3340 int bias)
3341 {
3342 LLVMBuilderRef builder = bld->gallivm->builder;
3343 const struct lp_type type = bld->type;
3344 unsigned mantissa = lp_mantissa(type);
3345 LLVMValueRef res;
3346
3347 assert(type.floating);
3348
3349 assert(lp_check_value(bld->type, x));
3350
3351 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3352
3353 res = LLVMBuildLShr(builder, x,
3354 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3355 res = LLVMBuildAnd(builder, res,
3356 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3357 res = LLVMBuildSub(builder, res,
3358 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3359
3360 return res;
3361 }
3362
3363
3364 /**
3365 * Extract the mantissa of a floating point value.
3366 *
3367 * Result is a floating point value with
3368 *
3369 * x / 2**floor(log2(x))
3370 */
3371 LLVMValueRef
3372 lp_build_extract_mantissa(struct lp_build_context *bld,
3373 LLVMValueRef x)
3374 {
3375 LLVMBuilderRef builder = bld->gallivm->builder;
3376 const struct lp_type type = bld->type;
3377 unsigned mantissa = lp_mantissa(type);
3378 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3379 (1ULL << mantissa) - 1);
3380 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3381 LLVMValueRef res;
3382
3383 assert(lp_check_value(bld->type, x));
3384
3385 assert(type.floating);
3386
3387 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3388
3389 /* res = x / 2**ipart */
3390 res = LLVMBuildAnd(builder, x, mantmask, "");
3391 res = LLVMBuildOr(builder, res, one, "");
3392 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3393
3394 return res;
3395 }
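
/*
 * Illustrative scalar sketch (not part of gallivm) of the two helpers
 * above: read the biased exponent straight out of the IEEE-754 bits, and
 * rebuild the mantissa as a float in [1, 2[ by forcing the exponent field
 * to that of 1.0. Helper names are hypothetical; assumes <stdint.h> and
 * <string.h>.
 */
static inline int
extract_exponent_example(float x, int bias)
{
   uint32_t bits;

   memcpy(&bits, &x, sizeof bits);
   return (int)((bits >> 23) & 0xff) - 127 + bias;
}

static inline float
extract_mantissa_example(float x)
{
   uint32_t bits;
   float res;

   memcpy(&bits, &x, sizeof bits);
   bits = (bits & 0x007fffff) | 0x3f800000;   /* mantissa | exponent of 1.0 */
   memcpy(&res, &bits, sizeof res);
   return res;                                /* == x / 2^floor(log2(x)) */
}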
3396
3397
3398
3399 /**
3400 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3401 * These coefficients can be generated with
3402 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3403 */
3404 const double lp_build_log2_polynomial[] = {
3405 #if LOG_POLY_DEGREE == 5
3406 2.88539008148777786488L,
3407 0.961796878841293367824L,
3408 0.577058946784739859012L,
3409 0.412914355135828735411L,
3410 0.308591899232910175289L,
3411 0.352376952300281371868L,
3412 #elif LOG_POLY_DEGREE == 4
3413 2.88539009343309178325L,
3414 0.961791550404184197881L,
3415 0.577440339438736392009L,
3416 0.403343858251329912514L,
3417 0.406718052498846252698L,
3418 #elif LOG_POLY_DEGREE == 3
3419 2.88538959748872753838L,
3420 0.961932915889597772928L,
3421 0.571118517972136195241L,
3422 0.493997535084709500285L,
3423 #else
3424 #error
3425 #endif
3426 };
3427
3428 /**
3429 * See http://www.devmaster.net/forums/showthread.php?p=43580
3430 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3431 * http://www.nezumi.demon.co.uk/consult/logx.htm
3432 *
3433 * If handle_edge_cases is true the function will perform computations
3434 * to match the required D3D10+ behavior for each of the edge cases.
3435 * That means that if input is:
3436 * - less than zero (to and including -inf) then NaN will be returned
3437 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3438 * - +infinity, then +infinity will be returned
3439 * - NaN, then NaN will be returned
3440 *
3441 * Those checks are fairly expensive so if you don't need them make sure
3442 * handle_edge_cases is false.
3443 */
3444 void
3445 lp_build_log2_approx(struct lp_build_context *bld,
3446 LLVMValueRef x,
3447 LLVMValueRef *p_exp,
3448 LLVMValueRef *p_floor_log2,
3449 LLVMValueRef *p_log2,
3450 boolean handle_edge_cases)
3451 {
3452 LLVMBuilderRef builder = bld->gallivm->builder;
3453 const struct lp_type type = bld->type;
3454 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3455 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3456
3457 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3458 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3459 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3460
3461 LLVMValueRef i = NULL;
3462 LLVMValueRef y = NULL;
3463 LLVMValueRef z = NULL;
3464 LLVMValueRef exp = NULL;
3465 LLVMValueRef mant = NULL;
3466 LLVMValueRef logexp = NULL;
3467 LLVMValueRef p_z = NULL;
3468 LLVMValueRef res = NULL;
3469
3470 assert(lp_check_value(bld->type, x));
3471
3472 if(p_exp || p_floor_log2 || p_log2) {
3473 /* TODO: optimize the constant case */
3474 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3475 LLVMIsConstant(x)) {
3476 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3477 __FUNCTION__);
3478 }
3479
3480 assert(type.floating && type.width == 32);
3481
3482 /*
3483 * We don't explicitly handle denormalized numbers. They will yield a
3484 * result in the neighbourhood of -127, which appears to be
3485 * adequate.
3486 */
3487
3488 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3489
3490 /* exp = (float) exponent(x) */
3491 exp = LLVMBuildAnd(builder, i, expmask, "");
3492 }
3493
3494 if(p_floor_log2 || p_log2) {
3495 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3496 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3497 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3498 }
3499
3500 if (p_log2) {
3501 /* mant = 1 + (float) mantissa(x) */
3502 mant = LLVMBuildAnd(builder, i, mantmask, "");
3503 mant = LLVMBuildOr(builder, mant, one, "");
3504 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3505
3506 /* y = (mant - 1) / (mant + 1) */
3507 y = lp_build_div(bld,
3508 lp_build_sub(bld, mant, bld->one),
3509 lp_build_add(bld, mant, bld->one)
3510 );
3511
3512 /* z = y^2 */
3513 z = lp_build_mul(bld, y, y);
3514
3515 /* compute P(z) */
3516 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3517 ARRAY_SIZE(lp_build_log2_polynomial));
3518
3519 /* y * P(z) + logexp */
3520 res = lp_build_mad(bld, y, p_z, logexp);
3521
3522 if (type.floating && handle_edge_cases) {
3523 LLVMValueRef negmask, infmask, zmask;
3524 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3525 lp_build_const_vec(bld->gallivm, type, 0.0f));
3526 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3527 lp_build_const_vec(bld->gallivm, type, 0.0f));
3528 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3529 lp_build_const_vec(bld->gallivm, type, INFINITY));
3530
3531 /* If x is equal to inf make sure we return inf */
3532 res = lp_build_select(bld, infmask,
3533 lp_build_const_vec(bld->gallivm, type, INFINITY),
3534 res);
3535 /* If x is equal to 0, return -inf */
3536 res = lp_build_select(bld, zmask,
3537 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3538 res);
3539 /* If x is nan or less than 0, return nan */
3540 res = lp_build_select(bld, negmask,
3541 lp_build_const_vec(bld->gallivm, type, NAN),
3542 res);
3543 }
3544 }
3545
3546 if (p_exp) {
3547 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3548 *p_exp = exp;
3549 }
3550
3551 if (p_floor_log2)
3552 *p_floor_log2 = logexp;
3553
3554 if (p_log2)
3555 *p_log2 = res;
3556 }
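
/*
 * Illustrative scalar sketch (not part of gallivm) of the core computation
 * above, without the edge-case handling: log2(x) = floor(log2(x)) + log2(m)
 * with m in [1, 2[, and log2(m) approximated as y * P(y^2) where
 * y = (m - 1) / (m + 1). The helper name is hypothetical and it reuses the
 * scalar helpers sketched earlier in this file.
 */
static inline float
log2_approx_example(float x)
{
   float logexp = (float)extract_exponent_example(x, 0);
   float mant = extract_mantissa_example(x);
   float y = (mant - 1.0f) / (mant + 1.0f);
   float z = y * y;
   float p = polynomial_even_odd_example(lp_build_log2_polynomial,
                                         ARRAY_SIZE(lp_build_log2_polynomial),
                                         z);

   return y * p + logexp;
}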
3557
3558
3559 /*
3560 * log2 implementation which doesn't have special code to
3561 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3562 * the results for those cases are undefined.
3563 */
3564 LLVMValueRef
3565 lp_build_log2(struct lp_build_context *bld,
3566 LLVMValueRef x)
3567 {
3568 LLVMValueRef res;
3569 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3570 return res;
3571 }
3572
3573 /*
3574 * Version of log2 which handles all edge cases.
3575 * Look at documentation of lp_build_log2_approx for
3576 * description of the behavior for each of the edge cases.
3577 */
3578 LLVMValueRef
3579 lp_build_log2_safe(struct lp_build_context *bld,
3580 LLVMValueRef x)
3581 {
3582 LLVMValueRef res;
3583 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3584 return res;
3585 }
3586
3587
3588 /**
3589 * Faster (and less accurate) log2.
3590 *
3591 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3592 *
3593 * Piece-wise linear approximation, with exact results when x is a
3594 * power of two.
3595 *
3596 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3597 */
3598 LLVMValueRef
3599 lp_build_fast_log2(struct lp_build_context *bld,
3600 LLVMValueRef x)
3601 {
3602 LLVMBuilderRef builder = bld->gallivm->builder;
3603 LLVMValueRef ipart;
3604 LLVMValueRef fpart;
3605
3606 assert(lp_check_value(bld->type, x));
3607
3608 assert(bld->type.floating);
3609
3610 /* ipart = floor(log2(x)) - 1 */
3611 ipart = lp_build_extract_exponent(bld, x, -1);
3612 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3613
3614 /* fpart = x / 2**ipart */
3615 fpart = lp_build_extract_mantissa(bld, x);
3616
3617 /* ipart + fpart */
3618 return LLVMBuildFAdd(builder, ipart, fpart, "");
3619 }
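
/*
 * Illustrative scalar sketch (not part of gallivm) of the piece-wise linear
 * approximation above; the helper name is hypothetical and it reuses the
 * scalar field-extraction helpers sketched earlier.
 */
static inline float
fast_log2_example(float x)
{
   float ipart = (float)extract_exponent_example(x, -1); /* floor(log2(x)) - 1 */
   float fpart = extract_mantissa_example(x);            /* x / 2^floor(log2(x)) */

   /* exact when x is a power of two: (e - 1) + 1.0 == e */
   return ipart + fpart;
}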
3620
3621
3622 /**
3623 * Fast implementation of iround(log2(x)).
3624 *
3625 * Not an approximation -- it should give accurate results all the time.
3626 */
3627 LLVMValueRef
3628 lp_build_ilog2(struct lp_build_context *bld,
3629 LLVMValueRef x)
3630 {
3631 LLVMBuilderRef builder = bld->gallivm->builder;
3632 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3633 LLVMValueRef ipart;
3634
3635 assert(bld->type.floating);
3636
3637 assert(lp_check_value(bld->type, x));
3638
3639 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3640 x = LLVMBuildFMul(builder, x, sqrt2, "");
3641
3642 /* ipart = floor(log2(x) + 0.5) */
3643 ipart = lp_build_extract_exponent(bld, x, 0);
3644
3645 return ipart;
3646 }
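
/*
 * Numeric sketch of the rounding trick used above: multiplying by sqrt(2)
 * adds 0.5 to log2(x), so taking the floored exponent of the product is
 * round-to-nearest.  E.g. x = 3.0: log2(3) ~= 1.585 and 3 * sqrt(2) ~= 4.24,
 * whose exponent is 2 = iround(1.585); x = 1.3: log2(1.3) ~= 0.379 and
 * 1.3 * sqrt(2) ~= 1.84, whose exponent is 0 = iround(0.379).
 */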
3647
3648 LLVMValueRef
3649 lp_build_mod(struct lp_build_context *bld,
3650 LLVMValueRef x,
3651 LLVMValueRef y)
3652 {
3653 LLVMBuilderRef builder = bld->gallivm->builder;
3654 LLVMValueRef res;
3655 const struct lp_type type = bld->type;
3656
3657 assert(lp_check_value(type, x));
3658 assert(lp_check_value(type, y));
3659
3660 if (type.floating)
3661 res = LLVMBuildFRem(builder, x, y, "");
3662 else if (type.sign)
3663 res = LLVMBuildSRem(builder, x, y, "");
3664 else
3665 res = LLVMBuildURem(builder, x, y, "");
3666 return res;
3667 }
3668
3669
3670 /*
3671 * For floating point inputs it creates and returns a mask
3672 * which is all 1's for channels that are NaN.
3673 * Channels of x which are not NaN will be 0.
3674 */
3675 LLVMValueRef
3676 lp_build_isnan(struct lp_build_context *bld,
3677 LLVMValueRef x)
3678 {
3679 LLVMValueRef mask;
3680 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3681
3682 assert(bld->type.floating);
3683 assert(lp_check_value(bld->type, x));
3684
3685 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3686 "isnotnan");
3687 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3688 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3689 return mask;
3690 }
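
/*
 * The mask relies on NaN being the only value that does not compare
 * ordered-equal to itself.  A typical (hypothetical) use is scrubbing NaN
 * channels before further processing, e.g.:
 *
 *    LLVMValueRef nanmask = lp_build_isnan(bld, x);
 *    x = lp_build_select(bld, nanmask, bld->zero, x);
 */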
3691
3692 /* Returns all 1's for channels holding finite floating
3693 * point numbers and all 0's for channels holding -inf,
3694 * inf or NaN. */
3695 LLVMValueRef
3696 lp_build_isfinite(struct lp_build_context *bld,
3697 LLVMValueRef x)
3698 {
3699 LLVMBuilderRef builder = bld->gallivm->builder;
3700 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3701 struct lp_type int_type = lp_int_type(bld->type);
3702 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3703 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3704 0x7f800000);
3705
3706 if (!bld->type.floating) {
3707 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3708 }
3709 assert(bld->type.floating);
3710 assert(lp_check_value(bld->type, x));
3711 assert(bld->type.width == 32);
3712
3713 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3714 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3715 intx, infornan32);
3716 }
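
/*
 * Bit-level sketch of the test above (32-bit IEEE floats, as the assert
 * requires): a value is inf or NaN exactly when all eight exponent bits
 * are set, so masking with 0x7f800000 and comparing against the mask
 * classifies each channel.  E.g. 1.0f = 0x3f800000 masks to 0x3f800000,
 * which differs from 0x7f800000, so the channel is finite (all 1's);
 * +inf = 0x7f800000 and a NaN such as 0x7fc00000 both mask to 0x7f800000,
 * so those channels yield all 0's.
 */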
3717
3718 /*
3719 * Returns true if the number is nan or inf and false otherwise.
3720 * The input has to be a floating point vector.
3721 */
3722 LLVMValueRef
3723 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3724 const struct lp_type type,
3725 LLVMValueRef x)
3726 {
3727 LLVMBuilderRef builder = gallivm->builder;
3728 struct lp_type int_type = lp_int_type(type);
3729 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3730 0x7f800000);
3731 LLVMValueRef ret;
3732
3733 assert(type.floating);
3734
3735 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3736 ret = LLVMBuildAnd(builder, ret, const0, "");
3737 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3738 ret, const0);
3739
3740 return ret;
3741 }
3742
3743
3744 LLVMValueRef
3745 lp_build_fpstate_get(struct gallivm_state *gallivm)
3746 {
3747 if (util_cpu_caps.has_sse) {
3748 LLVMBuilderRef builder = gallivm->builder;
3749 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3750 gallivm,
3751 LLVMInt32TypeInContext(gallivm->context),
3752 "mxcsr_ptr");
3753 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3754 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3755 lp_build_intrinsic(builder,
3756 "llvm.x86.sse.stmxcsr",
3757 LLVMVoidTypeInContext(gallivm->context),
3758 &mxcsr_ptr8, 1, 0);
3759 return mxcsr_ptr;
3760 }
3761 return 0;
3762 }
3763
3764 void
3765 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3766 boolean zero)
3767 {
3768 if (util_cpu_caps.has_sse) {
3769 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3770 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3771
3772 LLVMBuilderRef builder = gallivm->builder;
3773 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3774 LLVMValueRef mxcsr =
3775 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3776
3777 if (util_cpu_caps.has_daz) {
3778 /* Enable denormals-are-zero mode */
3779 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3780 }
3781 if (zero) {
3782 mxcsr = LLVMBuildOr(builder, mxcsr,
3783 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3784 } else {
3785 mxcsr = LLVMBuildAnd(builder, mxcsr,
3786 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3787 }
3788
3789 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3790 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3791 }
3792 }
3793
3794 void
3795 lp_build_fpstate_set(struct gallivm_state *gallivm,
3796 LLVMValueRef mxcsr_ptr)
3797 {
3798 if (util_cpu_caps.has_sse) {
3799 LLVMBuilderRef builder = gallivm->builder;
3800 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3801 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3802 lp_build_intrinsic(builder,
3803 "llvm.x86.sse.ldmxcsr",
3804 LLVMVoidTypeInContext(gallivm->context),
3805 &mxcsr_ptr, 1, 0);
3806 }
3807 }
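

/*
 * Sketch of the intended save/modify/restore usage of the three functions
 * above (hypothetical caller code):
 *
 *    LLVMValueRef saved = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ... emit arithmetic that benefits from DAZ/FTZ ...
 *    lp_build_fpstate_set(gallivm, saved);
 *
 * On targets without SSE these functions do nothing (lp_build_fpstate_get
 * returns NULL), so callers need no special casing.
 */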