1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31  * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38  * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_cpu_detect.h"
54
55 #include "lp_bld_type.h"
56 #include "lp_bld_const.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_intr.h"
59 #include "lp_bld_logic.h"
60 #include "lp_bld_pack.h"
61 #include "lp_bld_debug.h"
62 #include "lp_bld_bitarit.h"
63 #include "lp_bld_arit.h"
64 #include "lp_bld_flow.h"
65
66 #if defined(PIPE_ARCH_SSE)
67 #include <xmmintrin.h>
68 #endif
69
70 #ifndef _MM_DENORMALS_ZERO_MASK
71 #define _MM_DENORMALS_ZERO_MASK 0x0040
72 #endif
73
74 #ifndef _MM_FLUSH_ZERO_MASK
75 #define _MM_FLUSH_ZERO_MASK 0x8000
76 #endif
77
78 #define EXP_POLY_DEGREE 5
79
80 #define LOG_POLY_DEGREE 4
81
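/*
 * Illustrative usage sketch (not part of the module): callers normally
 * initialize an lp_build_context for the lp_type they operate on and then
 * chain the helpers below.  For example, to emit clamp(a * b + c, 0, 1) on a
 * 4 x float32 vector (the variable names here are hypothetical):
 *
 *    struct lp_type type;
 *    struct lp_build_context bld;
 *    LLVMValueRef res;
 *
 *    memset(&type, 0, sizeof type);
 *    type.floating = TRUE;
 *    type.width = 32;
 *    type.length = 4;
 *
 *    lp_build_context_init(&bld, gallivm, type);
 *    res = lp_build_mad(&bld, a, b, c);
 *    res = lp_build_clamp(&bld, res, bld.zero, bld.one);
 */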
82
83 /**
84 * Generate min(a, b)
85  * No checks for the special-case values a or b equal to 0 or 1 are done.
86  * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89 static LLVMValueRef
90 lp_build_min_simple(struct lp_build_context *bld,
91 LLVMValueRef a,
92 LLVMValueRef b,
93 enum gallivm_nan_behavior nan_behavior)
94 {
95 const struct lp_type type = bld->type;
96 const char *intrinsic = NULL;
97 unsigned intr_size = 0;
98 LLVMValueRef cond;
99
100 assert(lp_check_value(type, a));
101 assert(lp_check_value(type, b));
102
103 /* TODO: optimize the constant case */
104
105 if (type.floating && util_cpu_caps.has_sse) {
106 if (type.width == 32) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse.min.ss";
109 intr_size = 128;
110 }
111 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse.min.ps";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.ps.256";
117 intr_size = 256;
118 }
119 }
120 if (type.width == 64 && util_cpu_caps.has_sse2) {
121 if (type.length == 1) {
122 intrinsic = "llvm.x86.sse2.min.sd";
123 intr_size = 128;
124 }
125 else if (type.length == 2 || !util_cpu_caps.has_avx) {
126 intrinsic = "llvm.x86.sse2.min.pd";
127 intr_size = 128;
128 }
129 else {
130 intrinsic = "llvm.x86.avx.min.pd.256";
131 intr_size = 256;
132 }
133 }
134 }
135 else if (type.floating && util_cpu_caps.has_altivec) {
136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139 __FUNCTION__);
140 }
141 if (type.width == 32 && type.length == 4) {
142 intrinsic = "llvm.ppc.altivec.vminfp";
143 intr_size = 128;
144 }
145 } else if (HAVE_LLVM < 0x0309 &&
146 util_cpu_caps.has_avx2 && type.length > 4) {
147 intr_size = 256;
148 switch (type.width) {
149 case 8:
150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151 break;
152 case 16:
153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154 break;
155 case 32:
156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157 break;
158 }
159 } else if (HAVE_LLVM < 0x0309 &&
160 util_cpu_caps.has_sse2 && type.length >= 2) {
161 intr_size = 128;
162 if ((type.width == 8 || type.width == 16) &&
163 (type.width * type.length <= 64) &&
164 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166 __FUNCTION__);
167 }
168 if (type.width == 8 && !type.sign) {
169 intrinsic = "llvm.x86.sse2.pminu.b";
170 }
171 else if (type.width == 16 && type.sign) {
172 intrinsic = "llvm.x86.sse2.pmins.w";
173 }
174 if (util_cpu_caps.has_sse4_1) {
175 if (type.width == 8 && type.sign) {
176 intrinsic = "llvm.x86.sse41.pminsb";
177 }
178 if (type.width == 16 && !type.sign) {
179 intrinsic = "llvm.x86.sse41.pminuw";
180 }
181 if (type.width == 32 && !type.sign) {
182 intrinsic = "llvm.x86.sse41.pminud";
183 }
184 if (type.width == 32 && type.sign) {
185 intrinsic = "llvm.x86.sse41.pminsd";
186 }
187 }
188 } else if (util_cpu_caps.has_altivec) {
189 intr_size = 128;
190 if (type.width == 8) {
191 if (!type.sign) {
192 intrinsic = "llvm.ppc.altivec.vminub";
193 } else {
194 intrinsic = "llvm.ppc.altivec.vminsb";
195 }
196 } else if (type.width == 16) {
197 if (!type.sign) {
198 intrinsic = "llvm.ppc.altivec.vminuh";
199 } else {
200 intrinsic = "llvm.ppc.altivec.vminsh";
201 }
202 } else if (type.width == 32) {
203 if (!type.sign) {
204 intrinsic = "llvm.ppc.altivec.vminuw";
205 } else {
206 intrinsic = "llvm.ppc.altivec.vminsw";
207 }
208 }
209 }
210
211 if (intrinsic) {
212       /* We need to handle NaNs for floating point numbers. If one of the
213        * inputs is NaN the other should be returned (required by both D3D10+
214        * and OpenCL).
215        * The SSE intrinsics return the second operand in case of NaN by
216        * default, so we need special code to handle those.
217 */
218 if (util_cpu_caps.has_sse && type.floating &&
219 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222 LLVMValueRef isnan, min;
223 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224 type,
225 intr_size, a, b);
226 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227 isnan = lp_build_isnan(bld, b);
228 return lp_build_select(bld, isnan, a, min);
229 } else {
230 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231 isnan = lp_build_isnan(bld, a);
232 return lp_build_select(bld, isnan, a, min);
233 }
234 } else {
235 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236 type,
237 intr_size, a, b);
238 }
239 }
240
241 if (type.floating) {
242 switch (nan_behavior) {
243 case GALLIVM_NAN_RETURN_NAN: {
244 LLVMValueRef isnan = lp_build_isnan(bld, b);
245 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247 return lp_build_select(bld, cond, a, b);
248 }
249 break;
250 case GALLIVM_NAN_RETURN_OTHER: {
251 LLVMValueRef isnan = lp_build_isnan(bld, a);
252 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254 return lp_build_select(bld, cond, a, b);
255 }
256 break;
257 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259 return lp_build_select(bld, cond, a, b);
260 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262 return lp_build_select(bld, cond, b, a);
263 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265 return lp_build_select(bld, cond, a, b);
266 break;
267 default:
268 assert(0);
269 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270 return lp_build_select(bld, cond, a, b);
271 }
272 } else {
273 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274 return lp_build_select(bld, cond, a, b);
275 }
276 }
277
278
279 LLVMValueRef
280 lp_build_fmuladd(LLVMBuilderRef builder,
281 LLVMValueRef a,
282 LLVMValueRef b,
283 LLVMValueRef c)
284 {
285 LLVMTypeRef type = LLVMTypeOf(a);
286 assert(type == LLVMTypeOf(b));
287 assert(type == LLVMTypeOf(c));
288 if (HAVE_LLVM < 0x0304) {
289       /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA is
290        * not supported, and instead it falls back to a C function.
291 */
292 return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293 }
294 char intrinsic[32];
295 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296 LLVMValueRef args[] = { a, b, c };
297 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298 }
299
300
301 /**
302 * Generate max(a, b)
303  * No checks for the special-case values a or b equal to 0 or 1 are done.
304  * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307 static LLVMValueRef
308 lp_build_max_simple(struct lp_build_context *bld,
309 LLVMValueRef a,
310 LLVMValueRef b,
311 enum gallivm_nan_behavior nan_behavior)
312 {
313 const struct lp_type type = bld->type;
314 const char *intrinsic = NULL;
315 unsigned intr_size = 0;
316 LLVMValueRef cond;
317
318 assert(lp_check_value(type, a));
319 assert(lp_check_value(type, b));
320
321 /* TODO: optimize the constant case */
322
323 if (type.floating && util_cpu_caps.has_sse) {
324 if (type.width == 32) {
325 if (type.length == 1) {
326 intrinsic = "llvm.x86.sse.max.ss";
327 intr_size = 128;
328 }
329 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330 intrinsic = "llvm.x86.sse.max.ps";
331 intr_size = 128;
332 }
333 else {
334 intrinsic = "llvm.x86.avx.max.ps.256";
335 intr_size = 256;
336 }
337 }
338 if (type.width == 64 && util_cpu_caps.has_sse2) {
339 if (type.length == 1) {
340 intrinsic = "llvm.x86.sse2.max.sd";
341 intr_size = 128;
342 }
343 else if (type.length == 2 || !util_cpu_caps.has_avx) {
344 intrinsic = "llvm.x86.sse2.max.pd";
345 intr_size = 128;
346 }
347 else {
348 intrinsic = "llvm.x86.avx.max.pd.256";
349 intr_size = 256;
350 }
351 }
352 }
353 else if (type.floating && util_cpu_caps.has_altivec) {
354 if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356 debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357 __FUNCTION__);
358 }
359       if (type.width == 32 && type.length == 4) {
360 intrinsic = "llvm.ppc.altivec.vmaxfp";
361 intr_size = 128;
362 }
363 } else if (HAVE_LLVM < 0x0309 &&
364 util_cpu_caps.has_avx2 && type.length > 4) {
365 intr_size = 256;
366 switch (type.width) {
367 case 8:
368 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369 break;
370 case 16:
371 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372 break;
373 case 32:
374 intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375 break;
376 }
377 } else if (HAVE_LLVM < 0x0309 &&
378 util_cpu_caps.has_sse2 && type.length >= 2) {
379 intr_size = 128;
380 if ((type.width == 8 || type.width == 16) &&
381 (type.width * type.length <= 64) &&
382 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384 __FUNCTION__);
385 }
386 if (type.width == 8 && !type.sign) {
387 intrinsic = "llvm.x86.sse2.pmaxu.b";
388 intr_size = 128;
389 }
390 else if (type.width == 16 && type.sign) {
391 intrinsic = "llvm.x86.sse2.pmaxs.w";
392 }
393 if (util_cpu_caps.has_sse4_1) {
394 if (type.width == 8 && type.sign) {
395 intrinsic = "llvm.x86.sse41.pmaxsb";
396 }
397 if (type.width == 16 && !type.sign) {
398 intrinsic = "llvm.x86.sse41.pmaxuw";
399 }
400 if (type.width == 32 && !type.sign) {
401 intrinsic = "llvm.x86.sse41.pmaxud";
402 }
403 if (type.width == 32 && type.sign) {
404 intrinsic = "llvm.x86.sse41.pmaxsd";
405 }
406 }
407 } else if (util_cpu_caps.has_altivec) {
408 intr_size = 128;
409 if (type.width == 8) {
410 if (!type.sign) {
411 intrinsic = "llvm.ppc.altivec.vmaxub";
412 } else {
413 intrinsic = "llvm.ppc.altivec.vmaxsb";
414 }
415 } else if (type.width == 16) {
416 if (!type.sign) {
417 intrinsic = "llvm.ppc.altivec.vmaxuh";
418 } else {
419 intrinsic = "llvm.ppc.altivec.vmaxsh";
420 }
421 } else if (type.width == 32) {
422 if (!type.sign) {
423 intrinsic = "llvm.ppc.altivec.vmaxuw";
424 } else {
425 intrinsic = "llvm.ppc.altivec.vmaxsw";
426 }
427 }
428 }
429
430 if (intrinsic) {
431 if (util_cpu_caps.has_sse && type.floating &&
432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435 LLVMValueRef isnan, max;
436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437 type,
438 intr_size, a, b);
439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440 isnan = lp_build_isnan(bld, b);
441 return lp_build_select(bld, isnan, a, max);
442 } else {
443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444 isnan = lp_build_isnan(bld, a);
445 return lp_build_select(bld, isnan, a, max);
446 }
447 } else {
448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449 type,
450 intr_size, a, b);
451 }
452 }
453
454 if (type.floating) {
455 switch (nan_behavior) {
456 case GALLIVM_NAN_RETURN_NAN: {
457 LLVMValueRef isnan = lp_build_isnan(bld, b);
458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460 return lp_build_select(bld, cond, a, b);
461 }
462 break;
463 case GALLIVM_NAN_RETURN_OTHER: {
464 LLVMValueRef isnan = lp_build_isnan(bld, a);
465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467 return lp_build_select(bld, cond, a, b);
468 }
469 break;
470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472 return lp_build_select(bld, cond, a, b);
473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475 return lp_build_select(bld, cond, b, a);
476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478 return lp_build_select(bld, cond, a, b);
479 break;
480 default:
481 assert(0);
482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483 return lp_build_select(bld, cond, a, b);
484 }
485 } else {
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487 return lp_build_select(bld, cond, a, b);
488 }
489 }
490
491
492 /**
493 * Generate 1 - a, or ~a depending on bld->type.
494 */
495 LLVMValueRef
496 lp_build_comp(struct lp_build_context *bld,
497 LLVMValueRef a)
498 {
499 LLVMBuilderRef builder = bld->gallivm->builder;
500 const struct lp_type type = bld->type;
501
502 assert(lp_check_value(type, a));
503
504 if(a == bld->one)
505 return bld->zero;
506 if(a == bld->zero)
507 return bld->one;
508
509 if(type.norm && !type.floating && !type.fixed && !type.sign) {
510 if(LLVMIsConstant(a))
511 return LLVMConstNot(a);
512 else
513 return LLVMBuildNot(builder, a, "");
514 }
515
516 if(LLVMIsConstant(a))
517 if (type.floating)
518 return LLVMConstFSub(bld->one, a);
519 else
520 return LLVMConstSub(bld->one, a);
521 else
522 if (type.floating)
523 return LLVMBuildFSub(builder, bld->one, a, "");
524 else
525 return LLVMBuildSub(builder, bld->one, a, "");
526 }
527
528
529 /**
530 * Generate a + b
531 */
532 LLVMValueRef
533 lp_build_add(struct lp_build_context *bld,
534 LLVMValueRef a,
535 LLVMValueRef b)
536 {
537 LLVMBuilderRef builder = bld->gallivm->builder;
538 const struct lp_type type = bld->type;
539 LLVMValueRef res;
540
541 assert(lp_check_value(type, a));
542 assert(lp_check_value(type, b));
543
544 if(a == bld->zero)
545 return b;
546 if(b == bld->zero)
547 return a;
548 if(a == bld->undef || b == bld->undef)
549 return bld->undef;
550
551 if(bld->type.norm) {
552 const char *intrinsic = NULL;
553
554 if(a == bld->one || b == bld->one)
555 return bld->one;
556
557 if (!type.floating && !type.fixed) {
558 if (type.width * type.length == 128) {
559 if(util_cpu_caps.has_sse2) {
560 if(type.width == 8)
561 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
562 if(type.width == 16)
563 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
564 } else if (util_cpu_caps.has_altivec) {
565 if(type.width == 8)
566 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
567 if(type.width == 16)
568 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
569 }
570 }
571 if (type.width * type.length == 256) {
572 if(util_cpu_caps.has_avx2) {
573 if(type.width == 8)
574 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
575 if(type.width == 16)
576 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
577 }
578 }
579 }
580
581 if (intrinsic)
582 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
583 }
584
585 if(type.norm && !type.floating && !type.fixed) {
586 if (type.sign) {
587 uint64_t sign = (uint64_t)1 << (type.width - 1);
588 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
589 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
590 /* a_clamp_max is the maximum a for positive b,
591 a_clamp_min is the minimum a for negative b. */
592 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
593 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
594 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
595 } else {
596 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
597 }
598 }
599
600 if(LLVMIsConstant(a) && LLVMIsConstant(b))
601 if (type.floating)
602 res = LLVMConstFAdd(a, b);
603 else
604 res = LLVMConstAdd(a, b);
605 else
606 if (type.floating)
607 res = LLVMBuildFAdd(builder, a, b, "");
608 else
609 res = LLVMBuildAdd(builder, a, b, "");
610
611 /* clamp to ceiling of 1.0 */
612 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
613 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
614
615 /* XXX clamp to floor of -1 or 0??? */
616
617 return res;
618 }
619
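/*
 * Worked example of the signed saturation above (illustrative, 8-bit case):
 * sign = 0x80, so max_val = 127 and min_val = -128.
 *
 *    a = 100,  b = 60:   b > 0, so a is clamped to min(100, 127 - 60) = 67,
 *                        and 67 + 60 = 127, i.e. the saturated result.
 *    a = -100, b = -60:  b <= 0, so a is clamped to max(-100, -128 - (-60)) = -68,
 *                        and -68 + (-60) = -128.
 *
 * Either way the subsequent plain add can no longer overflow, which is why no
 * saturating intrinsic is needed on this fallback path.
 */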
620
621 /** Return the scalar sum of the elements of a.
622 * Should avoid this operation whenever possible.
623 */
624 LLVMValueRef
625 lp_build_horizontal_add(struct lp_build_context *bld,
626 LLVMValueRef a)
627 {
628 LLVMBuilderRef builder = bld->gallivm->builder;
629 const struct lp_type type = bld->type;
630 LLVMValueRef index, res;
631 unsigned i, length;
632 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
633 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
634 LLVMValueRef vecres, elem2;
635
636 assert(lp_check_value(type, a));
637
638 if (type.length == 1) {
639 return a;
640 }
641
642 assert(!bld->type.norm);
643
644 /*
645     * For byte vectors we could do much better with psadbw.
646     * We use repeated shuffle/adds here. Note that with multiple vectors
647     * this can be done more efficiently as outlined in the Intel
648     * optimization manual.
649 * Note: could cause data rearrangement if used with smaller element
650 * sizes.
651 */
652
653 vecres = a;
654 length = type.length / 2;
655 while (length > 1) {
656 LLVMValueRef vec1, vec2;
657 for (i = 0; i < length; i++) {
658 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
659 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
660 }
661 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
662 LLVMConstVector(shuffles1, length), "");
663 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
664 LLVMConstVector(shuffles2, length), "");
665 if (type.floating) {
666 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
667 }
668 else {
669 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
670 }
671 length = length >> 1;
672 }
673
674 /* always have vector of size 2 here */
675 assert(length == 1);
676
677 index = lp_build_const_int32(bld->gallivm, 0);
678 res = LLVMBuildExtractElement(builder, vecres, index, "");
679 index = lp_build_const_int32(bld->gallivm, 1);
680 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
681
682 if (type.floating)
683 res = LLVMBuildFAdd(builder, res, elem2, "");
684 else
685 res = LLVMBuildAdd(builder, res, elem2, "");
686
687 return res;
688 }
689
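/*
 * Illustrative trace of the shuffle/add reduction above for an 8-wide vector
 * with elements a0..a7 (the values are hypothetical):
 *
 *    pass 1 (length 4): {a0+a4, a1+a5, a2+a6, a3+a7}
 *    pass 2 (length 2): {a0+a4+a2+a6, a1+a5+a3+a7}
 *    final scalar add:  a0+a1+a2+a3+a4+a5+a6+a7
 *
 * i.e. log2(length) add steps in total, the last one scalar.
 */
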
690 /**
691 * Return the horizontal sums of 4 float vectors as a float4 vector.
692  * This uses the technique outlined in the Intel Optimization Manual.
693 */
694 static LLVMValueRef
695 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
696 LLVMValueRef src[4])
697 {
698 struct gallivm_state *gallivm = bld->gallivm;
699 LLVMBuilderRef builder = gallivm->builder;
700 LLVMValueRef shuffles[4];
701 LLVMValueRef tmp[4];
702 LLVMValueRef sumtmp[2], shuftmp[2];
703
704 /* lower half of regs */
705 shuffles[0] = lp_build_const_int32(gallivm, 0);
706 shuffles[1] = lp_build_const_int32(gallivm, 1);
707 shuffles[2] = lp_build_const_int32(gallivm, 4);
708 shuffles[3] = lp_build_const_int32(gallivm, 5);
709 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
710 LLVMConstVector(shuffles, 4), "");
711 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
712 LLVMConstVector(shuffles, 4), "");
713
714 /* upper half of regs */
715 shuffles[0] = lp_build_const_int32(gallivm, 2);
716 shuffles[1] = lp_build_const_int32(gallivm, 3);
717 shuffles[2] = lp_build_const_int32(gallivm, 6);
718 shuffles[3] = lp_build_const_int32(gallivm, 7);
719 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
720 LLVMConstVector(shuffles, 4), "");
721 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
722 LLVMConstVector(shuffles, 4), "");
723
724 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
725 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
726
727 shuffles[0] = lp_build_const_int32(gallivm, 0);
728 shuffles[1] = lp_build_const_int32(gallivm, 2);
729 shuffles[2] = lp_build_const_int32(gallivm, 4);
730 shuffles[3] = lp_build_const_int32(gallivm, 6);
731 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
732 LLVMConstVector(shuffles, 4), "");
733
734 shuffles[0] = lp_build_const_int32(gallivm, 1);
735 shuffles[1] = lp_build_const_int32(gallivm, 3);
736 shuffles[2] = lp_build_const_int32(gallivm, 5);
737 shuffles[3] = lp_build_const_int32(gallivm, 7);
738 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
739 LLVMConstVector(shuffles, 4), "");
740
741 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
742 }
743
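/*
 * Illustrative data flow for lp_build_horizontal_add4x4f() above with inputs
 * x = {x0,x1,x2,x3}, y, z, w (the values are hypothetical):
 *
 *    tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
 *    tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
 *
 *    sumtmp[0] = tmp[0] + tmp[1] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *    sumtmp[1] = tmp[2] + tmp[3] = {z0+z2, z1+z3, w0+w2, w1+w3}
 *
 *    shuftmp[0] = {x0+x2, y0+y2, z0+z2, w0+w2}
 *    shuftmp[1] = {x1+x3, y1+y3, z1+z3, w1+w3}
 *
 *    result = shuftmp[0] + shuftmp[1] = {sum(x), sum(y), sum(z), sum(w)}
 */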
744
745 /*
746  * Partially horizontally add 2-4 float vectors with length n*4,
747  * i.e. only four adjacent values in each vector will be added,
748  * assuming the values are really grouped in fours, which also determines
749  * the output order.
750 *
751 * Return a vector of the same length as the initial vectors,
752 * with the excess elements (if any) being undefined.
753 * The element order is independent of number of input vectors.
754 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
755 * the output order thus will be
756  * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
757 */
758 LLVMValueRef
759 lp_build_hadd_partial4(struct lp_build_context *bld,
760 LLVMValueRef vectors[],
761 unsigned num_vecs)
762 {
763 struct gallivm_state *gallivm = bld->gallivm;
764 LLVMBuilderRef builder = gallivm->builder;
765 LLVMValueRef ret_vec;
766 LLVMValueRef tmp[4];
767 const char *intrinsic = NULL;
768
769 assert(num_vecs >= 2 && num_vecs <= 4);
770 assert(bld->type.floating);
771
772 /* only use this with at least 2 vectors, as it is sort of expensive
773 * (depending on cpu) and we always need two horizontal adds anyway,
774 * so a shuffle/add approach might be better.
775 */
776
777 tmp[0] = vectors[0];
778 tmp[1] = vectors[1];
779
780 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
781 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
782
783 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
784 bld->type.length == 4) {
785 intrinsic = "llvm.x86.sse3.hadd.ps";
786 }
787 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
788 bld->type.length == 8) {
789 intrinsic = "llvm.x86.avx.hadd.ps.256";
790 }
791 if (intrinsic) {
792 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
793 lp_build_vec_type(gallivm, bld->type),
794 tmp[0], tmp[1]);
795 if (num_vecs > 2) {
796 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
797 lp_build_vec_type(gallivm, bld->type),
798 tmp[2], tmp[3]);
799 }
800 else {
801 tmp[1] = tmp[0];
802 }
803 return lp_build_intrinsic_binary(builder, intrinsic,
804 lp_build_vec_type(gallivm, bld->type),
805 tmp[0], tmp[1]);
806 }
807
808 if (bld->type.length == 4) {
809 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
810 }
811 else {
812 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
813 unsigned j;
814 unsigned num_iter = bld->type.length / 4;
815 struct lp_type parttype = bld->type;
816 parttype.length = 4;
817 for (j = 0; j < num_iter; j++) {
818 LLVMValueRef partsrc[4];
819 unsigned i;
820 for (i = 0; i < 4; i++) {
821 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
822 }
823 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
824 }
825 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
826 }
827 return ret_vec;
828 }
829
830 /**
831 * Generate a - b
832 */
833 LLVMValueRef
834 lp_build_sub(struct lp_build_context *bld,
835 LLVMValueRef a,
836 LLVMValueRef b)
837 {
838 LLVMBuilderRef builder = bld->gallivm->builder;
839 const struct lp_type type = bld->type;
840 LLVMValueRef res;
841
842 assert(lp_check_value(type, a));
843 assert(lp_check_value(type, b));
844
845 if(b == bld->zero)
846 return a;
847 if(a == bld->undef || b == bld->undef)
848 return bld->undef;
849 if(a == b)
850 return bld->zero;
851
852 if(bld->type.norm) {
853 const char *intrinsic = NULL;
854
855 if(b == bld->one)
856 return bld->zero;
857
858 if (!type.floating && !type.fixed) {
859 if (type.width * type.length == 128) {
860 if (util_cpu_caps.has_sse2) {
861 if(type.width == 8)
862 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
863 if(type.width == 16)
864 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
865 } else if (util_cpu_caps.has_altivec) {
866 if(type.width == 8)
867 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
868 if(type.width == 16)
869 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
870 }
871 }
872 if (type.width * type.length == 256) {
873 if (util_cpu_caps.has_avx2) {
874 if(type.width == 8)
875 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
876 if(type.width == 16)
877 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
878 }
879 }
880 }
881
882 if (intrinsic)
883 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
884 }
885
886 if(type.norm && !type.floating && !type.fixed) {
887 if (type.sign) {
888 uint64_t sign = (uint64_t)1 << (type.width - 1);
889 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
890 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
891 /* a_clamp_max is the maximum a for negative b,
892 a_clamp_min is the minimum a for positive b. */
893 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
894 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
895 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
896 } else {
897 a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
898 }
899 }
900
901 if(LLVMIsConstant(a) && LLVMIsConstant(b))
902 if (type.floating)
903 res = LLVMConstFSub(a, b);
904 else
905 res = LLVMConstSub(a, b);
906 else
907 if (type.floating)
908 res = LLVMBuildFSub(builder, a, b, "");
909 else
910 res = LLVMBuildSub(builder, a, b, "");
911
912 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
913 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
914
915 return res;
916 }
917
918
919
920 /**
921 * Normalized multiplication.
922 *
923 * There are several approaches for (using 8-bit normalized multiplication as
924 * an example):
925 *
926 * - alpha plus one
927 *
928 * makes the following approximation to the division (Sree)
929 *
930  *       a*b/255 ~= (a*(b + 1)) >> 8
931 *
932 * which is the fastest method that satisfies the following OpenGL criteria of
933 *
934 * 0*0 = 0 and 255*255 = 255
935 *
936 * - geometric series
937 *
938 * takes the geometric series approximation to the division
939 *
940 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
941 *
942 * in this case just the first two terms to fit in 16bit arithmetic
943 *
944 * t/255 ~= (t + (t >> 8)) >> 8
945 *
946  *     note that just by itself it doesn't satisfy the OpenGL criteria, as
947  *     255*255 = 254, so the special case b = 255 must be accounted for, or
948  *     rounding must be used.
949 *
950 * - geometric series plus rounding
951 *
952  *     when using a geometric series division, instead of truncating the
953  *     result use rounding in the approximation (Jim Blinn)
954  *
955  *     t/255 ~= (t + (t >> 8) + 0x80) >> 8
956  *
957  *     achieving exact results.
958 *
959 *
960 *
961 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
962 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
963 * @sa Michael Herf, The "double blend trick", May 2000,
964 * http://www.stereopsis.com/doubleblend.html
965 */
966 static LLVMValueRef
967 lp_build_mul_norm(struct gallivm_state *gallivm,
968 struct lp_type wide_type,
969 LLVMValueRef a, LLVMValueRef b)
970 {
971 LLVMBuilderRef builder = gallivm->builder;
972 struct lp_build_context bld;
973 unsigned n;
974 LLVMValueRef half;
975 LLVMValueRef ab;
976
977 assert(!wide_type.floating);
978 assert(lp_check_value(wide_type, a));
979 assert(lp_check_value(wide_type, b));
980
981 lp_build_context_init(&bld, gallivm, wide_type);
982
983 n = wide_type.width / 2;
984 if (wide_type.sign) {
985 --n;
986 }
987
988 /*
989 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
990 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
991 */
992
993 /*
994 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
995 */
996
997 ab = LLVMBuildMul(builder, a, b, "");
998 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
999
1000 /*
1001 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1002 */
1003
1004 half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1005 if (wide_type.sign) {
1006 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1007 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1008 half = lp_build_select(&bld, sign, minus_half, half);
1009 }
1010 ab = LLVMBuildAdd(builder, ab, half, "");
1011
1012 /* Final division */
1013 ab = lp_build_shr_imm(&bld, ab, n);
1014
1015 return ab;
1016 }
1017
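/*
 * Worked example of the rounding approximation above (illustrative, unsigned
 * 8-bit values widened to 16 bits, so n = 8):
 *
 *    a = b = 255:  ab = 65025
 *                  ab + (ab >> 8) = 65025 + 254 = 65279
 *                  + half (128)                 = 65407
 *                  >> 8                         = 255   (exact)
 *
 *    a = b = 128:  ab = 16384 -> 16448 -> 16576 -> 64,
 *                  matching round(128*128/255) = round(64.25) = 64.
 */
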
1018 /**
1019 * Generate a * b
1020 */
1021 LLVMValueRef
1022 lp_build_mul(struct lp_build_context *bld,
1023 LLVMValueRef a,
1024 LLVMValueRef b)
1025 {
1026 LLVMBuilderRef builder = bld->gallivm->builder;
1027 const struct lp_type type = bld->type;
1028 LLVMValueRef shift;
1029 LLVMValueRef res;
1030
1031 assert(lp_check_value(type, a));
1032 assert(lp_check_value(type, b));
1033
1034 if(a == bld->zero)
1035 return bld->zero;
1036 if(a == bld->one)
1037 return b;
1038 if(b == bld->zero)
1039 return bld->zero;
1040 if(b == bld->one)
1041 return a;
1042 if(a == bld->undef || b == bld->undef)
1043 return bld->undef;
1044
1045 if (!type.floating && !type.fixed && type.norm) {
1046 struct lp_type wide_type = lp_wider_type(type);
1047 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1048
1049 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1050 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1051
1052 /* PMULLW, PSRLW, PADDW */
1053 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1054 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1055
1056 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1057
1058 return ab;
1059 }
1060
1061 if(type.fixed)
1062 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1063 else
1064 shift = NULL;
1065
1066 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1067 if (type.floating)
1068 res = LLVMConstFMul(a, b);
1069 else
1070 res = LLVMConstMul(a, b);
1071 if(shift) {
1072 if(type.sign)
1073 res = LLVMConstAShr(res, shift);
1074 else
1075 res = LLVMConstLShr(res, shift);
1076 }
1077 }
1078 else {
1079 if (type.floating)
1080 res = LLVMBuildFMul(builder, a, b, "");
1081 else
1082 res = LLVMBuildMul(builder, a, b, "");
1083 if(shift) {
1084 if(type.sign)
1085 res = LLVMBuildAShr(builder, res, shift, "");
1086 else
1087 res = LLVMBuildLShr(builder, res, shift, "");
1088 }
1089 }
1090
1091 return res;
1092 }
1093
1094 /*
1095 * Widening mul, valid for 32x32 bit -> 64bit only.
1096 * Result is low 32bits, high bits returned in res_hi.
1097 *
1098 * Emits code that is meant to be compiled for the host CPU.
1099 */
1100 LLVMValueRef
1101 lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1102 LLVMValueRef a,
1103 LLVMValueRef b,
1104 LLVMValueRef *res_hi)
1105 {
1106 struct gallivm_state *gallivm = bld->gallivm;
1107 LLVMBuilderRef builder = gallivm->builder;
1108
1109 assert(bld->type.width == 32);
1110 assert(bld->type.floating == 0);
1111 assert(bld->type.fixed == 0);
1112 assert(bld->type.norm == 0);
1113
1114 /*
1115 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1116 * for x86 simd is atrocious (even if the high bits weren't required),
1117 * trying to handle real 64bit inputs (which of course can't happen due
1118 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1119 * apparently llvm does not recognize this widening mul). This includes 6
1120 * (instead of 2) pmuludq plus extra adds and shifts
1121 * The same story applies to signed mul, albeit fixing this requires sse41.
1122 * https://llvm.org/bugs/show_bug.cgi?id=30845
1123 * So, whip up our own code, albeit only for length 4 and 8 (which
1124 * should be good enough)...
1125 */
1126 if ((bld->type.length == 4 || bld->type.length == 8) &&
1127 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1128 util_cpu_caps.has_sse4_1)) {
1129 const char *intrinsic = NULL;
1130 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1131 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1132 struct lp_type type_wide = lp_wider_type(bld->type);
1133 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1134 unsigned i;
1135 for (i = 0; i < bld->type.length; i += 2) {
1136 shuf[i] = lp_build_const_int32(gallivm, i+1);
1137 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1138 }
1139 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140 aeven = a;
1141 beven = b;
1142 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1143 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1144
1145 if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1146 if (bld->type.sign) {
1147 intrinsic = "llvm.x86.avx2.pmul.dq";
1148 } else {
1149 intrinsic = "llvm.x86.avx2.pmulu.dq";
1150 }
1151 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1152 wider_type, aeven, beven);
1153 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1154 wider_type, aodd, bodd);
1155 }
1156 else {
1157 /* for consistent naming look elsewhere... */
1158 if (bld->type.sign) {
1159 intrinsic = "llvm.x86.sse41.pmuldq";
1160 } else {
1161 intrinsic = "llvm.x86.sse2.pmulu.dq";
1162 }
1163 /*
1164 * XXX If we only have AVX but not AVX2 this is a pain.
1165 * lp_build_intrinsic_binary_anylength() can't handle it
1166 * (due to src and dst type not being identical).
1167 */
1168 if (bld->type.length == 8) {
1169 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1170 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1171 LLVMValueRef muleven2[2], mulodd2[2];
1172 struct lp_type type_wide_half = type_wide;
1173 LLVMTypeRef wtype_half;
1174 type_wide_half.length = 2;
1175 wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1176 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1177 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1178 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1179 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1180 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1181 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1182 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1183 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1184 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1185 wtype_half, aevenlo, bevenlo);
1186 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1187 wtype_half, aoddlo, boddlo);
1188 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1189 wtype_half, aevenhi, bevenhi);
1190 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1191 wtype_half, aoddhi, boddhi);
1192 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1193 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1194
1195 }
1196 else {
1197 muleven = lp_build_intrinsic_binary(builder, intrinsic,
1198 wider_type, aeven, beven);
1199 mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1200 wider_type, aodd, bodd);
1201 }
1202 }
1203 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1204 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1205
1206 for (i = 0; i < bld->type.length; i += 2) {
1207 shuf[i] = lp_build_const_int32(gallivm, i + 1);
1208 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1209 }
1210 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1211 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1212
1213 for (i = 0; i < bld->type.length; i += 2) {
1214 shuf[i] = lp_build_const_int32(gallivm, i);
1215 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1216 }
1217 shuf_vec = LLVMConstVector(shuf, bld->type.length);
1218 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1219 }
1220 else {
1221 return lp_build_mul_32_lohi(bld, a, b, res_hi);
1222 }
1223 }
1224
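/*
 * Illustrative lane layout for the even/odd split above with a 4 x i32 vector
 * (the lane values are hypothetical).  pmuludq/pmuldq multiply the even lanes
 * of their inputs, producing two 64-bit products per 128-bit register:
 *
 *    a       = {a0, a1, a2, a3}
 *    aodd    = {a1, ., a3, .}        (shuffle mask {1, undef, 3, undef})
 *    muleven = {a0*b0, a2*b2}        (as 2 x i64)
 *    mulodd  = {a1*b1, a3*b3}        (as 2 x i64)
 *
 * After bitcasting back to 4 x i32, each 64-bit product occupies a lo/hi lane
 * pair (little-endian lane order on x86), and the two final shuffles gather
 * the lo lanes into the returned value and the hi lanes into *res_hi,
 * restoring the original lane order.
 */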
1225
1226 /*
1227 * Widening mul, valid for 32x32 bit -> 64bit only.
1228 * Result is low 32bits, high bits returned in res_hi.
1229 *
1230 * Emits generic code.
1231 */
1232 LLVMValueRef
1233 lp_build_mul_32_lohi(struct lp_build_context *bld,
1234 LLVMValueRef a,
1235 LLVMValueRef b,
1236 LLVMValueRef *res_hi)
1237 {
1238 struct gallivm_state *gallivm = bld->gallivm;
1239 LLVMBuilderRef builder = gallivm->builder;
1240 LLVMValueRef tmp, shift, res_lo;
1241 struct lp_type type_tmp;
1242 LLVMTypeRef wide_type, narrow_type;
1243
1244 type_tmp = bld->type;
1245 narrow_type = lp_build_vec_type(gallivm, type_tmp);
1246 type_tmp.width *= 2;
1247 wide_type = lp_build_vec_type(gallivm, type_tmp);
1248 shift = lp_build_const_vec(gallivm, type_tmp, 32);
1249
1250 if (bld->type.sign) {
1251 a = LLVMBuildSExt(builder, a, wide_type, "");
1252 b = LLVMBuildSExt(builder, b, wide_type, "");
1253 } else {
1254 a = LLVMBuildZExt(builder, a, wide_type, "");
1255 b = LLVMBuildZExt(builder, b, wide_type, "");
1256 }
1257 tmp = LLVMBuildMul(builder, a, b, "");
1258
1259 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1260
1261 /* Since we truncate anyway, LShr and AShr are equivalent. */
1262 tmp = LLVMBuildLShr(builder, tmp, shift, "");
1263 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1264
1265 return res_lo;
1266 }
1267
1268
1269 /* a * b + c */
1270 LLVMValueRef
1271 lp_build_mad(struct lp_build_context *bld,
1272 LLVMValueRef a,
1273 LLVMValueRef b,
1274 LLVMValueRef c)
1275 {
1276 const struct lp_type type = bld->type;
1277 if (type.floating) {
1278 return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1279 } else {
1280 return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1281 }
1282 }
1283
1284
1285 /**
1286 * Small vector x scale multiplication optimization.
1287 */
1288 LLVMValueRef
1289 lp_build_mul_imm(struct lp_build_context *bld,
1290 LLVMValueRef a,
1291 int b)
1292 {
1293 LLVMBuilderRef builder = bld->gallivm->builder;
1294 LLVMValueRef factor;
1295
1296 assert(lp_check_value(bld->type, a));
1297
1298 if(b == 0)
1299 return bld->zero;
1300
1301 if(b == 1)
1302 return a;
1303
1304 if(b == -1)
1305 return lp_build_negate(bld, a);
1306
1307 if(b == 2 && bld->type.floating)
1308 return lp_build_add(bld, a, a);
1309
1310 if(util_is_power_of_two(b)) {
1311 unsigned shift = ffs(b) - 1;
1312
1313 if(bld->type.floating) {
1314 #if 0
1315 /*
1316 * Power of two multiplication by directly manipulating the exponent.
1317 *
1318 * XXX: This might not be always faster, it will introduce a small error
1319 * for multiplication by zero, and it will produce wrong results
1320 * for Inf and NaN.
1321 */
1322 unsigned mantissa = lp_mantissa(bld->type);
1323 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1324 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1325 a = LLVMBuildAdd(builder, a, factor, "");
1326 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1327 return a;
1328 #endif
1329 }
1330 else {
1331 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1332 return LLVMBuildShl(builder, a, factor, "");
1333 }
1334 }
1335
1336 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1337 return lp_build_mul(bld, a, factor);
1338 }
1339
1340
1341 /**
1342 * Generate a / b
1343 */
1344 LLVMValueRef
1345 lp_build_div(struct lp_build_context *bld,
1346 LLVMValueRef a,
1347 LLVMValueRef b)
1348 {
1349 LLVMBuilderRef builder = bld->gallivm->builder;
1350 const struct lp_type type = bld->type;
1351
1352 assert(lp_check_value(type, a));
1353 assert(lp_check_value(type, b));
1354
1355 if(a == bld->zero)
1356 return bld->zero;
1357 if(a == bld->one && type.floating)
1358 return lp_build_rcp(bld, b);
1359 if(b == bld->zero)
1360 return bld->undef;
1361 if(b == bld->one)
1362 return a;
1363 if(a == bld->undef || b == bld->undef)
1364 return bld->undef;
1365
1366 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1367 if (type.floating)
1368 return LLVMConstFDiv(a, b);
1369 else if (type.sign)
1370 return LLVMConstSDiv(a, b);
1371 else
1372 return LLVMConstUDiv(a, b);
1373 }
1374
1375 /* fast rcp is disabled (just uses div), so makes no sense to try that */
1376 if(FALSE &&
1377 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1378 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1379 type.floating)
1380 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1381
1382 if (type.floating)
1383 return LLVMBuildFDiv(builder, a, b, "");
1384 else if (type.sign)
1385 return LLVMBuildSDiv(builder, a, b, "");
1386 else
1387 return LLVMBuildUDiv(builder, a, b, "");
1388 }
1389
1390
1391 /**
1392 * Linear interpolation helper.
1393 *
1394 * @param normalized whether we are interpolating normalized values,
1395 * encoded in normalized integers, twice as wide.
1396 *
1397 * @sa http://www.stereopsis.com/doubleblend.html
1398 */
1399 static inline LLVMValueRef
1400 lp_build_lerp_simple(struct lp_build_context *bld,
1401 LLVMValueRef x,
1402 LLVMValueRef v0,
1403 LLVMValueRef v1,
1404 unsigned flags)
1405 {
1406 unsigned half_width = bld->type.width/2;
1407 LLVMBuilderRef builder = bld->gallivm->builder;
1408 LLVMValueRef delta;
1409 LLVMValueRef res;
1410
1411 assert(lp_check_value(bld->type, x));
1412 assert(lp_check_value(bld->type, v0));
1413 assert(lp_check_value(bld->type, v1));
1414
1415 delta = lp_build_sub(bld, v1, v0);
1416
1417 if (bld->type.floating) {
1418 assert(flags == 0);
1419 return lp_build_mad(bld, x, delta, v0);
1420 }
1421
1422 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1423 if (!bld->type.sign) {
1424 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1425 /*
1426 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1427              * most-significant bit to the least-significant bit, so that
1428 * later we can just divide by 2**n instead of 2**n - 1.
1429 */
1430
1431 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1432 }
1433
1434 /* (x * delta) >> n */
1435 res = lp_build_mul(bld, x, delta);
1436 res = lp_build_shr_imm(bld, res, half_width);
1437 } else {
1438 /*
1439 * The rescaling trick above doesn't work for signed numbers, so
1440 * use the 2**n - 1 divison approximation in lp_build_mul_norm
1441           * use the 2**n - 1 division approximation in lp_build_mul_norm
1442 */
1443 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1444 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1445 }
1446 } else {
1447 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1448 res = lp_build_mul(bld, x, delta);
1449 }
1450
1451 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1452 /*
1453 * At this point both res and v0 only use the lower half of the bits,
1454 * the rest is zero. Instead of add / mask, do add with half wide type.
1455 */
1456 struct lp_type narrow_type;
1457 struct lp_build_context narrow_bld;
1458
1459 memset(&narrow_type, 0, sizeof narrow_type);
1460 narrow_type.sign = bld->type.sign;
1461 narrow_type.width = bld->type.width/2;
1462 narrow_type.length = bld->type.length*2;
1463
1464 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1465 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1466 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1467 res = lp_build_add(&narrow_bld, v0, res);
1468 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1469 } else {
1470 res = lp_build_add(bld, v0, res);
1471
1472 if (bld->type.fixed) {
1473 /*
1474           * We need to mask out the high order bits when lerping 8-bit
1475           * normalized colors stored in 16 bits
1476           */
1477          /* XXX: This step is necessary for lerping 8-bit colors stored in
1478           * 16 bits, but it will be wrong for true fixed point use cases.
1479 * Basically we need a more powerful lp_type, capable of further
1480 * distinguishing the values interpretation from the value storage.
1481 */
1482 LLVMValueRef low_bits;
1483 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1484 res = LLVMBuildAnd(builder, res, low_bits, "");
1485 }
1486 }
1487
1488 return res;
1489 }
1490
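/*
 * Worked example of the unsigned weight rescaling above (illustrative,
 * 8-bit values in 16-bit lanes, so half_width = 8):
 *
 *    x = 255, v0 = 10, v1 = 200:  delta = 190
 *        x += x >> 7       ->  x = 256
 *        (x * delta) >> 8  ->  190
 *        + v0              ->  200    (exactly v1, as expected at x = 1.0)
 *
 *    x = 0 trivially yields v0, so both end points are reproduced exactly.
 */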
1491
1492 /**
1493 * Linear interpolation.
1494 */
1495 LLVMValueRef
1496 lp_build_lerp(struct lp_build_context *bld,
1497 LLVMValueRef x,
1498 LLVMValueRef v0,
1499 LLVMValueRef v1,
1500 unsigned flags)
1501 {
1502 const struct lp_type type = bld->type;
1503 LLVMValueRef res;
1504
1505 assert(lp_check_value(type, x));
1506 assert(lp_check_value(type, v0));
1507 assert(lp_check_value(type, v1));
1508
1509 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1510
1511 if (type.norm) {
1512 struct lp_type wide_type;
1513 struct lp_build_context wide_bld;
1514 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1515
1516 assert(type.length >= 2);
1517
1518 /*
1519 * Create a wider integer type, enough to hold the
1520 * intermediate result of the multiplication.
1521 */
1522 memset(&wide_type, 0, sizeof wide_type);
1523 wide_type.sign = type.sign;
1524 wide_type.width = type.width*2;
1525 wide_type.length = type.length/2;
1526
1527 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1528
1529 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh);
1530 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1531 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1532
1533 /*
1534 * Lerp both halves.
1535 */
1536
1537 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1538
1539 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1540 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1541
1542 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1543 } else {
1544 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1545 }
1546
1547 return res;
1548 }
1549
1550
1551 /**
1552 * Bilinear interpolation.
1553 *
1554  * Value indices are in v_{yx} order, i.e. v01 is the value at x = 1, y = 0.
1555 */
1556 LLVMValueRef
1557 lp_build_lerp_2d(struct lp_build_context *bld,
1558 LLVMValueRef x,
1559 LLVMValueRef y,
1560 LLVMValueRef v00,
1561 LLVMValueRef v01,
1562 LLVMValueRef v10,
1563 LLVMValueRef v11,
1564 unsigned flags)
1565 {
1566 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1567 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1568 return lp_build_lerp(bld, y, v0, v1, flags);
1569 }
1570
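/*
 * For reference, the bilinear weighting computed above is (illustrative,
 * written with normalized weights x, y in [0, 1]):
 *
 *    v = (1 - y) * ((1 - x) * v00 + x * v01)
 *      +      y  * ((1 - x) * v10 + x * v11)
 *
 * i.e. two lerps along x followed by one lerp along y.
 */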
1571
1572 LLVMValueRef
1573 lp_build_lerp_3d(struct lp_build_context *bld,
1574 LLVMValueRef x,
1575 LLVMValueRef y,
1576 LLVMValueRef z,
1577 LLVMValueRef v000,
1578 LLVMValueRef v001,
1579 LLVMValueRef v010,
1580 LLVMValueRef v011,
1581 LLVMValueRef v100,
1582 LLVMValueRef v101,
1583 LLVMValueRef v110,
1584 LLVMValueRef v111,
1585 unsigned flags)
1586 {
1587 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1588 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1589 return lp_build_lerp(bld, z, v0, v1, flags);
1590 }
1591
1592
1593 /**
1594 * Generate min(a, b)
1595  * Do checks for special cases but not for NaNs.
1596 */
1597 LLVMValueRef
1598 lp_build_min(struct lp_build_context *bld,
1599 LLVMValueRef a,
1600 LLVMValueRef b)
1601 {
1602 assert(lp_check_value(bld->type, a));
1603 assert(lp_check_value(bld->type, b));
1604
1605 if(a == bld->undef || b == bld->undef)
1606 return bld->undef;
1607
1608 if(a == b)
1609 return a;
1610
1611 if (bld->type.norm) {
1612 if (!bld->type.sign) {
1613 if (a == bld->zero || b == bld->zero) {
1614 return bld->zero;
1615 }
1616 }
1617 if(a == bld->one)
1618 return b;
1619 if(b == bld->one)
1620 return a;
1621 }
1622
1623 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1624 }
1625
1626
1627 /**
1628 * Generate min(a, b)
1629  * NaNs are handled according to the behavior specified by the
1630 * nan_behavior argument.
1631 */
1632 LLVMValueRef
1633 lp_build_min_ext(struct lp_build_context *bld,
1634 LLVMValueRef a,
1635 LLVMValueRef b,
1636 enum gallivm_nan_behavior nan_behavior)
1637 {
1638 assert(lp_check_value(bld->type, a));
1639 assert(lp_check_value(bld->type, b));
1640
1641 if(a == bld->undef || b == bld->undef)
1642 return bld->undef;
1643
1644 if(a == b)
1645 return a;
1646
1647 if (bld->type.norm) {
1648 if (!bld->type.sign) {
1649 if (a == bld->zero || b == bld->zero) {
1650 return bld->zero;
1651 }
1652 }
1653 if(a == bld->one)
1654 return b;
1655 if(b == bld->one)
1656 return a;
1657 }
1658
1659 return lp_build_min_simple(bld, a, b, nan_behavior);
1660 }
1661
1662 /**
1663 * Generate max(a, b)
1664 * Do checks for special cases, but NaN behavior is undefined.
1665 */
1666 LLVMValueRef
1667 lp_build_max(struct lp_build_context *bld,
1668 LLVMValueRef a,
1669 LLVMValueRef b)
1670 {
1671 assert(lp_check_value(bld->type, a));
1672 assert(lp_check_value(bld->type, b));
1673
1674 if(a == bld->undef || b == bld->undef)
1675 return bld->undef;
1676
1677 if(a == b)
1678 return a;
1679
1680 if(bld->type.norm) {
1681 if(a == bld->one || b == bld->one)
1682 return bld->one;
1683 if (!bld->type.sign) {
1684 if (a == bld->zero) {
1685 return b;
1686 }
1687 if (b == bld->zero) {
1688 return a;
1689 }
1690 }
1691 }
1692
1693 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1694 }
1695
1696
1697 /**
1698 * Generate max(a, b)
1699 * Checks for special cases.
1700  * NaNs are handled according to the behavior specified by the
1701 * nan_behavior argument.
1702 */
1703 LLVMValueRef
1704 lp_build_max_ext(struct lp_build_context *bld,
1705 LLVMValueRef a,
1706 LLVMValueRef b,
1707 enum gallivm_nan_behavior nan_behavior)
1708 {
1709 assert(lp_check_value(bld->type, a));
1710 assert(lp_check_value(bld->type, b));
1711
1712 if(a == bld->undef || b == bld->undef)
1713 return bld->undef;
1714
1715 if(a == b)
1716 return a;
1717
1718 if(bld->type.norm) {
1719 if(a == bld->one || b == bld->one)
1720 return bld->one;
1721 if (!bld->type.sign) {
1722 if (a == bld->zero) {
1723 return b;
1724 }
1725 if (b == bld->zero) {
1726 return a;
1727 }
1728 }
1729 }
1730
1731 return lp_build_max_simple(bld, a, b, nan_behavior);
1732 }
1733
1734 /**
1735 * Generate clamp(a, min, max)
1736 * NaN behavior (for any of a, min, max) is undefined.
1737 * Do checks for special cases.
1738 */
1739 LLVMValueRef
1740 lp_build_clamp(struct lp_build_context *bld,
1741 LLVMValueRef a,
1742 LLVMValueRef min,
1743 LLVMValueRef max)
1744 {
1745 assert(lp_check_value(bld->type, a));
1746 assert(lp_check_value(bld->type, min));
1747 assert(lp_check_value(bld->type, max));
1748
1749 a = lp_build_min(bld, a, max);
1750 a = lp_build_max(bld, a, min);
1751 return a;
1752 }
1753
1754
1755 /**
1756 * Generate clamp(a, 0, 1)
1757 * A NaN will get converted to zero.
1758 */
1759 LLVMValueRef
1760 lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1761 LLVMValueRef a)
1762 {
1763 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1764 a = lp_build_min(bld, a, bld->one);
1765 return a;
1766 }
1767
1768
1769 /**
1770 * Generate abs(a)
1771 */
1772 LLVMValueRef
1773 lp_build_abs(struct lp_build_context *bld,
1774 LLVMValueRef a)
1775 {
1776 LLVMBuilderRef builder = bld->gallivm->builder;
1777 const struct lp_type type = bld->type;
1778 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1779
1780 assert(lp_check_value(type, a));
1781
1782 if(!type.sign)
1783 return a;
1784
1785 if(type.floating) {
1786 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1787 /* Workaround llvm.org/PR27332 */
1788 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1789 unsigned long long absMask = ~(1ULL << (type.width - 1));
1790 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1791 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1792 a = LLVMBuildAnd(builder, a, mask, "");
1793 a = LLVMBuildBitCast(builder, a, vec_type, "");
1794 return a;
1795 } else {
1796 char intrinsic[32];
1797 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1798 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1799 }
1800 }
1801
1802 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1803 switch(type.width) {
1804 case 8:
1805 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1806 case 16:
1807 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1808 case 32:
1809 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1810 }
1811 }
1812 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) {
1813 switch(type.width) {
1814 case 8:
1815 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1816 case 16:
1817 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1818 case 32:
1819 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1820 }
1821 }
1822 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1823 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1824 (type.width == 8 || type.width == 16 || type.width == 32)) {
1825 debug_printf("%s: inefficient code, should split vectors manually\n",
1826 __FUNCTION__);
1827 }
1828
1829 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1830 }
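/*
 * Illustration (added comment, not in the original): scalar sketch of the
 * sign-mask trick used in the PR27332 workaround path above. For IEEE-754
 * floats, clearing the top bit of the bit pattern yields the absolute value:
 *
 *    static inline float abs_via_mask(float x)        // hypothetical helper
 *    {
 *       union { float f; uint32_t u; } v = { x };
 *       v.u &= 0x7fffffffu;                           // clear the sign bit
 *       return v.f;
 *    }
 */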
1831
1832
1833 LLVMValueRef
1834 lp_build_negate(struct lp_build_context *bld,
1835 LLVMValueRef a)
1836 {
1837 LLVMBuilderRef builder = bld->gallivm->builder;
1838
1839 assert(lp_check_value(bld->type, a));
1840
1841 if (bld->type.floating)
1842 a = LLVMBuildFNeg(builder, a, "");
1843 else
1844 a = LLVMBuildNeg(builder, a, "");
1845
1846 return a;
1847 }
1848
1849
1850 /** Return -1, 0 or +1 depending on the sign of a */
1851 LLVMValueRef
1852 lp_build_sgn(struct lp_build_context *bld,
1853 LLVMValueRef a)
1854 {
1855 LLVMBuilderRef builder = bld->gallivm->builder;
1856 const struct lp_type type = bld->type;
1857 LLVMValueRef cond;
1858 LLVMValueRef res;
1859
1860 assert(lp_check_value(type, a));
1861
1862 /* Handle non-zero case */
1863 if(!type.sign) {
1864 /* if not zero then sign must be positive */
1865 res = bld->one;
1866 }
1867 else if(type.floating) {
1868 LLVMTypeRef vec_type;
1869 LLVMTypeRef int_type;
1870 LLVMValueRef mask;
1871 LLVMValueRef sign;
1872 LLVMValueRef one;
1873 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1874
1875 int_type = lp_build_int_vec_type(bld->gallivm, type);
1876 vec_type = lp_build_vec_type(bld->gallivm, type);
1877 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1878
1879 /* Take the sign bit and OR it into the constant 1.0 */
1880 sign = LLVMBuildBitCast(builder, a, int_type, "");
1881 sign = LLVMBuildAnd(builder, sign, mask, "");
1882 one = LLVMConstBitCast(bld->one, int_type);
1883 res = LLVMBuildOr(builder, sign, one, "");
1884 res = LLVMBuildBitCast(builder, res, vec_type, "");
1885 }
1886 else
1887 {
1888 /* signed int/norm/fixed point */
1889 /* could use psign with sse3 and appropriate vectors here */
1890 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1891 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1892 res = lp_build_select(bld, cond, bld->one, minus_one);
1893 }
1894
1895 /* Handle zero */
1896 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1897 res = lp_build_select(bld, cond, bld->zero, res);
1898
1899 return res;
1900 }
1901
1902
1903 /**
1904 * Set the sign of float vector 'a' according to 'sign'.
1905 * If sign==0, return abs(a).
1906 * If sign==1, return -abs(a);
1907 * Other values for sign produce undefined results.
1908 */
1909 LLVMValueRef
1910 lp_build_set_sign(struct lp_build_context *bld,
1911 LLVMValueRef a, LLVMValueRef sign)
1912 {
1913 LLVMBuilderRef builder = bld->gallivm->builder;
1914 const struct lp_type type = bld->type;
1915 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1916 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1917 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1918 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1919 ~((unsigned long long) 1 << (type.width - 1)));
1920 LLVMValueRef val, res;
1921
1922 assert(type.floating);
1923 assert(lp_check_value(type, a));
1924
1925 /* val = reinterpret_cast<int>(a) */
1926 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1927 /* val = val & mask */
1928 val = LLVMBuildAnd(builder, val, mask, "");
1929 /* sign = sign << shift */
1930 sign = LLVMBuildShl(builder, sign, shift, "");
1931 /* res = val | sign */
1932 res = LLVMBuildOr(builder, val, sign, "");
1933 /* res = reinterpret_cast<float>(res) */
1934 res = LLVMBuildBitCast(builder, res, vec_type, "");
1935
1936 return res;
1937 }
1938
1939
1940 /**
1941 * Convert vector of (or scalar) int to vector of (or scalar) float.
1942 */
1943 LLVMValueRef
1944 lp_build_int_to_float(struct lp_build_context *bld,
1945 LLVMValueRef a)
1946 {
1947 LLVMBuilderRef builder = bld->gallivm->builder;
1948 const struct lp_type type = bld->type;
1949 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1950
1951 assert(type.floating);
1952
1953 return LLVMBuildSIToFP(builder, a, vec_type, "");
1954 }
1955
1956 static boolean
1957 arch_rounding_available(const struct lp_type type)
1958 {
1959 if ((util_cpu_caps.has_sse4_1 &&
1960 (type.length == 1 || type.width*type.length == 128)) ||
1961 (util_cpu_caps.has_avx && type.width*type.length == 256))
1962 return TRUE;
1963 else if ((util_cpu_caps.has_altivec &&
1964 (type.width == 32 && type.length == 4)))
1965 return TRUE;
1966
1967 return FALSE;
1968 }
1969
1970 enum lp_build_round_mode
1971 {
1972 LP_BUILD_ROUND_NEAREST = 0,
1973 LP_BUILD_ROUND_FLOOR = 1,
1974 LP_BUILD_ROUND_CEIL = 2,
1975 LP_BUILD_ROUND_TRUNCATE = 3
1976 };
1977
1978 static inline LLVMValueRef
1979 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1980 LLVMValueRef a)
1981 {
1982 LLVMBuilderRef builder = bld->gallivm->builder;
1983 const struct lp_type type = bld->type;
1984 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1985 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1986 const char *intrinsic;
1987 LLVMValueRef res;
1988
1989 assert(type.floating);
1990 /* using the double precision conversions is a bit more complicated */
1991 assert(type.width == 32);
1992
1993 assert(lp_check_value(type, a));
1994 assert(util_cpu_caps.has_sse2);
1995
1996 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1997 if (type.length == 1) {
1998 LLVMTypeRef vec_type;
1999 LLVMValueRef undef;
2000 LLVMValueRef arg;
2001 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2002
2003 vec_type = LLVMVectorType(bld->elem_type, 4);
2004
2005 intrinsic = "llvm.x86.sse.cvtss2si";
2006
2007 undef = LLVMGetUndef(vec_type);
2008
2009 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2010
2011 res = lp_build_intrinsic_unary(builder, intrinsic,
2012 ret_type, arg);
2013 }
2014 else {
2015 if (type.width* type.length == 128) {
2016 intrinsic = "llvm.x86.sse2.cvtps2dq";
2017 }
2018 else {
2019 assert(type.width*type.length == 256);
2020 assert(util_cpu_caps.has_avx);
2021
2022 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2023 }
2024 res = lp_build_intrinsic_unary(builder, intrinsic,
2025 ret_type, a);
2026 }
2027
2028 return res;
2029 }
2030
2031
2032 /* Round to an integral value according to 'mode', using the AltiVec
2033 * vrfi{n,m,p,z} intrinsics. */
2034 static inline LLVMValueRef
2035 lp_build_round_altivec(struct lp_build_context *bld,
2036 LLVMValueRef a,
2037 enum lp_build_round_mode mode)
2038 {
2039 LLVMBuilderRef builder = bld->gallivm->builder;
2040 const struct lp_type type = bld->type;
2041 const char *intrinsic = NULL;
2042
2043 assert(type.floating);
2044
2045 assert(lp_check_value(type, a));
2046 assert(util_cpu_caps.has_altivec);
2047
2048 (void)type;
2049
2050 switch (mode) {
2051 case LP_BUILD_ROUND_NEAREST:
2052 intrinsic = "llvm.ppc.altivec.vrfin";
2053 break;
2054 case LP_BUILD_ROUND_FLOOR:
2055 intrinsic = "llvm.ppc.altivec.vrfim";
2056 break;
2057 case LP_BUILD_ROUND_CEIL:
2058 intrinsic = "llvm.ppc.altivec.vrfip";
2059 break;
2060 case LP_BUILD_ROUND_TRUNCATE:
2061 intrinsic = "llvm.ppc.altivec.vrfiz";
2062 break;
2063 }
2064
2065 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2066 }
2067
2068 static inline LLVMValueRef
2069 lp_build_round_arch(struct lp_build_context *bld,
2070 LLVMValueRef a,
2071 enum lp_build_round_mode mode)
2072 {
2073 if (util_cpu_caps.has_sse4_1) {
2074 LLVMBuilderRef builder = bld->gallivm->builder;
2075 const struct lp_type type = bld->type;
2076 const char *intrinsic_root;
2077 char intrinsic[32];
2078
2079 assert(type.floating);
2080 assert(lp_check_value(type, a));
2081 (void)type;
2082
2083 switch (mode) {
2084 case LP_BUILD_ROUND_NEAREST:
2085 intrinsic_root = "llvm.nearbyint";
2086 break;
2087 case LP_BUILD_ROUND_FLOOR:
2088 intrinsic_root = "llvm.floor";
2089 break;
2090 case LP_BUILD_ROUND_CEIL:
2091 intrinsic_root = "llvm.ceil";
2092 break;
2093 case LP_BUILD_ROUND_TRUNCATE:
2094 intrinsic_root = "llvm.trunc";
2095 break;
2096 }
2097
2098 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2099 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2100 }
2101 else /* (util_cpu_caps.has_altivec) */
2102 return lp_build_round_altivec(bld, a, mode);
2103 }
2104
2105 /**
2106 * Return the integer part of a float (vector) value (== round toward zero).
2107 * The returned value is a float (vector).
2108 * Ex: trunc(-1.5) = -1.0
2109 */
2110 LLVMValueRef
2111 lp_build_trunc(struct lp_build_context *bld,
2112 LLVMValueRef a)
2113 {
2114 LLVMBuilderRef builder = bld->gallivm->builder;
2115 const struct lp_type type = bld->type;
2116
2117 assert(type.floating);
2118 assert(lp_check_value(type, a));
2119
2120 if (arch_rounding_available(type)) {
2121 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2122 }
2123 else {
2124 const struct lp_type type = bld->type;
2125 struct lp_type inttype;
2126 struct lp_build_context intbld;
2127 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2128 LLVMValueRef trunc, res, anosign, mask;
2129 LLVMTypeRef int_vec_type = bld->int_vec_type;
2130 LLVMTypeRef vec_type = bld->vec_type;
2131
2132 assert(type.width == 32); /* might want to handle doubles at some point */
2133
2134 inttype = type;
2135 inttype.floating = 0;
2136 lp_build_context_init(&intbld, bld->gallivm, inttype);
2137
2138 /* round by truncation */
2139 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2140 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2141
2142 /* mask out sign bit */
2143 anosign = lp_build_abs(bld, a);
2144 /*
2145 * mask out all values if anosign > 2^24
2146 * This should work both for large ints (all rounding is a no-op for them
2147 * because such floats are always exact) as well as special cases like
2148 * NaNs, Infs (taking advantage of the fact they use the max exponent).
2149 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2150 */
2151 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2152 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2153 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2154 return lp_build_select(bld, mask, a, res);
2155 }
2156 }
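/*
 * Illustration (added comment, not in the original): a scalar sketch of the
 * emulation path above, assuming 32-bit floats. Values are truncated via an
 * int round-trip, and the original value is kept whenever |a| > 2^24 (the
 * compare is done on the integer bit pattern, so it also catches Infs/NaNs):
 *
 *    static inline float trunc_emul(float a)          // hypothetical helper
 *    {
 *       float t = (float)(int)a;                      // round toward zero
 *       union { float f; int32_t i; } m = { fabsf(a) };
 *       return m.i > 0x4b800000 ? a : t;              // 0x4b800000 == 2^24
 *    }
 */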
2157
2158
2159 /**
2160 * Return float (vector) rounded to nearest integer (vector). The returned
2161 * value is a float (vector).
2162 * Ex: round(0.9) = 1.0
2163 * Ex: round(-1.5) = -2.0
2164 */
2165 LLVMValueRef
2166 lp_build_round(struct lp_build_context *bld,
2167 LLVMValueRef a)
2168 {
2169 LLVMBuilderRef builder = bld->gallivm->builder;
2170 const struct lp_type type = bld->type;
2171
2172 assert(type.floating);
2173 assert(lp_check_value(type, a));
2174
2175 if (arch_rounding_available(type)) {
2176 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2177 }
2178 else {
2179 const struct lp_type type = bld->type;
2180 struct lp_type inttype;
2181 struct lp_build_context intbld;
2182 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2183 LLVMValueRef res, anosign, mask;
2184 LLVMTypeRef int_vec_type = bld->int_vec_type;
2185 LLVMTypeRef vec_type = bld->vec_type;
2186
2187 assert(type.width == 32); /* might want to handle doubles at some point */
2188
2189 inttype = type;
2190 inttype.floating = 0;
2191 lp_build_context_init(&intbld, bld->gallivm, inttype);
2192
2193 res = lp_build_iround(bld, a);
2194 res = LLVMBuildSIToFP(builder, res, vec_type, "");
2195
2196 /* mask out sign bit */
2197 anosign = lp_build_abs(bld, a);
2198 /*
2199 * mask out all values if anosign > 2^24
2200 * This should work both for large ints (all rounding is a no-op for them
2201 * because such floats are always exact) as well as special cases like
2202 * NaNs, Infs (taking advantage of the fact they use the max exponent).
2203 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2204 */
2205 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2206 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2207 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2208 return lp_build_select(bld, mask, a, res);
2209 }
2210 }
2211
2212
2213 /**
2214 * Return floor of float (vector), result is a float (vector)
2215 * Ex: floor(1.1) = 1.0
2216 * Ex: floor(-1.1) = -2.0
2217 */
2218 LLVMValueRef
2219 lp_build_floor(struct lp_build_context *bld,
2220 LLVMValueRef a)
2221 {
2222 LLVMBuilderRef builder = bld->gallivm->builder;
2223 const struct lp_type type = bld->type;
2224
2225 assert(type.floating);
2226 assert(lp_check_value(type, a));
2227
2228 if (arch_rounding_available(type)) {
2229 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2230 }
2231 else {
2232 const struct lp_type type = bld->type;
2233 struct lp_type inttype;
2234 struct lp_build_context intbld;
2235 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2236 LLVMValueRef trunc, res, anosign, mask;
2237 LLVMTypeRef int_vec_type = bld->int_vec_type;
2238 LLVMTypeRef vec_type = bld->vec_type;
2239
2240 if (type.width != 32) {
2241 char intrinsic[32];
2242 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2243 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2244 }
2245
2246 assert(type.width == 32); /* might want to handle doubles at some point */
2247
2248 inttype = type;
2249 inttype.floating = 0;
2250 lp_build_context_init(&intbld, bld->gallivm, inttype);
2251
2252 /* round by truncation */
2253 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2254 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2255
2256 if (type.sign) {
2257 LLVMValueRef tmp;
2258
2259 /*
2260 * fix values if rounding is wrong (for non-special cases)
2261 * - this is the case if trunc > a
2262 */
2263 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2264 /* tmp = trunc > a ? 1.0 : 0.0 */
2265 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2266 tmp = lp_build_and(&intbld, mask, tmp);
2267 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2268 res = lp_build_sub(bld, res, tmp);
2269 }
2270
2271 /* mask out sign bit */
2272 anosign = lp_build_abs(bld, a);
2273 /*
2274 * mask out all values if anosign > 2^24
2275 * This should work both for large ints (all rounding is a no-op for them
2276 * because such floats are always exact) as well as special cases like
2277 * NaNs, Infs (taking advantage of the fact they use the max exponent).
2278 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2279 */
2280 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2281 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2282 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2283 return lp_build_select(bld, mask, a, res);
2284 }
2285 }
2286
2287
2288 /**
2289 * Return ceiling of float (vector), returning float (vector).
2290 * Ex: ceil( 1.1) = 2.0
2291 * Ex: ceil(-1.1) = -1.0
2292 */
2293 LLVMValueRef
2294 lp_build_ceil(struct lp_build_context *bld,
2295 LLVMValueRef a)
2296 {
2297 LLVMBuilderRef builder = bld->gallivm->builder;
2298 const struct lp_type type = bld->type;
2299
2300 assert(type.floating);
2301 assert(lp_check_value(type, a));
2302
2303 if (arch_rounding_available(type)) {
2304 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2305 }
2306 else {
2307 const struct lp_type type = bld->type;
2308 struct lp_type inttype;
2309 struct lp_build_context intbld;
2310 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2311 LLVMValueRef trunc, res, anosign, mask, tmp;
2312 LLVMTypeRef int_vec_type = bld->int_vec_type;
2313 LLVMTypeRef vec_type = bld->vec_type;
2314
2315 if (type.width != 32) {
2316 char intrinsic[32];
2317 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2318 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2319 }
2320
2321 assert(type.width == 32); /* might want to handle doubles at some point */
2322
2323 inttype = type;
2324 inttype.floating = 0;
2325 lp_build_context_init(&intbld, bld->gallivm, inttype);
2326
2327 /* round by truncation */
2328 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2329 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2330
2331 /*
2332 * fix values if rounding is wrong (for non-special cases)
2333 * - this is the case if trunc < a
2334 */
2335 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2336 /* tmp = trunc < a ? 1.0 : 0.0 */
2337 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2338 tmp = lp_build_and(&intbld, mask, tmp);
2339 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2340 res = lp_build_add(bld, trunc, tmp);
2341
2342 /* mask out sign bit */
2343 anosign = lp_build_abs(bld, a);
2344 /*
2345 * mask out all values if anosign > 2^24
2346 * This should work both for large ints (all rounding is a no-op for them
2347 * because such floats are always exact) as well as special cases like
2348 * NaNs, Infs (taking advantage of the fact they use the max exponent).
2349 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2350 */
2351 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2352 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2353 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2354 return lp_build_select(bld, mask, a, res);
2355 }
2356 }
2357
2358
2359 /**
2360 * Return fractional part of 'a' computed as a - floor(a)
2361 * Typically used in texture coord arithmetic.
2362 */
2363 LLVMValueRef
2364 lp_build_fract(struct lp_build_context *bld,
2365 LLVMValueRef a)
2366 {
2367 assert(bld->type.floating);
2368 return lp_build_sub(bld, a, lp_build_floor(bld, a));
2369 }
2370
2371
2372 /**
2373 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2374 * against 0.99999(9). (Will also return that value for NaNs.)
2375 */
2376 static inline LLVMValueRef
2377 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2378 {
2379 LLVMValueRef max;
2380
2381 /* this is the largest number smaller than 1.0 representable as float */
2382 max = lp_build_const_vec(bld->gallivm, bld->type,
2383 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2384 return lp_build_min_ext(bld, fract, max,
2385 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2386 }
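/*
 * Added note (not in the original): with the usual 23-bit mantissa the
 * constant above evaluates to 1.0 - 2^-24 (bit pattern 0x3f7fffff,
 * ~0.99999994), i.e. the largest single-precision value strictly below 1.0.
 */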
2387
2388
2389 /**
2390 * Same as lp_build_fract, but guarantees that the result is always smaller
2391 * than one. Will also return the smaller-than-one value for infs, NaNs.
2392 */
2393 LLVMValueRef
2394 lp_build_fract_safe(struct lp_build_context *bld,
2395 LLVMValueRef a)
2396 {
2397 return clamp_fract(bld, lp_build_fract(bld, a));
2398 }
2399
2400
2401 /**
2402 * Return the integer part of a float (vector) value (== round toward zero).
2403 * The returned value is an integer (vector).
2404 * Ex: itrunc(-1.5) = -1
2405 */
2406 LLVMValueRef
2407 lp_build_itrunc(struct lp_build_context *bld,
2408 LLVMValueRef a)
2409 {
2410 LLVMBuilderRef builder = bld->gallivm->builder;
2411 const struct lp_type type = bld->type;
2412 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2413
2414 assert(type.floating);
2415 assert(lp_check_value(type, a));
2416
2417 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2418 }
2419
2420
2421 /**
2422 * Return float (vector) rounded to nearest integer (vector). The returned
2423 * value is an integer (vector).
2424 * Ex: iround(0.9) = 1
2425 * Ex: iround(-1.5) = -2
2426 */
2427 LLVMValueRef
2428 lp_build_iround(struct lp_build_context *bld,
2429 LLVMValueRef a)
2430 {
2431 LLVMBuilderRef builder = bld->gallivm->builder;
2432 const struct lp_type type = bld->type;
2433 LLVMTypeRef int_vec_type = bld->int_vec_type;
2434 LLVMValueRef res;
2435
2436 assert(type.floating);
2437
2438 assert(lp_check_value(type, a));
2439
2440 if ((util_cpu_caps.has_sse2 &&
2441 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2442 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2443 return lp_build_iround_nearest_sse2(bld, a);
2444 }
2445 if (arch_rounding_available(type)) {
2446 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2447 }
2448 else {
2449 LLVMValueRef half;
2450
2451 half = lp_build_const_vec(bld->gallivm, type, 0.5);
2452
2453 if (type.sign) {
2454 LLVMTypeRef vec_type = bld->vec_type;
2455 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2456 (unsigned long long)1 << (type.width - 1));
2457 LLVMValueRef sign;
2458
2459 /* get sign bit */
2460 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2461 sign = LLVMBuildAnd(builder, sign, mask, "");
2462
2463 /* sign * 0.5 */
2464 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2465 half = LLVMBuildOr(builder, sign, half, "");
2466 half = LLVMBuildBitCast(builder, half, vec_type, "");
2467 }
2468
2469 res = LLVMBuildFAdd(builder, a, half, "");
2470 }
2471
2472 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2473
2474 return res;
2475 }
2476
2477
2478 /**
2479 * Return floor of float (vector), result is an int (vector)
2480 * Ex: ifloor(1.1) = 1
2481 * Ex: ifloor(-1.1) = -2
2482 */
2483 LLVMValueRef
2484 lp_build_ifloor(struct lp_build_context *bld,
2485 LLVMValueRef a)
2486 {
2487 LLVMBuilderRef builder = bld->gallivm->builder;
2488 const struct lp_type type = bld->type;
2489 LLVMTypeRef int_vec_type = bld->int_vec_type;
2490 LLVMValueRef res;
2491
2492 assert(type.floating);
2493 assert(lp_check_value(type, a));
2494
2495 res = a;
2496 if (type.sign) {
2497 if (arch_rounding_available(type)) {
2498 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2499 }
2500 else {
2501 struct lp_type inttype;
2502 struct lp_build_context intbld;
2503 LLVMValueRef trunc, itrunc, mask;
2504
2505 assert(type.floating);
2506 assert(lp_check_value(type, a));
2507
2508 inttype = type;
2509 inttype.floating = 0;
2510 lp_build_context_init(&intbld, bld->gallivm, inttype);
2511
2512 /* round by truncation */
2513 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2514 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2515
2516 /*
2517 * fix values if rounding is wrong (for non-special cases)
2518 * - this is the case if trunc > a
2519 * The results of doing this with NaNs, very large values etc.
2520 * are undefined but this seems to be the case anyway.
2521 */
2522 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2523 /* cheapie minus one with mask since the mask is minus one / zero */
2524 return lp_build_add(&intbld, itrunc, mask);
2525 }
2526 }
2527
2528 /* convert to int (round toward zero) */
2529 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2530
2531 return res;
2532 }
2533
2534
2535 /**
2536 * Return ceiling of float (vector), returning int (vector).
2537 * Ex: iceil( 1.1) = 2
2538 * Ex: iceil(-1.1) = -1
2539 */
2540 LLVMValueRef
2541 lp_build_iceil(struct lp_build_context *bld,
2542 LLVMValueRef a)
2543 {
2544 LLVMBuilderRef builder = bld->gallivm->builder;
2545 const struct lp_type type = bld->type;
2546 LLVMTypeRef int_vec_type = bld->int_vec_type;
2547 LLVMValueRef res;
2548
2549 assert(type.floating);
2550 assert(lp_check_value(type, a));
2551
2552 if (arch_rounding_available(type)) {
2553 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2554 }
2555 else {
2556 struct lp_type inttype;
2557 struct lp_build_context intbld;
2558 LLVMValueRef trunc, itrunc, mask;
2559
2560 assert(type.floating);
2561 assert(lp_check_value(type, a));
2562
2563 inttype = type;
2564 inttype.floating = 0;
2565 lp_build_context_init(&intbld, bld->gallivm, inttype);
2566
2567 /* round by truncation */
2568 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2569 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2570
2571 /*
2572 * fix values if rounding is wrong (for non-special cases)
2573 * - this is the case if trunc < a
2574 * The results of doing this with NaNs, very large values etc.
2575 * are undefined but this seems to be the case anyway.
2576 */
2577 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2578 /* cheapie plus one with mask since the mask is minus one / zero */
2579 return lp_build_sub(&intbld, itrunc, mask);
2580 }
2581
2582 /* convert to int (round toward zero) */
2583 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2584
2585 return res;
2586 }
2587
2588
2589 /**
2590 * Combined ifloor() & fract().
2591 *
2592 * Preferred to calling the functions separately, as it will ensure that the
2593 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2594 */
2595 void
2596 lp_build_ifloor_fract(struct lp_build_context *bld,
2597 LLVMValueRef a,
2598 LLVMValueRef *out_ipart,
2599 LLVMValueRef *out_fpart)
2600 {
2601 LLVMBuilderRef builder = bld->gallivm->builder;
2602 const struct lp_type type = bld->type;
2603 LLVMValueRef ipart;
2604
2605 assert(type.floating);
2606 assert(lp_check_value(type, a));
2607
2608 if (arch_rounding_available(type)) {
2609 /*
2610 * floor() is easier.
2611 */
2612
2613 ipart = lp_build_floor(bld, a);
2614 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2615 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2616 }
2617 else {
2618 /*
2619 * ifloor() is easier.
2620 */
2621
2622 *out_ipart = lp_build_ifloor(bld, a);
2623 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2624 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2625 }
2626 }
2627
2628
2629 /**
2630 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2631 * always smaller than one.
2632 */
2633 void
2634 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2635 LLVMValueRef a,
2636 LLVMValueRef *out_ipart,
2637 LLVMValueRef *out_fpart)
2638 {
2639 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2640 *out_fpart = clamp_fract(bld, *out_fpart);
2641 }
2642
2643
2644 LLVMValueRef
2645 lp_build_sqrt(struct lp_build_context *bld,
2646 LLVMValueRef a)
2647 {
2648 LLVMBuilderRef builder = bld->gallivm->builder;
2649 const struct lp_type type = bld->type;
2650 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2651 char intrinsic[32];
2652
2653 assert(lp_check_value(type, a));
2654
2655 assert(type.floating);
2656 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2657
2658 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2659 }
2660
2661
2662 /**
2663 * Do one Newton-Raphson step to improve reciprocal precision:
2664 *
2665 * x_{i+1} = x_i * (2 - a * x_i)
2666 *
2667 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2668 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2669 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2670 * halo. It would be necessary to clamp the argument to prevent this.
2671 *
2672 * See also:
2673 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2674 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2675 */
2676 static inline LLVMValueRef
2677 lp_build_rcp_refine(struct lp_build_context *bld,
2678 LLVMValueRef a,
2679 LLVMValueRef rcp_a)
2680 {
2681 LLVMBuilderRef builder = bld->gallivm->builder;
2682 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2683 LLVMValueRef res;
2684
2685 res = LLVMBuildFMul(builder, a, rcp_a, "");
2686 res = LLVMBuildFSub(builder, two, res, "");
2687 res = LLVMBuildFMul(builder, rcp_a, res, "");
2688
2689 return res;
2690 }
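/*
 * Added derivation (not in the original): why one step helps. If
 * x_i = (1/a) * (1 + e) for some relative error e, then
 *
 *    x_{i+1} = x_i * (2 - a * x_i)
 *            = (1/a) * (1 + e) * (1 - e)
 *            = (1/a) * (1 - e^2)
 *
 * so the relative error is squared by each step, roughly doubling the number
 * of accurate bits per iteration.
 */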
2691
2692
2693 LLVMValueRef
2694 lp_build_rcp(struct lp_build_context *bld,
2695 LLVMValueRef a)
2696 {
2697 LLVMBuilderRef builder = bld->gallivm->builder;
2698 const struct lp_type type = bld->type;
2699
2700 assert(lp_check_value(type, a));
2701
2702 if(a == bld->zero)
2703 return bld->undef;
2704 if(a == bld->one)
2705 return bld->one;
2706 if(a == bld->undef)
2707 return bld->undef;
2708
2709 assert(type.floating);
2710
2711 if(LLVMIsConstant(a))
2712 return LLVMConstFDiv(bld->one, a);
2713
2714 /*
2715 * We don't use RCPPS because:
2716 * - it only has 10 bits of precision
2717 * - it doesn't even get the reciprocal of 1.0 exactly
2718 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2719 * - for recent processors the benefit over DIVPS is marginal and
2720 * case-dependent
2721 *
2722 * We could still use it on certain processors if benchmarks show that the
2723 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2724 * particular uses that require fewer workarounds.
2725 */
2726
2727 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2728 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2729 const unsigned num_iterations = 0;
2730 LLVMValueRef res;
2731 unsigned i;
2732 const char *intrinsic = NULL;
2733
2734 if (type.length == 4) {
2735 intrinsic = "llvm.x86.sse.rcp.ps";
2736 }
2737 else {
2738 intrinsic = "llvm.x86.avx.rcp.ps.256";
2739 }
2740
2741 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2742
2743 for (i = 0; i < num_iterations; ++i) {
2744 res = lp_build_rcp_refine(bld, a, res);
2745 }
2746
2747 return res;
2748 }
2749
2750 return LLVMBuildFDiv(builder, bld->one, a, "");
2751 }
2752
2753
2754 /**
2755 * Do one Newton-Raphson step to improve rsqrt precision:
2756 *
2757 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2758 *
2759 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2760 */
2761 static inline LLVMValueRef
2762 lp_build_rsqrt_refine(struct lp_build_context *bld,
2763 LLVMValueRef a,
2764 LLVMValueRef rsqrt_a)
2765 {
2766 LLVMBuilderRef builder = bld->gallivm->builder;
2767 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2768 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2769 LLVMValueRef res;
2770
2771 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2772 res = LLVMBuildFMul(builder, a, res, "");
2773 res = LLVMBuildFSub(builder, three, res, "");
2774 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2775 res = LLVMBuildFMul(builder, half, res, "");
2776
2777 return res;
2778 }
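/*
 * Added derivation (not in the original): with x_i = (1/sqrt(a)) * (1 + e),
 * a * x_i * x_i = (1 + e)^2, so
 *
 *    x_{i+1} = 0.5 * x_i * (3 - (1 + e)^2)
 *            = (1/sqrt(a)) * (1 - 1.5*e^2 - 0.5*e^3)
 *
 * i.e. as with the rcp refinement above, the error term is squared and the
 * precision roughly doubles with each step.
 */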
2779
2780
2781 /**
2782 * Generate 1/sqrt(a).
2783 * Result is undefined for values < 0, infinity for +0.
2784 */
2785 LLVMValueRef
2786 lp_build_rsqrt(struct lp_build_context *bld,
2787 LLVMValueRef a)
2788 {
2789 const struct lp_type type = bld->type;
2790
2791 assert(lp_check_value(type, a));
2792
2793 assert(type.floating);
2794
2795 /*
2796 * This should be faster but all denormals will end up as infinity.
2797 */
2798 if (0 && lp_build_fast_rsqrt_available(type)) {
2799 const unsigned num_iterations = 1;
2800 LLVMValueRef res;
2801 unsigned i;
2802
2803 /* rsqrt(1.0) != 1.0 here */
2804 res = lp_build_fast_rsqrt(bld, a);
2805
2806 if (num_iterations) {
2807 /*
2808 * Newton-Raphson will result in NaN instead of infinity for zero,
2809 * and NaN instead of zero for infinity.
2810 * Also, need to ensure rsqrt(1.0) == 1.0.
2811 * All numbers smaller than FLT_MIN will result in +infinity
2812 * (rsqrtps treats all denormals as zero).
2813 */
2814 LLVMValueRef cmp;
2815 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2816 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2817
2818 for (i = 0; i < num_iterations; ++i) {
2819 res = lp_build_rsqrt_refine(bld, a, res);
2820 }
2821 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2822 res = lp_build_select(bld, cmp, inf, res);
2823 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2824 res = lp_build_select(bld, cmp, bld->zero, res);
2825 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2826 res = lp_build_select(bld, cmp, bld->one, res);
2827 }
2828
2829 return res;
2830 }
2831
2832 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2833 }
2834
2835 /**
2836 * Returns whether a fast (but inaccurate) rsqrt instruction is available.
2837 * Callers may want to check this before using rsqrt_fast: e.g. x^0.5 can
2838 * be computed as rsqrt_fast(x) * x, but without a hardware rsqrt that
2839 * expands to sqrt/div/mul, in which case calling sqrt directly (skipping
2840 * both the div and the mul) is clearly better.
2841 */
2842 boolean
2843 lp_build_fast_rsqrt_available(struct lp_type type)
2844 {
2845 assert(type.floating);
2846
2847 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2848 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2849 return true;
2850 }
2851 return false;
2852 }
2853
2854
2855 /**
2856 * Generate 1/sqrt(a).
2857 * Result is undefined for values < 0, infinity for +0.
2858 * Precision is limited, only ~10 bits guaranteed
2859 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2860 */
2861 LLVMValueRef
2862 lp_build_fast_rsqrt(struct lp_build_context *bld,
2863 LLVMValueRef a)
2864 {
2865 LLVMBuilderRef builder = bld->gallivm->builder;
2866 const struct lp_type type = bld->type;
2867
2868 assert(lp_check_value(type, a));
2869
2870 if (lp_build_fast_rsqrt_available(type)) {
2871 const char *intrinsic = NULL;
2872
2873 if (type.length == 4) {
2874 intrinsic = "llvm.x86.sse.rsqrt.ps";
2875 }
2876 else {
2877 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2878 }
2879 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2880 }
2881 else {
2882 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2883 }
2884 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2885 }
2886
2887
2888 /**
2889 * Generate sin(a) or cos(a) using polynomial approximation.
2890 * TODO: it might be worth recognizing sin and cos with the same source
2891 * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
2892 * way cheaper than calculating (nearly) everything twice...
2893 * Not sure it's common enough to be worth bothering with; however, the
2894 * scs opcode could also benefit from computing both.
2895 */
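/*
 * Added outline (not in the original; the step-by-step comments in the body
 * below are the authoritative description). The cephes-style algorithm is:
 *   1. take |a| and keep the sign information separately;
 *   2. range-reduce: j = (int)(|a| * 4/Pi), rounded up to an even value,
 *      then x = |a| - j*Pi/4 using the split constants DP1/DP2/DP3;
 *   3. evaluate either the cos or the sin minimax polynomial of x depending
 *      on the quadrant (poly_mask);
 *   4. restore the sign from the quadrant bits (and, for sin, the input sign).
 */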
2896 static LLVMValueRef
2897 lp_build_sin_or_cos(struct lp_build_context *bld,
2898 LLVMValueRef a,
2899 boolean cos)
2900 {
2901 struct gallivm_state *gallivm = bld->gallivm;
2902 LLVMBuilderRef b = gallivm->builder;
2903 struct lp_type int_type = lp_int_type(bld->type);
2904
2905 /*
2906 * take the absolute value,
2907 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2908 */
2909
2910 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2911 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2912
2913 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2914 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2915
2916 /*
2917 * scale by 4/Pi
2918 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2919 */
2920
2921 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2922 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2923
2924 /*
2925 * store the integer part of y in mm0
2926 * emm2 = _mm_cvttps_epi32(y);
2927 */
2928
2929 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2930
2931 /*
2932 * j=(j+1) & (~1) (see the cephes sources)
2933 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2934 */
2935
2936 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2937 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2938 /*
2939 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2940 */
2941 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2942 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2943
2944 /*
2945 * y = _mm_cvtepi32_ps(emm2);
2946 */
2947 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2948
2949 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2950 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2951 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2952 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2953
2954 /*
2955 * Argument used for poly selection and sign bit determination
2956 * is different for sin vs. cos.
2957 */
2958 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2959 emm2_and;
2960
2961 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2962 LLVMBuildNot(b, emm2_2, ""), ""),
2963 const_29, "sign_bit") :
2964 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2965 LLVMBuildShl(b, emm2_add,
2966 const_29, ""), ""),
2967 sign_mask, "sign_bit");
2968
2969 /*
2970 * get the polynomial selection mask:
2971 * there is one polynomial for 0 <= x <= Pi/4
2972 * and another one for Pi/4 < x <= Pi/2
2973 * Both branches will be computed.
2974 *
2975 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2976 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2977 */
2978
2979 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2980 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2981 int_type, PIPE_FUNC_EQUAL,
2982 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2983
2984 /*
2985 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2986 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2987 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2988 */
2989 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2990 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2991 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2992
2993 /*
2994 * The magic pass: "Extended precision modular arithmetic"
2995 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2996 */
2997 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2998 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2999 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
3000
3001 /*
3002 * Evaluate the first polynomial (0 <= x <= Pi/4)
3003 *
3004 * z = _mm_mul_ps(x,x);
3005 */
3006 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3007
3008 /*
3009 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
3010 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3011 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
3012 */
3013 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3014 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3015 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3016
3017 /*
3018 * y = *(v4sf*)_ps_coscof_p0;
3019 * y = _mm_mul_ps(y, z);
3020 */
3021 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3022 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3023 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3024 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3025
3026
3027 /*
3028 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3029 * y = _mm_sub_ps(y, tmp);
3030 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3031 */
3032 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3033 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3034 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3035 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3036 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3037
3038 /*
3039 * _PS_CONST(sincof_p0, -1.9515295891E-4);
3040 * _PS_CONST(sincof_p1, 8.3321608736E-3);
3041 * _PS_CONST(sincof_p2, -1.6666654611E-1);
3042 */
3043 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3044 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3045 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3046
3047 /*
3048 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
3049 *
3050 * y2 = *(v4sf*)_ps_sincof_p0;
3051 * y2 = _mm_mul_ps(y2, z);
3052 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3053 * y2 = _mm_mul_ps(y2, z);
3054 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3055 * y2 = _mm_mul_ps(y2, z);
3056 * y2 = _mm_mul_ps(y2, x);
3057 * y2 = _mm_add_ps(y2, x);
3058 */
3059
3060 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3061 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3062 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3063 LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3064
3065 /*
3066 * select the correct result from the two polynomials
3067 * xmm3 = poly_mask;
3068 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3069 * y = _mm_andnot_ps(xmm3, y);
3070 * y = _mm_or_ps(y,y2);
3071 */
3072 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3073 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3074 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3075 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3076 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3077 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3078
3079 /*
3080 * update the sign
3081 * y = _mm_xor_ps(y, sign_bit);
3082 */
3083 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3084 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3085
3086 LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3087
3088 /* clamp output to be within [-1, 1] */
3089 y_result = lp_build_clamp(bld, y_result,
3090 lp_build_const_vec(bld->gallivm, bld->type, -1.f),
3091 lp_build_const_vec(bld->gallivm, bld->type, 1.f));
3092 /* If a is -inf, inf or NaN then return NaN */
3093 y_result = lp_build_select(bld, isfinite, y_result,
3094 lp_build_const_vec(bld->gallivm, bld->type, NAN));
3095 return y_result;
3096 }
3097
3098
3099 /**
3100 * Generate sin(a)
3101 */
3102 LLVMValueRef
3103 lp_build_sin(struct lp_build_context *bld,
3104 LLVMValueRef a)
3105 {
3106 return lp_build_sin_or_cos(bld, a, FALSE);
3107 }
3108
3109
3110 /**
3111 * Generate cos(a)
3112 */
3113 LLVMValueRef
3114 lp_build_cos(struct lp_build_context *bld,
3115 LLVMValueRef a)
3116 {
3117 return lp_build_sin_or_cos(bld, a, TRUE);
3118 }
3119
3120
3121 /**
3122 * Generate pow(x, y)
3123 */
3124 LLVMValueRef
3125 lp_build_pow(struct lp_build_context *bld,
3126 LLVMValueRef x,
3127 LLVMValueRef y)
3128 {
3129 /* TODO: optimize the constant case */
3130 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3131 LLVMIsConstant(x) && LLVMIsConstant(y)) {
3132 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3133 __FUNCTION__);
3134 }
3135
3136 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
3137 }
3138
3139
3140 /**
3141 * Generate exp(x)
3142 */
3143 LLVMValueRef
3144 lp_build_exp(struct lp_build_context *bld,
3145 LLVMValueRef x)
3146 {
3147 /* log2(e) = 1/log(2) */
3148 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3149 1.4426950408889634);
3150
3151 assert(lp_check_value(bld->type, x));
3152
3153 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3154 }
3155
3156
3157 /**
3158 * Generate log(x)
3159 * Behavior is undefined with infs, 0s and nans
3160 */
3161 LLVMValueRef
3162 lp_build_log(struct lp_build_context *bld,
3163 LLVMValueRef x)
3164 {
3165 /* log(2) */
3166 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3167 0.69314718055994529);
3168
3169 assert(lp_check_value(bld->type, x));
3170
3171 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3172 }
3173
3174 /**
3175 * Generate log(x) that handles edge cases (infs, 0s and nans)
3176 */
3177 LLVMValueRef
3178 lp_build_log_safe(struct lp_build_context *bld,
3179 LLVMValueRef x)
3180 {
3181 /* log(2) */
3182 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3183 0.69314718055994529);
3184
3185 assert(lp_check_value(bld->type, x));
3186
3187 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3188 }
3189
3190
3191 /**
3192 * Generate polynomial.
3193 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3194 */
3195 LLVMValueRef
3196 lp_build_polynomial(struct lp_build_context *bld,
3197 LLVMValueRef x,
3198 const double *coeffs,
3199 unsigned num_coeffs)
3200 {
3201 const struct lp_type type = bld->type;
3202 LLVMValueRef even = NULL, odd = NULL;
3203 LLVMValueRef x2;
3204 unsigned i;
3205
3206 assert(lp_check_value(bld->type, x));
3207
3208 /* TODO: optimize the constant case */
3209 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3210 LLVMIsConstant(x)) {
3211 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3212 __FUNCTION__);
3213 }
3214
3215 /*
3216 * Calculate odd and even terms separately to decrease data dependencies
3217 * Ex:
3218 * c[0] + x^2 * c[2] + x^4 * c[4] ...
3219 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3220 */
3221 x2 = lp_build_mul(bld, x, x);
3222
3223 for (i = num_coeffs; i--; ) {
3224 LLVMValueRef coeff;
3225
3226 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3227
3228 if (i % 2 == 0) {
3229 if (even)
3230 even = lp_build_mad(bld, x2, even, coeff);
3231 else
3232 even = coeff;
3233 } else {
3234 if (odd)
3235 odd = lp_build_mad(bld, x2, odd, coeff);
3236 else
3237 odd = coeff;
3238 }
3239 }
3240
3241 if (odd)
3242 return lp_build_mad(bld, odd, x, even);
3243 else if (even)
3244 return even;
3245 else
3246 return bld->undef;
3247 }
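/*
 * Added illustration (not in the original): scalar sketch of the even/odd
 * split above, for c0 + c1*x + ... + c5*x^5:
 *
 *    float x2   = x * x;
 *    float even = (c4 * x2 + c2) * x2 + c0;   // even-power Horner chain
 *    float odd  = (c5 * x2 + c3) * x2 + c1;   // odd-power Horner chain
 *    float res  = odd * x + even;
 *
 * The two chains have no data dependency on each other, so they can be
 * evaluated in parallel (ideally as fused multiply-adds), roughly halving
 * the dependency depth of a plain Horner evaluation.
 */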
3248
3249
3250 /**
3251 * Minimax polynomial fit of 2**x, in range [0, 1[
3252 */
3253 const double lp_build_exp2_polynomial[] = {
3254 #if EXP_POLY_DEGREE == 5
3255 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3256 0.693153073200168932794,
3257 0.240153617044375388211,
3258 0.0558263180532956664775,
3259 0.00898934009049466391101,
3260 0.00187757667519147912699
3261 #elif EXP_POLY_DEGREE == 4
3262 1.00000259337069434683,
3263 0.693003834469974940458,
3264 0.24144275689150793076,
3265 0.0520114606103070150235,
3266 0.0135341679161270268764
3267 #elif EXP_POLY_DEGREE == 3
3268 0.999925218562710312959,
3269 0.695833540494823811697,
3270 0.226067155427249155588,
3271 0.0780245226406372992967
3272 #elif EXP_POLY_DEGREE == 2
3273 1.00172476321474503578,
3274 0.657636275736077639316,
3275 0.33718943461968720704
3276 #else
3277 #error
3278 #endif
3279 };
3280
3281
3282 LLVMValueRef
3283 lp_build_exp2(struct lp_build_context *bld,
3284 LLVMValueRef x)
3285 {
3286 LLVMBuilderRef builder = bld->gallivm->builder;
3287 const struct lp_type type = bld->type;
3288 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3289 LLVMValueRef ipart = NULL;
3290 LLVMValueRef fpart = NULL;
3291 LLVMValueRef expipart = NULL;
3292 LLVMValueRef expfpart = NULL;
3293 LLVMValueRef res = NULL;
3294
3295 assert(lp_check_value(bld->type, x));
3296
3297 /* TODO: optimize the constant case */
3298 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3299 LLVMIsConstant(x)) {
3300 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3301 __FUNCTION__);
3302 }
3303
3304 assert(type.floating && type.width == 32);
3305
3306 /* We want to preserve NaN and make sure that for exp2 if x > 128,
3307 * the result is INF and if it's smaller than -126.9 the result is 0 */
3308 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x,
3309 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3310 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3311 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3312
3313 /* ipart = floor(x) */
3314 /* fpart = x - ipart */
3315 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3316
3317 /* expipart = (float) (1 << ipart) */
3318 expipart = LLVMBuildAdd(builder, ipart,
3319 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3320 expipart = LLVMBuildShl(builder, expipart,
3321 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3322 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3323
3324 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3325 ARRAY_SIZE(lp_build_exp2_polynomial));
3326
3327 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3328
3329 return res;
3330 }
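/*
 * Added illustration (not in the original): scalar sketch of the
 * decomposition used above, 2^x = 2^ipart * 2^fpart with ipart = floor(x)
 * and fpart in [0, 1). 2^ipart is constructed directly as an IEEE-754
 * exponent field; 2^fpart comes from the minimax polynomial ("poly" below is
 * a hypothetical stand-in for lp_build_exp2_polynomial, and the input
 * clamping done above is omitted):
 *
 *    int   ipart = (int)floorf(x);
 *    float fpart = x - (float)ipart;
 *    union { int32_t i; float f; } ei = { (ipart + 127) << 23 };  // 2^ipart
 *    float res   = ei.f * poly(fpart);                            // ~= 2^x
 */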
3331
3332
3333
3334 /**
3335 * Extract the exponent of an IEEE-754 floating point value.
3336 *
3337 * Optionally apply an integer bias.
3338 *
3339 * Result is an integer value with
3340 *
3341 * ifloor(log2(x)) + bias
3342 */
3343 LLVMValueRef
3344 lp_build_extract_exponent(struct lp_build_context *bld,
3345 LLVMValueRef x,
3346 int bias)
3347 {
3348 LLVMBuilderRef builder = bld->gallivm->builder;
3349 const struct lp_type type = bld->type;
3350 unsigned mantissa = lp_mantissa(type);
3351 LLVMValueRef res;
3352
3353 assert(type.floating);
3354
3355 assert(lp_check_value(bld->type, x));
3356
3357 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3358
3359 res = LLVMBuildLShr(builder, x,
3360 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3361 res = LLVMBuildAnd(builder, res,
3362 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3363 res = LLVMBuildSub(builder, res,
3364 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3365
3366 return res;
3367 }
3368
3369
3370 /**
3371 * Extract the mantissa of a floating point value.
3372 *
3373 * Result is a floating point value with
3374 *
3375 * x / 2**floor(log2(x))
3376 */
3377 LLVMValueRef
3378 lp_build_extract_mantissa(struct lp_build_context *bld,
3379 LLVMValueRef x)
3380 {
3381 LLVMBuilderRef builder = bld->gallivm->builder;
3382 const struct lp_type type = bld->type;
3383 unsigned mantissa = lp_mantissa(type);
3384 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3385 (1ULL << mantissa) - 1);
3386 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3387 LLVMValueRef res;
3388
3389 assert(lp_check_value(bld->type, x));
3390
3391 assert(type.floating);
3392
3393 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3394
3395 /* res = x / 2**ipart */
3396 res = LLVMBuildAnd(builder, x, mantmask, "");
3397 res = LLVMBuildOr(builder, res, one, "");
3398 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3399
3400 return res;
3401 }
3402
3403
3404
3405 /**
3406 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3407 * These coefficients can be generated with
3408 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3409 */
3410 const double lp_build_log2_polynomial[] = {
3411 #if LOG_POLY_DEGREE == 5
3412 2.88539008148777786488L,
3413 0.961796878841293367824L,
3414 0.577058946784739859012L,
3415 0.412914355135828735411L,
3416 0.308591899232910175289L,
3417 0.352376952300281371868L,
3418 #elif LOG_POLY_DEGREE == 4
3419 2.88539009343309178325L,
3420 0.961791550404184197881L,
3421 0.577440339438736392009L,
3422 0.403343858251329912514L,
3423 0.406718052498846252698L,
3424 #elif LOG_POLY_DEGREE == 3
3425 2.88538959748872753838L,
3426 0.961932915889597772928L,
3427 0.571118517972136195241L,
3428 0.493997535084709500285L,
3429 #else
3430 #error
3431 #endif
3432 };
3433
3434 /**
3435 * See http://www.devmaster.net/forums/showthread.php?p=43580
3436 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3437 * http://www.nezumi.demon.co.uk/consult/logx.htm
3438 *
3439 * If handle_edge_cases is true the function will perform computations
3440 * to match the required D3D10+ behavior for each of the edge cases.
3441 * That means that if input is:
3442 * - less than zero (down to and including -inf) then NaN will be returned
3443 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3444 * - +infinity, then +infinity will be returned
3445 * - NaN, then NaN will be returned
3446 *
3447 * Those checks are fairly expensive so if you don't need them make sure
3448 * handle_edge_cases is false.
3449 */
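/*
 * Added note (not in the original): the approximation below splits
 * x = 2^exp * mant with mant in [1, 2), so
 *
 *    log2(x) = exp + log2(mant)
 *
 * and log2(mant) is computed from y = (mant - 1) / (mant + 1) via
 *
 *    log2(mant) = (2/ln 2) * (y + y^3/3 + y^5/5 + ...)  ~=  y * P(y^2)
 *
 * where P is the minimax polynomial in lp_build_log2_polynomial (note its
 * leading coefficient, ~2.8854 = 2/ln 2).
 */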
3450 void
3451 lp_build_log2_approx(struct lp_build_context *bld,
3452 LLVMValueRef x,
3453 LLVMValueRef *p_exp,
3454 LLVMValueRef *p_floor_log2,
3455 LLVMValueRef *p_log2,
3456 boolean handle_edge_cases)
3457 {
3458 LLVMBuilderRef builder = bld->gallivm->builder;
3459 const struct lp_type type = bld->type;
3460 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3461 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3462
3463 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3464 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3465 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3466
3467 LLVMValueRef i = NULL;
3468 LLVMValueRef y = NULL;
3469 LLVMValueRef z = NULL;
3470 LLVMValueRef exp = NULL;
3471 LLVMValueRef mant = NULL;
3472 LLVMValueRef logexp = NULL;
3473 LLVMValueRef p_z = NULL;
3474 LLVMValueRef res = NULL;
3475
3476 assert(lp_check_value(bld->type, x));
3477
3478 if(p_exp || p_floor_log2 || p_log2) {
3479 /* TODO: optimize the constant case */
3480 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3481 LLVMIsConstant(x)) {
3482 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3483 __FUNCTION__);
3484 }
3485
3486 assert(type.floating && type.width == 32);
3487
3488 /*
3489 * We don't explicitly handle denormalized numbers. They will yield a
3490 * result in the neighbourhood of -127, which appears to be adequate
3491 * result in the neighbourhood of -127, which appears to be
3492 * adequate.
3493
3494 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3495
3496 /* exp = (float) exponent(x) */
3497 exp = LLVMBuildAnd(builder, i, expmask, "");
3498 }
3499
3500 if(p_floor_log2 || p_log2) {
3501 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3502 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3503 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3504 }
3505
3506 if (p_log2) {
3507 /* mant = 1 + (float) mantissa(x) */
3508 mant = LLVMBuildAnd(builder, i, mantmask, "");
3509 mant = LLVMBuildOr(builder, mant, one, "");
3510 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3511
3512 /* y = (mant - 1) / (mant + 1) */
3513 y = lp_build_div(bld,
3514 lp_build_sub(bld, mant, bld->one),
3515 lp_build_add(bld, mant, bld->one)
3516 );
3517
3518 /* z = y^2 */
3519 z = lp_build_mul(bld, y, y);
3520
3521 /* compute P(z) */
3522 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3523 ARRAY_SIZE(lp_build_log2_polynomial));
3524
3525 /* y * P(z) + logexp */
3526 res = lp_build_mad(bld, y, p_z, logexp);
3527
3528 if (type.floating && handle_edge_cases) {
3529 LLVMValueRef negmask, infmask, zmask;
3530 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3531 lp_build_const_vec(bld->gallivm, type, 0.0f));
3532 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3533 lp_build_const_vec(bld->gallivm, type, 0.0f));
3534 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3535 lp_build_const_vec(bld->gallivm, type, INFINITY));
3536
3537 /* If x is equal to inf make sure we return inf */
3538 res = lp_build_select(bld, infmask,
3539 lp_build_const_vec(bld->gallivm, type, INFINITY),
3540 res);
3541 /* If x is equal to 0, return -inf */
3542 res = lp_build_select(bld, zmask,
3543 lp_build_const_vec(bld->gallivm, type, -INFINITY),
3544 res);
3545 /* If x is nan or less than 0, return nan */
3546 res = lp_build_select(bld, negmask,
3547 lp_build_const_vec(bld->gallivm, type, NAN),
3548 res);
3549 }
3550 }
3551
3552 if (p_exp) {
3553 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3554 *p_exp = exp;
3555 }
3556
3557 if (p_floor_log2)
3558 *p_floor_log2 = logexp;
3559
3560 if (p_log2)
3561 *p_log2 = res;
3562 }
3563
3564
3565 /*
3566 * log2 implementation which doesn't have special code to
3567 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3568 * the results for those cases are undefined.
3569 */
3570 LLVMValueRef
3571 lp_build_log2(struct lp_build_context *bld,
3572 LLVMValueRef x)
3573 {
3574 LLVMValueRef res;
3575 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3576 return res;
3577 }
3578
3579 /*
3580 * Version of log2 which handles all edge cases.
3581 * Look at documentation of lp_build_log2_approx for
3582 * description of the behavior for each of the edge cases.
3583 */
3584 LLVMValueRef
3585 lp_build_log2_safe(struct lp_build_context *bld,
3586 LLVMValueRef x)
3587 {
3588 LLVMValueRef res;
3589 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3590 return res;
3591 }
3592
3593
3594 /**
3595 * Faster (and less accurate) log2.
3596 *
3597 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3598 *
3599 * Piece-wise linear approximation, with exact results when x is a
3600 * power of two.
3601 *
3602 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3603 */
3604 LLVMValueRef
3605 lp_build_fast_log2(struct lp_build_context *bld,
3606 LLVMValueRef x)
3607 {
3608 LLVMBuilderRef builder = bld->gallivm->builder;
3609 LLVMValueRef ipart;
3610 LLVMValueRef fpart;
3611
3612 assert(lp_check_value(bld->type, x));
3613
3614 assert(bld->type.floating);
3615
3616 /* ipart = floor(log2(x)) - 1 */
3617 ipart = lp_build_extract_exponent(bld, x, -1);
3618 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3619
3620 /* fpart = x / 2**ipart */
3621 fpart = lp_build_extract_mantissa(bld, x);
3622
3623 /* ipart + fpart */
3624 return LLVMBuildFAdd(builder, ipart, fpart, "");
3625 }
3626
3627
3628 /**
3629 * Fast implementation of iround(log2(x)).
3630 *
3631 * Not an approximation -- it should give accurate results all the time.
3632 */
3633 LLVMValueRef
3634 lp_build_ilog2(struct lp_build_context *bld,
3635 LLVMValueRef x)
3636 {
3637 LLVMBuilderRef builder = bld->gallivm->builder;
3638 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3639 LLVMValueRef ipart;
3640
3641 assert(bld->type.floating);
3642
3643 assert(lp_check_value(bld->type, x));
3644
3645 /* x * 2^(0.5), i.e. add 0.5 to log2(x) */
3646 x = LLVMBuildFMul(builder, x, sqrt2, "");
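/* Scaling by sqrt(2) turns the floor below into round-to-nearest:
 * floor(log2(x * sqrt(2))) = floor(log2(x) + 0.5).  E.g. log2(5) ~= 2.32
 * becomes ~2.82 after scaling, whose floor is 2.
 */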
3647
3648 /* ipart = floor(log2(x) + 0.5) */
3649 ipart = lp_build_extract_exponent(bld, x, 0);
3650
3651 return ipart;
3652 }
3653
3654 LLVMValueRef
3655 lp_build_mod(struct lp_build_context *bld,
3656 LLVMValueRef x,
3657 LLVMValueRef y)
3658 {
3659 LLVMBuilderRef builder = bld->gallivm->builder;
3660 LLVMValueRef res;
3661 const struct lp_type type = bld->type;
3662
3663 assert(lp_check_value(type, x));
3664 assert(lp_check_value(type, y));
3665
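/* Note: frem/srem follow C remainder semantics, i.e. the result takes the
 * sign of the dividend (e.g. mod(-7, 3) yields -1, not 2).
 */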
3666 if (type.floating)
3667 res = LLVMBuildFRem(builder, x, y, "");
3668 else if (type.sign)
3669 res = LLVMBuildSRem(builder, x, y, "");
3670 else
3671 res = LLVMBuildURem(builder, x, y, "");
3672 return res;
3673 }
3674
3675
3676 /*
3677 * For floating point inputs, creates and returns a mask which is
3678 * all 1's for channels which are NaN.
3679 * Channels which are not NaN will be all 0's.
3680 */
3681 LLVMValueRef
3682 lp_build_isnan(struct lp_build_context *bld,
3683 LLVMValueRef x)
3684 {
3685 LLVMValueRef mask;
3686 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3687
3688 assert(bld->type.floating);
3689 assert(lp_check_value(bld->type, x));
3690
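/* NaN is the only value that does not compare ordered-equal to itself, so
 * x == x (OEQ) is false exactly for NaN channels; negate and sign-extend
 * to get an all-1's integer mask per NaN channel.
 */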
3691 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3692 "isnotnan");
3693 mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3694 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3695 return mask;
3696 }
3697
3698 /* Returns all 1's for floating point channels that are finite,
3699 * and all 0's for channels that are -inf,
3700 * +inf or NaN. */
3701 LLVMValueRef
3702 lp_build_isfinite(struct lp_build_context *bld,
3703 LLVMValueRef x)
3704 {
3705 LLVMBuilderRef builder = bld->gallivm->builder;
3706 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3707 struct lp_type int_type = lp_int_type(bld->type);
3708 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3709 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3710 0x7f800000);
3711
3712 if (!bld->type.floating) {
3713 return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3714 }
3715 assert(bld->type.floating);
3716 assert(lp_check_value(bld->type, x));
3717 assert(bld->type.width == 32);
3718
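/* A single precision float is inf or NaN iff its exponent field
 * (bits 30:23, mask 0x7f800000) is all 1's; finite values therefore have
 * at least one exponent bit clear.
 */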
3719 intx = LLVMBuildAnd(builder, intx, infornan32, "");
3720 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3721 intx, infornan32);
3722 }
3723
3724 /*
3725 * Returns true if the number is NaN or inf, and false otherwise.
3726 * The input has to be a floating point vector.
3727 */
3728 LLVMValueRef
3729 lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3730 const struct lp_type type,
3731 LLVMValueRef x)
3732 {
3733 LLVMBuilderRef builder = gallivm->builder;
3734 struct lp_type int_type = lp_int_type(type);
3735 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3736 0x7f800000);
3737 LLVMValueRef ret;
3738
3739 assert(type.floating);
3740
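/* Same exponent-field trick as in lp_build_isfinite above, but testing for
 * equality: the exponent bits are all 1's exactly for inf and NaN.
 */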
3741 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3742 ret = LLVMBuildAnd(builder, ret, const0, "");
3743 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3744 ret, const0);
3745
3746 return ret;
3747 }
3748
3749
3750 LLVMValueRef
3751 lp_build_fpstate_get(struct gallivm_state *gallivm)
3752 {
3753 if (util_cpu_caps.has_sse) {
3754 LLVMBuilderRef builder = gallivm->builder;
3755 LLVMValueRef mxcsr_ptr = lp_build_alloca(
3756 gallivm,
3757 LLVMInt32TypeInContext(gallivm->context),
3758 "mxcsr_ptr");
3759 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3760 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3761 lp_build_intrinsic(builder,
3762 "llvm.x86.sse.stmxcsr",
3763 LLVMVoidTypeInContext(gallivm->context),
3764 &mxcsr_ptr8, 1, 0);
3765 return mxcsr_ptr;
3766 }
3767 return 0;
3768 }
3769
3770 void
3771 lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3772 boolean zero)
3773 {
3774 if (util_cpu_caps.has_sse) {
3775 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3776 int daz_ftz = _MM_FLUSH_ZERO_MASK;
3777
3778 LLVMBuilderRef builder = gallivm->builder;
3779 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3780 LLVMValueRef mxcsr =
3781 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3782
3783 if (util_cpu_caps.has_daz) {
3784 /* Enable denormals-are-zero mode */
3785 daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3786 }
3787 if (zero) {
3788 mxcsr = LLVMBuildOr(builder, mxcsr,
3789 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3790 } else {
3791 mxcsr = LLVMBuildAnd(builder, mxcsr,
3792 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3793 }
3794
3795 LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3796 lp_build_fpstate_set(gallivm, mxcsr_ptr);
3797 }
3798 }
3799
3800 void
3801 lp_build_fpstate_set(struct gallivm_state *gallivm,
3802 LLVMValueRef mxcsr_ptr)
3803 {
3804 if (util_cpu_caps.has_sse) {
3805 LLVMBuilderRef builder = gallivm->builder;
3806 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3807 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3808 lp_build_intrinsic(builder,
3809 "llvm.x86.sse.ldmxcsr",
3810 LLVMVoidTypeInContext(gallivm->context),
3811 &mxcsr_ptr, 1, 0);
3812 }
3813 }
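

/*
 * Typical usage of the three fpstate helpers above is a save / modify /
 * restore sequence around code that benefits from flush-to-zero (a sketch;
 * the surrounding code is illustrative, not taken from this file):
 *
 *    LLVMValueRef fpstate = lp_build_fpstate_get(gallivm);
 *    lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
 *    ...emit floating point code...
 *    lp_build_fpstate_set(gallivm, fpstate);   (restore the saved MXCSR)
 */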