util: treat denorm'ed floats like zero
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks for the special-case values a or b = 1 or 0 are done.
76 */
77 static LLVMValueRef
78 lp_build_min_simple(struct lp_build_context *bld,
79 LLVMValueRef a,
80 LLVMValueRef b)
81 {
82 const struct lp_type type = bld->type;
83 const char *intrinsic = NULL;
84 unsigned intr_size = 0;
85 LLVMValueRef cond;
86
87 assert(lp_check_value(type, a));
88 assert(lp_check_value(type, b));
89
90 /* TODO: optimize the constant case */
91
92 if (type.floating && util_cpu_caps.has_sse) {
93 if (type.width == 32) {
94 if (type.length == 1) {
95 intrinsic = "llvm.x86.sse.min.ss";
96 intr_size = 128;
97 }
98 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
99 intrinsic = "llvm.x86.sse.min.ps";
100 intr_size = 128;
101 }
102 else {
103 intrinsic = "llvm.x86.avx.min.ps.256";
104 intr_size = 256;
105 }
106 }
107 if (type.width == 64 && util_cpu_caps.has_sse2) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse2.min.sd";
110 intr_size = 128;
111 }
112 else if (type.length == 2 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse2.min.pd";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.pd.256";
118 intr_size = 256;
119 }
120 }
121 }
122 else if (type.floating && util_cpu_caps.has_altivec) {
123 if (type.width == 32 && type.length == 4) {
124 intrinsic = "llvm.ppc.altivec.vminfp";
125 intr_size = 128;
126 }
127 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
128 intr_size = 128;
129 if ((type.width == 8 || type.width == 16) &&
130 (type.width * type.length <= 64) &&
131 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
132 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
133 __FUNCTION__);
134 }
135 if (type.width == 8 && !type.sign) {
136 intrinsic = "llvm.x86.sse2.pminu.b";
137 }
138 else if (type.width == 16 && type.sign) {
139 intrinsic = "llvm.x86.sse2.pmins.w";
140 }
141 if (util_cpu_caps.has_sse4_1) {
142 if (type.width == 8 && type.sign) {
143 intrinsic = "llvm.x86.sse41.pminsb";
144 }
145 if (type.width == 16 && !type.sign) {
146 intrinsic = "llvm.x86.sse41.pminuw";
147 }
148 if (type.width == 32 && !type.sign) {
149 intrinsic = "llvm.x86.sse41.pminud";
150 }
151 if (type.width == 32 && type.sign) {
152 intrinsic = "llvm.x86.sse41.pminsd";
153 }
154 }
155 } else if (util_cpu_caps.has_altivec) {
156 intr_size = 128;
157 if (type.width == 8) {
158 if (!type.sign) {
159 intrinsic = "llvm.ppc.altivec.vminub";
160 } else {
161 intrinsic = "llvm.ppc.altivec.vminsb";
162 }
163 } else if (type.width == 16) {
164 if (!type.sign) {
165 intrinsic = "llvm.ppc.altivec.vminuh";
166 } else {
167 intrinsic = "llvm.ppc.altivec.vminsh";
168 }
169 } else if (type.width == 32) {
170 if (!type.sign) {
171 intrinsic = "llvm.ppc.altivec.vminuw";
172 } else {
173 intrinsic = "llvm.ppc.altivec.vminsw";
174 }
175 }
176 }
177
178 if(intrinsic) {
179 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180 type,
181 intr_size, a, b);
182 }
183
184 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
185 return lp_build_select(bld, cond, a, b);
186 }
187
188
189 /**
190 * Generate max(a, b)
191 * No checks for the special-case values a or b = 1 or 0 are done.
192 */
193 static LLVMValueRef
194 lp_build_max_simple(struct lp_build_context *bld,
195 LLVMValueRef a,
196 LLVMValueRef b)
197 {
198 const struct lp_type type = bld->type;
199 const char *intrinsic = NULL;
200 unsigned intr_size = 0;
201 LLVMValueRef cond;
202
203 assert(lp_check_value(type, a));
204 assert(lp_check_value(type, b));
205
206 /* TODO: optimize the constant case */
207
208 if (type.floating && util_cpu_caps.has_sse) {
209 if (type.width == 32) {
210 if (type.length == 1) {
211 intrinsic = "llvm.x86.sse.max.ss";
212 intr_size = 128;
213 }
214 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
215 intrinsic = "llvm.x86.sse.max.ps";
216 intr_size = 128;
217 }
218 else {
219 intrinsic = "llvm.x86.avx.max.ps.256";
220 intr_size = 256;
221 }
222 }
223 if (type.width == 64 && util_cpu_caps.has_sse2) {
224 if (type.length == 1) {
225 intrinsic = "llvm.x86.sse2.max.sd";
226 intr_size = 128;
227 }
228 else if (type.length == 2 || !util_cpu_caps.has_avx) {
229 intrinsic = "llvm.x86.sse2.max.pd";
230 intr_size = 128;
231 }
232 else {
233 intrinsic = "llvm.x86.avx.max.pd.256";
234 intr_size = 256;
235 }
236 }
237 }
238 else if (type.floating && util_cpu_caps.has_altivec) {
239 if (type.width == 32 && type.length == 4) {
240 intrinsic = "llvm.ppc.altivec.vmaxfp";
241 intr_size = 128;
242 }
243 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
244 intr_size = 128;
245 if ((type.width == 8 || type.width == 16) &&
246 (type.width * type.length <= 64) &&
247 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
248 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
249 __FUNCTION__);
250 }
251 if (type.width == 8 && !type.sign) {
252 intrinsic = "llvm.x86.sse2.pmaxu.b";
253 intr_size = 128;
254 }
255 else if (type.width == 16 && type.sign) {
256 intrinsic = "llvm.x86.sse2.pmaxs.w";
257 }
258 if (util_cpu_caps.has_sse4_1) {
259 if (type.width == 8 && type.sign) {
260 intrinsic = "llvm.x86.sse41.pmaxsb";
261 }
262 if (type.width == 16 && !type.sign) {
263 intrinsic = "llvm.x86.sse41.pmaxuw";
264 }
265 if (type.width == 32 && !type.sign) {
266 intrinsic = "llvm.x86.sse41.pmaxud";
267 }
268 if (type.width == 32 && type.sign) {
269 intrinsic = "llvm.x86.sse41.pmaxsd";
270 }
271 }
272 } else if (util_cpu_caps.has_altivec) {
273 intr_size = 128;
274 if (type.width == 8) {
275 if (!type.sign) {
276 intrinsic = "llvm.ppc.altivec.vmaxub";
277 } else {
278 intrinsic = "llvm.ppc.altivec.vmaxsb";
279 }
280 } else if (type.width == 16) {
281 if (!type.sign) {
282 intrinsic = "llvm.ppc.altivec.vmaxuh";
283 } else {
284 intrinsic = "llvm.ppc.altivec.vmaxsh";
285 }
286 } else if (type.width == 32) {
287 if (!type.sign) {
288 intrinsic = "llvm.ppc.altivec.vmaxuw";
289 } else {
290 intrinsic = "llvm.ppc.altivec.vmaxsw";
291 }
292 }
293 }
294
295 if(intrinsic) {
296 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
297 type,
298 intr_size, a, b);
299 }
300
301 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
302 return lp_build_select(bld, cond, a, b);
303 }
304
305
306 /**
307 * Generate 1 - a, or ~a depending on bld->type.
308 */
309 LLVMValueRef
310 lp_build_comp(struct lp_build_context *bld,
311 LLVMValueRef a)
312 {
313 LLVMBuilderRef builder = bld->gallivm->builder;
314 const struct lp_type type = bld->type;
315
316 assert(lp_check_value(type, a));
317
318 if(a == bld->one)
319 return bld->zero;
320 if(a == bld->zero)
321 return bld->one;
322
323 if(type.norm && !type.floating && !type.fixed && !type.sign) {
324 if(LLVMIsConstant(a))
325 return LLVMConstNot(a);
326 else
327 return LLVMBuildNot(builder, a, "");
328 }
329
330 if(LLVMIsConstant(a))
331 if (type.floating)
332 return LLVMConstFSub(bld->one, a);
333 else
334 return LLVMConstSub(bld->one, a);
335 else
336 if (type.floating)
337 return LLVMBuildFSub(builder, bld->one, a, "");
338 else
339 return LLVMBuildSub(builder, bld->one, a, "");
340 }
341
342
343 /**
344 * Generate a + b
345 */
346 LLVMValueRef
347 lp_build_add(struct lp_build_context *bld,
348 LLVMValueRef a,
349 LLVMValueRef b)
350 {
351 LLVMBuilderRef builder = bld->gallivm->builder;
352 const struct lp_type type = bld->type;
353 LLVMValueRef res;
354
355 assert(lp_check_value(type, a));
356 assert(lp_check_value(type, b));
357
358 if(a == bld->zero)
359 return b;
360 if(b == bld->zero)
361 return a;
362 if(a == bld->undef || b == bld->undef)
363 return bld->undef;
364
365 if(bld->type.norm) {
366 const char *intrinsic = NULL;
367
368 if(a == bld->one || b == bld->one)
369 return bld->one;
370
371 if (type.width * type.length == 128 &&
372 !type.floating && !type.fixed) {
373 if(util_cpu_caps.has_sse2) {
374 if(type.width == 8)
375 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
376 if(type.width == 16)
377 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
378 } else if (util_cpu_caps.has_altivec) {
379 if(type.width == 8)
380 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
381 if(type.width == 16)
382 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
383 }
384 }
385
386 if(intrinsic)
387 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
388 }
389
390 /* TODO: handle signed case */
391 if(type.norm && !type.floating && !type.fixed && !type.sign)
392 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b));
393
394 if(LLVMIsConstant(a) && LLVMIsConstant(b))
395 if (type.floating)
396 res = LLVMConstFAdd(a, b);
397 else
398 res = LLVMConstAdd(a, b);
399 else
400 if (type.floating)
401 res = LLVMBuildFAdd(builder, a, b, "");
402 else
403 res = LLVMBuildAdd(builder, a, b, "");
404
405 /* clamp to ceiling of 1.0 */
406 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
407 res = lp_build_min_simple(bld, res, bld->one);
408
409 /* XXX clamp to floor of -1 or 0??? */
410
411 return res;
412 }
413
414
415 /** Return the scalar sum of the elements of a.
416 * Callers should avoid this operation whenever possible.
417 */
418 LLVMValueRef
419 lp_build_horizontal_add(struct lp_build_context *bld,
420 LLVMValueRef a)
421 {
422 LLVMBuilderRef builder = bld->gallivm->builder;
423 const struct lp_type type = bld->type;
424 LLVMValueRef index, res;
425 unsigned i, length;
426 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
427 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
428 LLVMValueRef vecres, elem2;
429
430 assert(lp_check_value(type, a));
431
432 if (type.length == 1) {
433 return a;
434 }
435
436 assert(!bld->type.norm);
437
438 /*
439 * For byte vectors we could do much better with psadbw.
440 * Using repeated shuffle/adds here; a scalar reference sketch follows this
441 * function. Note with multiple vectors this can be done more efficiently
442 * as outlined in the Intel optimization manual.
443 * Note: could cause data rearrangement if used with smaller element
444 * sizes.
445 */
446
447 vecres = a;
448 length = type.length / 2;
449 while (length > 1) {
450 LLVMValueRef vec1, vec2;
451 for (i = 0; i < length; i++) {
452 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
453 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
454 }
455 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
456 LLVMConstVector(shuffles1, length), "");
457 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
458 LLVMConstVector(shuffles2, length), "");
459 if (type.floating) {
460 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
461 }
462 else {
463 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
464 }
465 length = length >> 1;
466 }
467
468 /* always have vector of size 2 here */
469 assert(length == 1);
470
471 index = lp_build_const_int32(bld->gallivm, 0);
472 res = LLVMBuildExtractElement(builder, vecres, index, "");
473 index = lp_build_const_int32(bld->gallivm, 1);
474 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
475
476 if (type.floating)
477 res = LLVMBuildFAdd(builder, res, elem2, "");
478 else
479 res = LLVMBuildAdd(builder, res, elem2, "");
480
481 return res;
482 }
483
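/*
 * Scalar reference for the reduction above (illustrative sketch only, not part
 * of the build): the shuffle/add loop plus the final two-element add repeatedly
 * fold the upper half of the vector onto the lower half until one element is
 * left.  The array form below assumes a power-of-two length, as the vector
 * path effectively does.
 */
#if 0
static float
horizontal_add_reference(float v[], unsigned length)
{
   unsigned half, i;
   for (half = length / 2; half >= 1; half /= 2) {
      for (i = 0; i < half; i++) {
         v[i] = v[i] + v[i + half];   /* vec1[i] + vec2[i] in the IR above */
      }
   }
   return v[0];
}
#endif
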
484 /**
485 * Return the horizontal sums of 4 float vectors as a float4 vector.
486 * This uses the technique outlined in the Intel Optimization Manual.
487 */
488 static LLVMValueRef
489 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
490 LLVMValueRef src[4])
491 {
492 struct gallivm_state *gallivm = bld->gallivm;
493 LLVMBuilderRef builder = gallivm->builder;
494 LLVMValueRef shuffles[4];
495 LLVMValueRef tmp[4];
496 LLVMValueRef sumtmp[2], shuftmp[2];
497
498 /* lower half of regs */
499 shuffles[0] = lp_build_const_int32(gallivm, 0);
500 shuffles[1] = lp_build_const_int32(gallivm, 1);
501 shuffles[2] = lp_build_const_int32(gallivm, 4);
502 shuffles[3] = lp_build_const_int32(gallivm, 5);
503 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
504 LLVMConstVector(shuffles, 4), "");
505 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
506 LLVMConstVector(shuffles, 4), "");
507
508 /* upper half of regs */
509 shuffles[0] = lp_build_const_int32(gallivm, 2);
510 shuffles[1] = lp_build_const_int32(gallivm, 3);
511 shuffles[2] = lp_build_const_int32(gallivm, 6);
512 shuffles[3] = lp_build_const_int32(gallivm, 7);
513 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
514 LLVMConstVector(shuffles, 4), "");
515 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
516 LLVMConstVector(shuffles, 4), "");
517
518 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
519 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
520
521 shuffles[0] = lp_build_const_int32(gallivm, 0);
522 shuffles[1] = lp_build_const_int32(gallivm, 2);
523 shuffles[2] = lp_build_const_int32(gallivm, 4);
524 shuffles[3] = lp_build_const_int32(gallivm, 6);
525 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
526 LLVMConstVector(shuffles, 4), "");
527
528 shuffles[0] = lp_build_const_int32(gallivm, 1);
529 shuffles[1] = lp_build_const_int32(gallivm, 3);
530 shuffles[2] = lp_build_const_int32(gallivm, 5);
531 shuffles[3] = lp_build_const_int32(gallivm, 7);
532 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
533 LLVMConstVector(shuffles, 4), "");
534
535 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
536 }
537
538
539 /*
540 * Partially horizontally add 2-4 float vectors with length n*4:
541 * only groups of four adjacent values within each vector are summed,
542 * assuming the values really are grouped in fours, which also determines
543 * the output order.
544 *
545 * Return a vector of the same length as the initial vectors,
546 * with the excess elements (if any) being undefined.
547 * The element order is independent of number of input vectors.
548 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
549 * the output order thus will be
550 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
551 */
552 LLVMValueRef
553 lp_build_hadd_partial4(struct lp_build_context *bld,
554 LLVMValueRef vectors[],
555 unsigned num_vecs)
556 {
557 struct gallivm_state *gallivm = bld->gallivm;
558 LLVMBuilderRef builder = gallivm->builder;
559 LLVMValueRef ret_vec;
560 LLVMValueRef tmp[4];
561 const char *intrinsic = NULL;
562
563 assert(num_vecs >= 2 && num_vecs <= 4);
564 assert(bld->type.floating);
565
566 /* Only use this with at least 2 vectors, as it is sort of expensive
567 * (depending on cpu); for a single vector we would need two horizontal
568 * adds anyway, so a plain shuffle/add approach might be better there.
569 */
570
571 tmp[0] = vectors[0];
572 tmp[1] = vectors[1];
573
574 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
575 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
576
577 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
578 bld->type.length == 4) {
579 intrinsic = "llvm.x86.sse3.hadd.ps";
580 }
581 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
582 bld->type.length == 8) {
583 intrinsic = "llvm.x86.avx.hadd.ps.256";
584 }
585 if (intrinsic) {
586 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
587 lp_build_vec_type(gallivm, bld->type),
588 tmp[0], tmp[1]);
589 if (num_vecs > 2) {
590 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
591 lp_build_vec_type(gallivm, bld->type),
592 tmp[2], tmp[3]);
593 }
594 else {
595 tmp[1] = tmp[0];
596 }
597 return lp_build_intrinsic_binary(builder, intrinsic,
598 lp_build_vec_type(gallivm, bld->type),
599 tmp[0], tmp[1]);
600 }
601
602 if (bld->type.length == 4) {
603 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
604 }
605 else {
606 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
607 unsigned j;
608 unsigned num_iter = bld->type.length / 4;
609 struct lp_type parttype = bld->type;
610 parttype.length = 4;
611 for (j = 0; j < num_iter; j++) {
612 LLVMValueRef partsrc[4];
613 unsigned i;
614 for (i = 0; i < 4; i++) {
615 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
616 }
617 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
618 }
619 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
620 }
621 return ret_vec;
622 }
623
624 /**
625 * Generate a - b
626 */
627 LLVMValueRef
628 lp_build_sub(struct lp_build_context *bld,
629 LLVMValueRef a,
630 LLVMValueRef b)
631 {
632 LLVMBuilderRef builder = bld->gallivm->builder;
633 const struct lp_type type = bld->type;
634 LLVMValueRef res;
635
636 assert(lp_check_value(type, a));
637 assert(lp_check_value(type, b));
638
639 if(b == bld->zero)
640 return a;
641 if(a == bld->undef || b == bld->undef)
642 return bld->undef;
643 if(a == b)
644 return bld->zero;
645
646 if(bld->type.norm) {
647 const char *intrinsic = NULL;
648
649 if(b == bld->one)
650 return bld->zero;
651
652 if (type.width * type.length == 128 &&
653 !type.floating && !type.fixed) {
654 if (util_cpu_caps.has_sse2) {
655 if(type.width == 8)
656 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
657 if(type.width == 16)
658 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
659 } else if (util_cpu_caps.has_altivec) {
660 if(type.width == 8)
661 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
662 if(type.width == 16)
663 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
664 }
665 }
666
667 if(intrinsic)
668 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
669 }
670
671 /* TODO: handle signed case */
672 if(type.norm && !type.floating && !type.fixed && !type.sign)
673 a = lp_build_max_simple(bld, a, b);
674
675 if(LLVMIsConstant(a) && LLVMIsConstant(b))
676 if (type.floating)
677 res = LLVMConstFSub(a, b);
678 else
679 res = LLVMConstSub(a, b);
680 else
681 if (type.floating)
682 res = LLVMBuildFSub(builder, a, b, "");
683 else
684 res = LLVMBuildSub(builder, a, b, "");
685
686 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
687 res = lp_build_max_simple(bld, res, bld->zero);
688
689 return res;
690 }
691
692
693
694 /**
695 * Normalized multiplication.
696 *
697 * There are several approaches for (using 8-bit normalized multiplication as
698 * an example):
699 *
700 * - alpha plus one
701 *
702 * makes the following approximation to the division (Sree)
703 *
704 * a*b/255 ~= (a*(b + 1)) >> 8
705 *
706 * which is the fastest method that satisfies the OpenGL criteria of
707 *
708 * 0*0 = 0 and 255*255 = 255
709 *
710 * - geometric series
711 *
712 * takes the geometric series approximation to the division
713 *
714 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
715 *
716 * in this case just the first two terms to fit in 16bit arithmetic
717 *
718 * t/255 ~= (t + (t >> 8)) >> 8
719 *
720 * note that just by itself it doesn't satisfy the OpenGL criteria, as
721 * 255*255 = 254, so the special case b = 255 must be accounted for, or roundoff
722 * must be used.
723 *
724 * - geometric series plus rounding
725 *
726 * when using the geometric series division, instead of truncating the result,
727 * use roundoff in the approximation (Jim Blinn)
728 *
729 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
730 *
731 * achieving exact results (see the scalar sketch following lp_build_mul_norm).
732 *
733 *
734 *
735 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
736 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
737 * @sa Michael Herf, The "double blend trick", May 2000,
738 * http://www.stereopsis.com/doubleblend.html
739 */
740 static LLVMValueRef
741 lp_build_mul_norm(struct gallivm_state *gallivm,
742 struct lp_type wide_type,
743 LLVMValueRef a, LLVMValueRef b)
744 {
745 LLVMBuilderRef builder = gallivm->builder;
746 struct lp_build_context bld;
747 unsigned n;
748 LLVMValueRef half;
749 LLVMValueRef ab;
750
751 assert(!wide_type.floating);
752 assert(lp_check_value(wide_type, a));
753 assert(lp_check_value(wide_type, b));
754
755 lp_build_context_init(&bld, gallivm, wide_type);
756
757 n = wide_type.width / 2;
758 if (wide_type.sign) {
759 --n;
760 }
761
762 /*
763 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
764 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
765 */
766
767 /*
768 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
769 */
770
771 ab = LLVMBuildMul(builder, a, b, "");
772 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
773
774 /*
775 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
776 */
777
778 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
779 if (wide_type.sign) {
780 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
781 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
782 half = lp_build_select(&bld, sign, minus_half, half);
783 }
784 ab = LLVMBuildAdd(builder, ab, half, "");
785
786 /* Final division */
787 ab = lp_build_shr_imm(&bld, ab, n);
788
789 return ab;
790 }
791
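/*
 * Scalar reference for the unsigned n = 8 case above (illustrative sketch only,
 * not part of the build): multiply, add the first geometric-series correction
 * term and the rounding bias, then shift, exactly as lp_build_mul_norm does in
 * the vector domain.  It reproduces the 0*0 = 0 and 255*255 = 255 end points
 * required by OpenGL.
 */
#if 0
static unsigned
mul_norm_u8_reference(unsigned a, unsigned b)
{
   unsigned t = a * b;          /* at most 255 * 255 = 65025 */
   t = t + (t >> 8);            /* first two terms of the geometric series */
   t = t + 0x80;                /* rounding bias (half = 1 << (n - 1)) */
   return t >> 8;               /* final division by 2^n */
}
#endif
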
792 /**
793 * Generate a * b
794 */
795 LLVMValueRef
796 lp_build_mul(struct lp_build_context *bld,
797 LLVMValueRef a,
798 LLVMValueRef b)
799 {
800 LLVMBuilderRef builder = bld->gallivm->builder;
801 const struct lp_type type = bld->type;
802 LLVMValueRef shift;
803 LLVMValueRef res;
804
805 assert(lp_check_value(type, a));
806 assert(lp_check_value(type, b));
807
808 if(a == bld->zero)
809 return bld->zero;
810 if(a == bld->one)
811 return b;
812 if(b == bld->zero)
813 return bld->zero;
814 if(b == bld->one)
815 return a;
816 if(a == bld->undef || b == bld->undef)
817 return bld->undef;
818
819 if (!type.floating && !type.fixed && type.norm) {
820 struct lp_type wide_type = lp_wider_type(type);
821 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
822
823 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
824 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
825
826 /* PMULLW, PSRLW, PADDW */
827 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
828 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
829
830 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
831
832 return ab;
833 }
834
835 if(type.fixed)
836 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
837 else
838 shift = NULL;
839
840 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
841 if (type.floating)
842 res = LLVMConstFMul(a, b);
843 else
844 res = LLVMConstMul(a, b);
845 if(shift) {
846 if(type.sign)
847 res = LLVMConstAShr(res, shift);
848 else
849 res = LLVMConstLShr(res, shift);
850 }
851 }
852 else {
853 if (type.floating)
854 res = LLVMBuildFMul(builder, a, b, "");
855 else
856 res = LLVMBuildMul(builder, a, b, "");
857 if(shift) {
858 if(type.sign)
859 res = LLVMBuildAShr(builder, res, shift, "");
860 else
861 res = LLVMBuildLShr(builder, res, shift, "");
862 }
863 }
864
865 return res;
866 }
867
868
869 /**
870 * Small vector x scale multiplication optimization.
871 */
872 LLVMValueRef
873 lp_build_mul_imm(struct lp_build_context *bld,
874 LLVMValueRef a,
875 int b)
876 {
877 LLVMBuilderRef builder = bld->gallivm->builder;
878 LLVMValueRef factor;
879
880 assert(lp_check_value(bld->type, a));
881
882 if(b == 0)
883 return bld->zero;
884
885 if(b == 1)
886 return a;
887
888 if(b == -1)
889 return lp_build_negate(bld, a);
890
891 if(b == 2 && bld->type.floating)
892 return lp_build_add(bld, a, a);
893
894 if(util_is_power_of_two(b)) {
895 unsigned shift = ffs(b) - 1;
896
897 if(bld->type.floating) {
898 #if 0
899 /*
900 * Power of two multiplication by directly manipulating the exponent.
901 *
902 * XXX: This might not always be faster; it will introduce a small error
903 * for multiplication by zero, and it will produce wrong results
904 * for Inf and NaN.
905 */
906 unsigned mantissa = lp_mantissa(bld->type);
907 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
908 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
909 a = LLVMBuildAdd(builder, a, factor, "");
910 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
911 return a;
912 #endif
913 }
914 else {
915 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
916 return LLVMBuildShl(builder, a, factor, "");
917 }
918 }
919
920 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
921 return lp_build_mul(bld, a, factor);
922 }
923
924
925 /**
926 * Generate a / b
927 */
928 LLVMValueRef
929 lp_build_div(struct lp_build_context *bld,
930 LLVMValueRef a,
931 LLVMValueRef b)
932 {
933 LLVMBuilderRef builder = bld->gallivm->builder;
934 const struct lp_type type = bld->type;
935
936 assert(lp_check_value(type, a));
937 assert(lp_check_value(type, b));
938
939 if(a == bld->zero)
940 return bld->zero;
941 if(a == bld->one)
942 return lp_build_rcp(bld, b);
943 if(b == bld->zero)
944 return bld->undef;
945 if(b == bld->one)
946 return a;
947 if(a == bld->undef || b == bld->undef)
948 return bld->undef;
949
950 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
951 if (type.floating)
952 return LLVMConstFDiv(a, b);
953 else if (type.sign)
954 return LLVMConstSDiv(a, b);
955 else
956 return LLVMConstUDiv(a, b);
957 }
958
959 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
960 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
961 type.floating)
962 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
963
964 if (type.floating)
965 return LLVMBuildFDiv(builder, a, b, "");
966 else if (type.sign)
967 return LLVMBuildSDiv(builder, a, b, "");
968 else
969 return LLVMBuildUDiv(builder, a, b, "");
970 }
971
972
973 /**
974 * Linear interpolation helper.
975 *
976 * @param flags LP_BLD_LERP_xxx flags; LP_BLD_LERP_WIDE_NORMALIZED means we are
977 * interpolating normalized values encoded in integers twice as wide.
978 *
979 * @sa http://www.stereopsis.com/doubleblend.html
980 */
981 static INLINE LLVMValueRef
982 lp_build_lerp_simple(struct lp_build_context *bld,
983 LLVMValueRef x,
984 LLVMValueRef v0,
985 LLVMValueRef v1,
986 unsigned flags)
987 {
988 unsigned half_width = bld->type.width/2;
989 LLVMBuilderRef builder = bld->gallivm->builder;
990 LLVMValueRef delta;
991 LLVMValueRef res;
992
993 assert(lp_check_value(bld->type, x));
994 assert(lp_check_value(bld->type, v0));
995 assert(lp_check_value(bld->type, v1));
996
997 delta = lp_build_sub(bld, v1, v0);
998
999 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1000 if (!bld->type.sign) {
1001 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1002 /*
1003 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1004 * most significant bit to the least significant bit, so that
1005 * later we can just divide by 2**n instead of 2**n - 1.
1006 */
1007
1008 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1009 }
1010
1011 /* (x * delta) >> n */
1012 res = lp_build_mul(bld, x, delta);
1013 res = lp_build_shr_imm(bld, res, half_width);
1014 } else {
1015 /*
1016 * The rescaling trick above doesn't work for signed numbers, so
1017 * use the 2**n - 1 division approximation in lp_build_mul_norm
1018 * instead.
1019 */
1020 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1021 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1022 }
1023 } else {
1024 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1025 res = lp_build_mul(bld, x, delta);
1026 }
1027
1028 res = lp_build_add(bld, v0, res);
1029
1030 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1031 bld->type.fixed) {
1032 /* We need to mask out the high order bits when lerping 8bit normalized colors stored in 16 bits */
1033 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
1034 * but it will be wrong for true fixed point use cases. Basically we need
1035 * a more powerful lp_type, capable of further distinguishing the values
1036 * interpretation from the value storage. */
1037 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1038 }
1039
1040 return res;
1041 }
1042
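/*
 * Scalar reference for the LP_BLD_LERP_WIDE_NORMALIZED path above, for unsigned
 * 8-bit values held in 16-bit lanes (illustrative sketch only, not part of the
 * build).  The weight is rescaled from [0, 255] to [0, 256] by folding its MSB
 * back into the LSB, so the final division can be a plain shift, and the
 * closing mask recovers the 8-bit result even when v1 - v0 wrapped around.
 */
#if 0
static unsigned
lerp_u8_reference(unsigned x, unsigned v0, unsigned v1)
{
   unsigned delta = (v1 - v0) & 0xffff;       /* 16-bit lane, may wrap */
   x = x + (x >> 7);                          /* [0, 255] -> [0, 256] */
   return (v0 + (((x * delta) & 0xffff) >> 8)) & 0xff;
}
#endif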
1043
1044 /**
1045 * Linear interpolation.
1046 */
1047 LLVMValueRef
1048 lp_build_lerp(struct lp_build_context *bld,
1049 LLVMValueRef x,
1050 LLVMValueRef v0,
1051 LLVMValueRef v1,
1052 unsigned flags)
1053 {
1054 const struct lp_type type = bld->type;
1055 LLVMValueRef res;
1056
1057 assert(lp_check_value(type, x));
1058 assert(lp_check_value(type, v0));
1059 assert(lp_check_value(type, v1));
1060
1061 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1062
1063 if (type.norm) {
1064 struct lp_type wide_type;
1065 struct lp_build_context wide_bld;
1066 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1067
1068 assert(type.length >= 2);
1069
1070 /*
1071 * Create a wider integer type, enough to hold the
1072 * intermediate result of the multiplication.
1073 */
1074 memset(&wide_type, 0, sizeof wide_type);
1075 wide_type.sign = type.sign;
1076 wide_type.width = type.width*2;
1077 wide_type.length = type.length/2;
1078
1079 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1080
1081 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1082 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1083 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1084
1085 /*
1086 * Lerp both halves.
1087 */
1088
1089 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1090
1091 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1092 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1093
1094 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1095 } else {
1096 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1097 }
1098
1099 return res;
1100 }
1101
1102
1103 /**
1104 * Bilinear interpolation.
1105 *
1106 * Value indices are in v_{yx}.
1107 */
1108 LLVMValueRef
1109 lp_build_lerp_2d(struct lp_build_context *bld,
1110 LLVMValueRef x,
1111 LLVMValueRef y,
1112 LLVMValueRef v00,
1113 LLVMValueRef v01,
1114 LLVMValueRef v10,
1115 LLVMValueRef v11,
1116 unsigned flags)
1117 {
1118 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1119 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1120 return lp_build_lerp(bld, y, v0, v1, flags);
1121 }
1122
1123
1124 LLVMValueRef
1125 lp_build_lerp_3d(struct lp_build_context *bld,
1126 LLVMValueRef x,
1127 LLVMValueRef y,
1128 LLVMValueRef z,
1129 LLVMValueRef v000,
1130 LLVMValueRef v001,
1131 LLVMValueRef v010,
1132 LLVMValueRef v011,
1133 LLVMValueRef v100,
1134 LLVMValueRef v101,
1135 LLVMValueRef v110,
1136 LLVMValueRef v111,
1137 unsigned flags)
1138 {
1139 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1140 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1141 return lp_build_lerp(bld, z, v0, v1, flags);
1142 }
1143
1144
1145 /**
1146 * Generate min(a, b)
1147 * Do checks for special cases.
1148 */
1149 LLVMValueRef
1150 lp_build_min(struct lp_build_context *bld,
1151 LLVMValueRef a,
1152 LLVMValueRef b)
1153 {
1154 assert(lp_check_value(bld->type, a));
1155 assert(lp_check_value(bld->type, b));
1156
1157 if(a == bld->undef || b == bld->undef)
1158 return bld->undef;
1159
1160 if(a == b)
1161 return a;
1162
1163 if (bld->type.norm) {
1164 if (!bld->type.sign) {
1165 if (a == bld->zero || b == bld->zero) {
1166 return bld->zero;
1167 }
1168 }
1169 if(a == bld->one)
1170 return b;
1171 if(b == bld->one)
1172 return a;
1173 }
1174
1175 return lp_build_min_simple(bld, a, b);
1176 }
1177
1178
1179 /**
1180 * Generate max(a, b)
1181 * Do checks for special cases.
1182 */
1183 LLVMValueRef
1184 lp_build_max(struct lp_build_context *bld,
1185 LLVMValueRef a,
1186 LLVMValueRef b)
1187 {
1188 assert(lp_check_value(bld->type, a));
1189 assert(lp_check_value(bld->type, b));
1190
1191 if(a == bld->undef || b == bld->undef)
1192 return bld->undef;
1193
1194 if(a == b)
1195 return a;
1196
1197 if(bld->type.norm) {
1198 if(a == bld->one || b == bld->one)
1199 return bld->one;
1200 if (!bld->type.sign) {
1201 if (a == bld->zero) {
1202 return b;
1203 }
1204 if (b == bld->zero) {
1205 return a;
1206 }
1207 }
1208 }
1209
1210 return lp_build_max_simple(bld, a, b);
1211 }
1212
1213
1214 /**
1215 * Generate clamp(a, min, max)
1216 * Do checks for special cases.
1217 */
1218 LLVMValueRef
1219 lp_build_clamp(struct lp_build_context *bld,
1220 LLVMValueRef a,
1221 LLVMValueRef min,
1222 LLVMValueRef max)
1223 {
1224 assert(lp_check_value(bld->type, a));
1225 assert(lp_check_value(bld->type, min));
1226 assert(lp_check_value(bld->type, max));
1227
1228 a = lp_build_min(bld, a, max);
1229 a = lp_build_max(bld, a, min);
1230 return a;
1231 }
1232
1233
1234 /**
1235 * Generate abs(a)
1236 */
1237 LLVMValueRef
1238 lp_build_abs(struct lp_build_context *bld,
1239 LLVMValueRef a)
1240 {
1241 LLVMBuilderRef builder = bld->gallivm->builder;
1242 const struct lp_type type = bld->type;
1243 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1244
1245 assert(lp_check_value(type, a));
1246
1247 if(!type.sign)
1248 return a;
1249
1250 if(type.floating) {
1251 /* Mask out the sign bit */
1252 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1253 unsigned long long absMask = ~(1ULL << (type.width - 1));
1254 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1255 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1256 a = LLVMBuildAnd(builder, a, mask, "");
1257 a = LLVMBuildBitCast(builder, a, vec_type, "");
1258 return a;
1259 }
1260
1261 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1262 switch(type.width) {
1263 case 8:
1264 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1265 case 16:
1266 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1267 case 32:
1268 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1269 }
1270 }
1271 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1272 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1273 (type.width == 8 || type.width == 16 || type.width == 32)) {
1274 debug_printf("%s: inefficient code, should split vectors manually\n",
1275 __FUNCTION__);
1276 }
1277
1278 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1279 }
1280
1281
1282 LLVMValueRef
1283 lp_build_negate(struct lp_build_context *bld,
1284 LLVMValueRef a)
1285 {
1286 LLVMBuilderRef builder = bld->gallivm->builder;
1287
1288 assert(lp_check_value(bld->type, a));
1289
1290 #if HAVE_LLVM >= 0x0207
1291 if (bld->type.floating)
1292 a = LLVMBuildFNeg(builder, a, "");
1293 else
1294 #endif
1295 a = LLVMBuildNeg(builder, a, "");
1296
1297 return a;
1298 }
1299
1300
1301 /** Return -1, 0 or +1 depending on the sign of a */
1302 LLVMValueRef
1303 lp_build_sgn(struct lp_build_context *bld,
1304 LLVMValueRef a)
1305 {
1306 LLVMBuilderRef builder = bld->gallivm->builder;
1307 const struct lp_type type = bld->type;
1308 LLVMValueRef cond;
1309 LLVMValueRef res;
1310
1311 assert(lp_check_value(type, a));
1312
1313 /* Handle non-zero case */
1314 if(!type.sign) {
1315 /* if not zero then sign must be positive */
1316 res = bld->one;
1317 }
1318 else if(type.floating) {
1319 LLVMTypeRef vec_type;
1320 LLVMTypeRef int_type;
1321 LLVMValueRef mask;
1322 LLVMValueRef sign;
1323 LLVMValueRef one;
1324 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1325
1326 int_type = lp_build_int_vec_type(bld->gallivm, type);
1327 vec_type = lp_build_vec_type(bld->gallivm, type);
1328 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1329
1330 /* Take the sign bit and add it to 1 constant */
1331 sign = LLVMBuildBitCast(builder, a, int_type, "");
1332 sign = LLVMBuildAnd(builder, sign, mask, "");
1333 one = LLVMConstBitCast(bld->one, int_type);
1334 res = LLVMBuildOr(builder, sign, one, "");
1335 res = LLVMBuildBitCast(builder, res, vec_type, "");
1336 }
1337 else
1338 {
1339 /* signed int/norm/fixed point */
1340 /* could use psign with sse3 and appropriate vectors here */
1341 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1342 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1343 res = lp_build_select(bld, cond, bld->one, minus_one);
1344 }
1345
1346 /* Handle zero */
1347 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1348 res = lp_build_select(bld, cond, bld->zero, res);
1349
1350 return res;
1351 }
1352
1353
1354 /**
1355 * Set the sign of float vector 'a' according to 'sign'.
1356 * If sign==0, return abs(a).
1357 * If sign==1, return -abs(a).
1358 * Other values for sign produce undefined results.
1359 */
1360 LLVMValueRef
1361 lp_build_set_sign(struct lp_build_context *bld,
1362 LLVMValueRef a, LLVMValueRef sign)
1363 {
1364 LLVMBuilderRef builder = bld->gallivm->builder;
1365 const struct lp_type type = bld->type;
1366 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1367 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1368 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1369 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1370 ~((unsigned long long) 1 << (type.width - 1)));
1371 LLVMValueRef val, res;
1372
1373 assert(type.floating);
1374 assert(lp_check_value(type, a));
1375
1376 /* val = reinterpret_cast<int>(a) */
1377 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1378 /* val = val & mask */
1379 val = LLVMBuildAnd(builder, val, mask, "");
1380 /* sign = sign << shift */
1381 sign = LLVMBuildShl(builder, sign, shift, "");
1382 /* res = val | sign */
1383 res = LLVMBuildOr(builder, val, sign, "");
1384 /* res = reinterpret_cast<float>(res) */
1385 res = LLVMBuildBitCast(builder, res, vec_type, "");
1386
1387 return res;
1388 }
1389
1390
1391 /**
1392 * Convert vector of (or scalar) int to vector of (or scalar) float.
1393 */
1394 LLVMValueRef
1395 lp_build_int_to_float(struct lp_build_context *bld,
1396 LLVMValueRef a)
1397 {
1398 LLVMBuilderRef builder = bld->gallivm->builder;
1399 const struct lp_type type = bld->type;
1400 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1401
1402 assert(type.floating);
1403
1404 return LLVMBuildSIToFP(builder, a, vec_type, "");
1405 }
1406
1407 static boolean
1408 arch_rounding_available(const struct lp_type type)
1409 {
1410 if ((util_cpu_caps.has_sse4_1 &&
1411 (type.length == 1 || type.width*type.length == 128)) ||
1412 (util_cpu_caps.has_avx && type.width*type.length == 256))
1413 return TRUE;
1414 else if ((util_cpu_caps.has_altivec &&
1415 (type.width == 32 && type.length == 4)))
1416 return TRUE;
1417
1418 return FALSE;
1419 }
1420
1421 enum lp_build_round_mode
1422 {
1423 LP_BUILD_ROUND_NEAREST = 0,
1424 LP_BUILD_ROUND_FLOOR = 1,
1425 LP_BUILD_ROUND_CEIL = 2,
1426 LP_BUILD_ROUND_TRUNCATE = 3
1427 };
1428
1429 /**
1430 * Helper for SSE4.1's ROUNDxx instructions.
1431 *
1432 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1433 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0
 * (see the round-to-even examples after this function).
1434 */
1435 static INLINE LLVMValueRef
1436 lp_build_round_sse41(struct lp_build_context *bld,
1437 LLVMValueRef a,
1438 enum lp_build_round_mode mode)
1439 {
1440 LLVMBuilderRef builder = bld->gallivm->builder;
1441 const struct lp_type type = bld->type;
1442 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1443 const char *intrinsic;
1444 LLVMValueRef res;
1445
1446 assert(type.floating);
1447
1448 assert(lp_check_value(type, a));
1449 assert(util_cpu_caps.has_sse4_1);
1450
1451 if (type.length == 1) {
1452 LLVMTypeRef vec_type;
1453 LLVMValueRef undef;
1454 LLVMValueRef args[3];
1455 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1456
1457 switch(type.width) {
1458 case 32:
1459 intrinsic = "llvm.x86.sse41.round.ss";
1460 break;
1461 case 64:
1462 intrinsic = "llvm.x86.sse41.round.sd";
1463 break;
1464 default:
1465 assert(0);
1466 return bld->undef;
1467 }
1468
1469 vec_type = LLVMVectorType(bld->elem_type, 4);
1470
1471 undef = LLVMGetUndef(vec_type);
1472
1473 args[0] = undef;
1474 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1475 args[2] = LLVMConstInt(i32t, mode, 0);
1476
1477 res = lp_build_intrinsic(builder, intrinsic,
1478 vec_type, args, Elements(args));
1479
1480 res = LLVMBuildExtractElement(builder, res, index0, "");
1481 }
1482 else {
1483 if (type.width * type.length == 128) {
1484 switch(type.width) {
1485 case 32:
1486 intrinsic = "llvm.x86.sse41.round.ps";
1487 break;
1488 case 64:
1489 intrinsic = "llvm.x86.sse41.round.pd";
1490 break;
1491 default:
1492 assert(0);
1493 return bld->undef;
1494 }
1495 }
1496 else {
1497 assert(type.width * type.length == 256);
1498 assert(util_cpu_caps.has_avx);
1499
1500 switch(type.width) {
1501 case 32:
1502 intrinsic = "llvm.x86.avx.round.ps.256";
1503 break;
1504 case 64:
1505 intrinsic = "llvm.x86.avx.round.pd.256";
1506 break;
1507 default:
1508 assert(0);
1509 return bld->undef;
1510 }
1511 }
1512
1513 res = lp_build_intrinsic_binary(builder, intrinsic,
1514 bld->vec_type, a,
1515 LLVMConstInt(i32t, mode, 0));
1516 }
1517
1518 return res;
1519 }
1520
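/*
 * Illustration of the round-to-even note above (sketch only, not part of the
 * build): mode LP_BUILD_ROUND_NEAREST behaves like C99 rint()/nearbyint()
 * under the default rounding mode, not like round(), which breaks ties away
 * from zero.
 */
#if 0
#include <math.h>
#include <assert.h>
static void
round_nearest_even_examples(void)
{
   assert(rint(2.5) == 2.0);    /* tie resolved toward the even value */
   assert(rint(3.5) == 4.0);    /* the even neighbour may be the larger one */
   assert(round(2.5) == 3.0);   /* round() breaks ties away from zero instead */
}
#endif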
1521
1522 static INLINE LLVMValueRef
1523 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1524 LLVMValueRef a)
1525 {
1526 LLVMBuilderRef builder = bld->gallivm->builder;
1527 const struct lp_type type = bld->type;
1528 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1529 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1530 const char *intrinsic;
1531 LLVMValueRef res;
1532
1533 assert(type.floating);
1534 /* using the double precision conversions is a bit more complicated */
1535 assert(type.width == 32);
1536
1537 assert(lp_check_value(type, a));
1538 assert(util_cpu_caps.has_sse2);
1539
1540 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1541 if (type.length == 1) {
1542 LLVMTypeRef vec_type;
1543 LLVMValueRef undef;
1544 LLVMValueRef arg;
1545 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1546
1547 vec_type = LLVMVectorType(bld->elem_type, 4);
1548
1549 intrinsic = "llvm.x86.sse.cvtss2si";
1550
1551 undef = LLVMGetUndef(vec_type);
1552
1553 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1554
1555 res = lp_build_intrinsic_unary(builder, intrinsic,
1556 ret_type, arg);
1557 }
1558 else {
1559 if (type.width* type.length == 128) {
1560 intrinsic = "llvm.x86.sse2.cvtps2dq";
1561 }
1562 else {
1563 assert(type.width*type.length == 256);
1564 assert(util_cpu_caps.has_avx);
1565
1566 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1567 }
1568 res = lp_build_intrinsic_unary(builder, intrinsic,
1569 ret_type, a);
1570 }
1571
1572 return res;
1573 }
1574
1575
1576 /* Helper for AltiVec's vrfi* rounding instructions. */
1578 static INLINE LLVMValueRef
1579 lp_build_round_altivec(struct lp_build_context *bld,
1580 LLVMValueRef a,
1581 enum lp_build_round_mode mode)
1582 {
1583 LLVMBuilderRef builder = bld->gallivm->builder;
1584 const struct lp_type type = bld->type;
1585 const char *intrinsic = NULL;
1586
1587 assert(type.floating);
1588
1589 assert(lp_check_value(type, a));
1590 assert(util_cpu_caps.has_altivec);
1591
1592 switch (mode) {
1593 case LP_BUILD_ROUND_NEAREST:
1594 intrinsic = "llvm.ppc.altivec.vrfin";
1595 break;
1596 case LP_BUILD_ROUND_FLOOR:
1597 intrinsic = "llvm.ppc.altivec.vrfim";
1598 break;
1599 case LP_BUILD_ROUND_CEIL:
1600 intrinsic = "llvm.ppc.altivec.vrfip";
1601 break;
1602 case LP_BUILD_ROUND_TRUNCATE:
1603 intrinsic = "llvm.ppc.altivec.vrfiz";
1604 break;
1605 }
1606
1607 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1608 }
1609
1610 static INLINE LLVMValueRef
1611 lp_build_round_arch(struct lp_build_context *bld,
1612 LLVMValueRef a,
1613 enum lp_build_round_mode mode)
1614 {
1615 if (util_cpu_caps.has_sse4_1)
1616 return lp_build_round_sse41(bld, a, mode);
1617 else /* (util_cpu_caps.has_altivec) */
1618 return lp_build_round_altivec(bld, a, mode);
1619 }
1620
1621 /**
1622 * Return the integer part of a float (vector) value (== round toward zero).
1623 * The returned value is a float (vector).
1624 * Ex: trunc(-1.5) = -1.0
1625 */
1626 LLVMValueRef
1627 lp_build_trunc(struct lp_build_context *bld,
1628 LLVMValueRef a)
1629 {
1630 LLVMBuilderRef builder = bld->gallivm->builder;
1631 const struct lp_type type = bld->type;
1632
1633 assert(type.floating);
1634 assert(lp_check_value(type, a));
1635
1636 if (arch_rounding_available(type)) {
1637 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1638 }
1639 else {
1640 const struct lp_type type = bld->type;
1641 struct lp_type inttype;
1642 struct lp_build_context intbld;
1643 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1644 LLVMValueRef trunc, res, anosign, mask;
1645 LLVMTypeRef int_vec_type = bld->int_vec_type;
1646 LLVMTypeRef vec_type = bld->vec_type;
1647
1648 assert(type.width == 32); /* might want to handle doubles at some point */
1649
1650 inttype = type;
1651 inttype.floating = 0;
1652 lp_build_context_init(&intbld, bld->gallivm, inttype);
1653
1654 /* round by truncation */
1655 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1656 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1657
1658 /* mask out sign bit */
1659 anosign = lp_build_abs(bld, a);
1660 /*
1661 * mask out all values if anosign > 2^24
1662 * This should work both for large ints (all rounding is no-op for them
1663 * because such floats are always exact) as well as special cases like
1664 * NaNs, Infs (taking advantage of the fact they use max exponent).
1665 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1666 */
1667 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1668 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1669 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1670 return lp_build_select(bld, mask, a, res);
1671 }
1672 }
1673
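/*
 * Scalar reference for the fallback path above (illustrative sketch only, not
 * part of the build): truncate through the integer domain, but keep the
 * original value whenever |a| > 2^24.  Such floats are already integral, and
 * because the compare is done on the (sign-masked) float bits as integers,
 * NaN and Inf also compare greater and are passed through unchanged.
 */
#if 0
static float
trunc_reference(float a)
{
   union { float f; int i; } v, lim;  /* assumes 32-bit int */
   v.f = a;
   v.i &= 0x7fffffff;                 /* |a| as raw bits, sign masked off */
   lim.f = 16777216.0f;               /* 2^24 */
   if (v.i > lim.i)
      return a;                       /* already integral, or NaN/Inf */
   return (float)(int)a;              /* round toward zero */
}
#endif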
1674
1675 /**
1676 * Return float (vector) rounded to nearest integer (vector). The returned
1677 * value is a float (vector).
1678 * Ex: round(0.9) = 1.0
1679 * Ex: round(-1.5) = -2.0
1680 */
1681 LLVMValueRef
1682 lp_build_round(struct lp_build_context *bld,
1683 LLVMValueRef a)
1684 {
1685 LLVMBuilderRef builder = bld->gallivm->builder;
1686 const struct lp_type type = bld->type;
1687
1688 assert(type.floating);
1689 assert(lp_check_value(type, a));
1690
1691 if (arch_rounding_available(type)) {
1692 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1693 }
1694 else {
1695 const struct lp_type type = bld->type;
1696 struct lp_type inttype;
1697 struct lp_build_context intbld;
1698 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1699 LLVMValueRef res, anosign, mask;
1700 LLVMTypeRef int_vec_type = bld->int_vec_type;
1701 LLVMTypeRef vec_type = bld->vec_type;
1702
1703 assert(type.width == 32); /* might want to handle doubles at some point */
1704
1705 inttype = type;
1706 inttype.floating = 0;
1707 lp_build_context_init(&intbld, bld->gallivm, inttype);
1708
1709 res = lp_build_iround(bld, a);
1710 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1711
1712 /* mask out sign bit */
1713 anosign = lp_build_abs(bld, a);
1714 /*
1715 * mask out all values if anosign > 2^24
1716 * This should work both for large ints (all rounding is no-op for them
1717 * because such floats are always exact) as well as special cases like
1718 * NaNs, Infs (taking advantage of the fact they use max exponent).
1719 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1720 */
1721 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1722 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1723 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1724 return lp_build_select(bld, mask, a, res);
1725 }
1726 }
1727
1728
1729 /**
1730 * Return floor of float (vector), result is a float (vector)
1731 * Ex: floor(1.1) = 1.0
1732 * Ex: floor(-1.1) = -2.0
1733 */
1734 LLVMValueRef
1735 lp_build_floor(struct lp_build_context *bld,
1736 LLVMValueRef a)
1737 {
1738 LLVMBuilderRef builder = bld->gallivm->builder;
1739 const struct lp_type type = bld->type;
1740
1741 assert(type.floating);
1742 assert(lp_check_value(type, a));
1743
1744 if (arch_rounding_available(type)) {
1745 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1746 }
1747 else {
1748 const struct lp_type type = bld->type;
1749 struct lp_type inttype;
1750 struct lp_build_context intbld;
1751 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1752 LLVMValueRef trunc, res, anosign, mask;
1753 LLVMTypeRef int_vec_type = bld->int_vec_type;
1754 LLVMTypeRef vec_type = bld->vec_type;
1755
1756 assert(type.width == 32); /* might want to handle doubles at some point */
1757
1758 inttype = type;
1759 inttype.floating = 0;
1760 lp_build_context_init(&intbld, bld->gallivm, inttype);
1761
1762 /* round by truncation */
1763 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1764 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1765
1766 if (type.sign) {
1767 LLVMValueRef tmp;
1768
1769 /*
1770 * fix values if rounding is wrong (for non-special cases)
1771 * - this is the case if trunc > a
1772 */
1773 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1774 /* tmp = trunc > a ? 1.0 : 0.0 */
1775 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1776 tmp = lp_build_and(&intbld, mask, tmp);
1777 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1778 res = lp_build_sub(bld, res, tmp);
1779 }
1780
1781 /* mask out sign bit */
1782 anosign = lp_build_abs(bld, a);
1783 /*
1784 * mask out all values if anosign > 2^24
1785 * This should work both for large ints (all rounding is no-op for them
1786 * because such floats are always exact) as well as special cases like
1787 * NaNs, Infs (taking advantage of the fact they use max exponent).
1788 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1789 */
1790 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1791 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1792 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1793 return lp_build_select(bld, mask, a, res);
1794 }
1795 }
1796
1797
1798 /**
1799 * Return ceiling of float (vector), returning float (vector).
1800 * Ex: ceil( 1.1) = 2.0
1801 * Ex: ceil(-1.1) = -1.0
1802 */
1803 LLVMValueRef
1804 lp_build_ceil(struct lp_build_context *bld,
1805 LLVMValueRef a)
1806 {
1807 LLVMBuilderRef builder = bld->gallivm->builder;
1808 const struct lp_type type = bld->type;
1809
1810 assert(type.floating);
1811 assert(lp_check_value(type, a));
1812
1813 if (arch_rounding_available(type)) {
1814 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1815 }
1816 else {
1817 const struct lp_type type = bld->type;
1818 struct lp_type inttype;
1819 struct lp_build_context intbld;
1820 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1821 LLVMValueRef trunc, res, anosign, mask, tmp;
1822 LLVMTypeRef int_vec_type = bld->int_vec_type;
1823 LLVMTypeRef vec_type = bld->vec_type;
1824
1825 assert(type.width == 32); /* might want to handle doubles at some point */
1826
1827 inttype = type;
1828 inttype.floating = 0;
1829 lp_build_context_init(&intbld, bld->gallivm, inttype);
1830
1831 /* round by truncation */
1832 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1833 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
1834
1835 /*
1836 * fix values if rounding is wrong (for non-special cases)
1837 * - this is the case if trunc < a
1838 */
1839 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
1840 /* tmp = trunc < a ? 1.0 : 0.0 */
1841 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1842 tmp = lp_build_and(&intbld, mask, tmp);
1843 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1844 res = lp_build_add(bld, trunc, tmp);
1845
1846 /* mask out sign bit */
1847 anosign = lp_build_abs(bld, a);
1848 /*
1849 * mask out all values if anosign > 2^24
1850 * This should work both for large ints (all rounding is no-op for them
1851 * because such floats are always exact) as well as special cases like
1852 * NaNs, Infs (taking advantage of the fact they use max exponent).
1853 * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
1854 */
1855 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1856 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1857 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1858 return lp_build_select(bld, mask, a, res);
1859 }
1860 }
1861
1862
1863 /**
1864 * Return fractional part of 'a' computed as a - floor(a)
1865 * Typically used in texture coord arithmetic.
1866 */
1867 LLVMValueRef
1868 lp_build_fract(struct lp_build_context *bld,
1869 LLVMValueRef a)
1870 {
1871 assert(bld->type.floating);
1872 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1873 }
1874
1875
1876 /**
1877 * Prevent returning a fractional part of 1.0 for very small negative values of
1878 * 'a' by clamping against 0.99999(9).
1879 */
1880 static inline LLVMValueRef
1881 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
1882 {
1883 LLVMValueRef max;
1884
1885 /* this is the largest number smaller than 1.0 representable as float */
1886 max = lp_build_const_vec(bld->gallivm, bld->type,
1887 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
1888 return lp_build_min(bld, fract, max);
1889 }
1890
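/*
 * The clamp constant above, spelled out for 32-bit floats (illustrative sketch
 * only, not part of the build): with a 23-bit mantissa the largest
 * representable value below 1.0 is 1.0 - 2^-24, i.e. 0x3f7fffff as raw bits.
 */
#if 0
static float
largest_float_below_one(void)
{
   return 1.0f - 1.0f / (float)(1 << 24);   /* == nextafterf(1.0f, 0.0f) */
}
#endif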
1891
1892 /**
1893 * Same as lp_build_fract, but guarantees that the result is always smaller
1894 * than one.
1895 */
1896 LLVMValueRef
1897 lp_build_fract_safe(struct lp_build_context *bld,
1898 LLVMValueRef a)
1899 {
1900 return clamp_fract(bld, lp_build_fract(bld, a));
1901 }
1902
1903
1904 /**
1905 * Return the integer part of a float (vector) value (== round toward zero).
1906 * The returned value is an integer (vector).
1907 * Ex: itrunc(-1.5) = -1
1908 */
1909 LLVMValueRef
1910 lp_build_itrunc(struct lp_build_context *bld,
1911 LLVMValueRef a)
1912 {
1913 LLVMBuilderRef builder = bld->gallivm->builder;
1914 const struct lp_type type = bld->type;
1915 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1916
1917 assert(type.floating);
1918 assert(lp_check_value(type, a));
1919
1920 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1921 }
1922
1923
1924 /**
1925 * Return float (vector) rounded to nearest integer (vector). The returned
1926 * value is an integer (vector).
1927 * Ex: iround(0.9) = 1
1928 * Ex: iround(-1.5) = -2
1929 */
1930 LLVMValueRef
1931 lp_build_iround(struct lp_build_context *bld,
1932 LLVMValueRef a)
1933 {
1934 LLVMBuilderRef builder = bld->gallivm->builder;
1935 const struct lp_type type = bld->type;
1936 LLVMTypeRef int_vec_type = bld->int_vec_type;
1937 LLVMValueRef res;
1938
1939 assert(type.floating);
1940
1941 assert(lp_check_value(type, a));
1942
1943 if ((util_cpu_caps.has_sse2 &&
1944 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
1945 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1946 return lp_build_iround_nearest_sse2(bld, a);
1947 }
1948 if (arch_rounding_available(type)) {
1949 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1950 }
1951 else {
1952 LLVMValueRef half;
1953
1954 half = lp_build_const_vec(bld->gallivm, type, 0.5);
1955
1956 if (type.sign) {
1957 LLVMTypeRef vec_type = bld->vec_type;
1958 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1959 (unsigned long long)1 << (type.width - 1));
1960 LLVMValueRef sign;
1961
1962 /* get sign bit */
1963 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1964 sign = LLVMBuildAnd(builder, sign, mask, "");
1965
1966 /* sign * 0.5 */
1967 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1968 half = LLVMBuildOr(builder, sign, half, "");
1969 half = LLVMBuildBitCast(builder, half, vec_type, "");
1970 }
1971
1972 res = LLVMBuildFAdd(builder, a, half, "");
1973 }
1974
1975 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1976
1977 return res;
1978 }
1979
1980
1981 /**
1982 * Return floor of float (vector), result is an int (vector)
1983  * Ex: ifloor(1.1) = 1
1984  * Ex: ifloor(-1.1) = -2
1985 */
1986 LLVMValueRef
1987 lp_build_ifloor(struct lp_build_context *bld,
1988 LLVMValueRef a)
1989 {
1990 LLVMBuilderRef builder = bld->gallivm->builder;
1991 const struct lp_type type = bld->type;
1992 LLVMTypeRef int_vec_type = bld->int_vec_type;
1993 LLVMValueRef res;
1994
1995 assert(type.floating);
1996 assert(lp_check_value(type, a));
1997
1998 res = a;
1999 if (type.sign) {
2000 if (arch_rounding_available(type)) {
2001 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2002 }
2003 else {
2004 struct lp_type inttype;
2005 struct lp_build_context intbld;
2006 LLVMValueRef trunc, itrunc, mask;
2007
2008 assert(type.floating);
2009 assert(lp_check_value(type, a));
2010
2011 inttype = type;
2012 inttype.floating = 0;
2013 lp_build_context_init(&intbld, bld->gallivm, inttype);
2014
2015 /* round by truncation */
2016 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2017 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2018
2019 /*
2020 * fix values if rounding is wrong (for non-special cases)
2021 * - this is the case if trunc > a
2022 * The results of doing this with NaNs, very large values etc.
2023 * are undefined but this seems to be the case anyway.
2024 */
2025 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2026 /* cheapie minus one with mask since the mask is minus one / zero */
2027 return lp_build_add(&intbld, itrunc, mask);
2028 }
2029 }
2030
2031    /* convert to int; truncation towards zero is correct here */
2032 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2033
2034 return res;
2035 }
2036
2037
2038 /**
2039  * Return ceiling of float (vector), result is an int (vector).
2040 * Ex: iceil( 1.1) = 2
2041 * Ex: iceil(-1.1) = -1
2042 */
2043 LLVMValueRef
2044 lp_build_iceil(struct lp_build_context *bld,
2045 LLVMValueRef a)
2046 {
2047 LLVMBuilderRef builder = bld->gallivm->builder;
2048 const struct lp_type type = bld->type;
2049 LLVMTypeRef int_vec_type = bld->int_vec_type;
2050 LLVMValueRef res;
2051
2052 assert(type.floating);
2053 assert(lp_check_value(type, a));
2054
2055 if (arch_rounding_available(type)) {
2056 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2057 }
2058 else {
2059 struct lp_type inttype;
2060 struct lp_build_context intbld;
2061 LLVMValueRef trunc, itrunc, mask;
2062
2063 assert(type.floating);
2064 assert(lp_check_value(type, a));
2065
2066 inttype = type;
2067 inttype.floating = 0;
2068 lp_build_context_init(&intbld, bld->gallivm, inttype);
2069
2070 /* round by truncation */
2071 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2072 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2073
2074 /*
2075 * fix values if rounding is wrong (for non-special cases)
2076 * - this is the case if trunc < a
2077 * The results of doing this with NaNs, very large values etc.
2078 * are undefined but this seems to be the case anyway.
2079 */
2080 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2081 /* cheapie plus one with mask since the mask is minus one / zero */
2082 return lp_build_sub(&intbld, itrunc, mask);
2083 }
2084
2085    /* convert to int; res is already integral so truncation is exact */
2086 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2087
2088 return res;
2089 }
2090
2091
2092 /**
2093 * Combined ifloor() & fract().
2094 *
2095 * Preferred to calling the functions separately, as it will ensure that the
2096 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2097 */
2098 void
2099 lp_build_ifloor_fract(struct lp_build_context *bld,
2100 LLVMValueRef a,
2101 LLVMValueRef *out_ipart,
2102 LLVMValueRef *out_fpart)
2103 {
2104 LLVMBuilderRef builder = bld->gallivm->builder;
2105 const struct lp_type type = bld->type;
2106 LLVMValueRef ipart;
2107
2108 assert(type.floating);
2109 assert(lp_check_value(type, a));
2110
2111 if (arch_rounding_available(type)) {
2112 /*
2113 * floor() is easier.
2114 */
2115
2116 ipart = lp_build_floor(bld, a);
2117 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2118 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2119 }
2120 else {
2121 /*
2122 * ifloor() is easier.
2123 */
2124
2125 *out_ipart = lp_build_ifloor(bld, a);
2126 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2127 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2128 }
2129 }
2130
2131
2132 /**
2133 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2134 * always smaller than one.
2135 */
2136 void
2137 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2138 LLVMValueRef a,
2139 LLVMValueRef *out_ipart,
2140 LLVMValueRef *out_fpart)
2141 {
2142 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2143 *out_fpart = clamp_fract(bld, *out_fpart);
2144 }
2145
2146
2147 LLVMValueRef
2148 lp_build_sqrt(struct lp_build_context *bld,
2149 LLVMValueRef a)
2150 {
2151 LLVMBuilderRef builder = bld->gallivm->builder;
2152 const struct lp_type type = bld->type;
2153 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2154 char intrinsic[32];
2155
2156 assert(lp_check_value(type, a));
2157
2158 /* TODO: optimize the constant case */
2159
2160 assert(type.floating);
2161 if (type.length == 1) {
2162 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2163 }
2164 else {
2165 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2166 }
2167
2168 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2169 }
2170
2171
2172 /**
2173  * Do one Newton-Raphson step to improve reciprocal precision:
2174 *
2175 * x_{i+1} = x_i * (2 - a * x_i)
2176 *
2177 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2178 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2179  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2180 * halo. It would be necessary to clamp the argument to prevent this.
2181 *
2182 * See also:
2183 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2184 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2185 */
2186 static INLINE LLVMValueRef
2187 lp_build_rcp_refine(struct lp_build_context *bld,
2188 LLVMValueRef a,
2189 LLVMValueRef rcp_a)
2190 {
2191 LLVMBuilderRef builder = bld->gallivm->builder;
2192 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2193 LLVMValueRef res;
2194
2195 res = LLVMBuildFMul(builder, a, rcp_a, "");
2196 res = LLVMBuildFSub(builder, two, res, "");
2197 res = LLVMBuildFMul(builder, rcp_a, res, "");
2198
2199 return res;
2200 }
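
/*
 * Rough numeric illustration of one refinement step: with a = 3.0 and an
 * initial estimate rcp_a = 0.33 (about what RCPPS would give), one step
 * yields 0.33 * (2 - 3.0 * 0.33) = 0.33 * 1.01 = 0.3333; a second step
 * gives 0.33333..., i.e. the error roughly squares with each iteration.
 * For a = 0 (rcp_a = +Inf) the product a * rcp_a is NaN, which is the
 * non-conformant behaviour described above.
 */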
2201
2202
2203 LLVMValueRef
2204 lp_build_rcp(struct lp_build_context *bld,
2205 LLVMValueRef a)
2206 {
2207 LLVMBuilderRef builder = bld->gallivm->builder;
2208 const struct lp_type type = bld->type;
2209
2210 assert(lp_check_value(type, a));
2211
2212 if(a == bld->zero)
2213 return bld->undef;
2214 if(a == bld->one)
2215 return bld->one;
2216 if(a == bld->undef)
2217 return bld->undef;
2218
2219 assert(type.floating);
2220
2221 if(LLVMIsConstant(a))
2222 return LLVMConstFDiv(bld->one, a);
2223
2224 /*
2225 * We don't use RCPPS because:
2226     * - it only has 10 bits of precision
2227     * - it doesn't even get the reciprocal of 1.0 exactly
2228     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2229     * - for recent processors the benefit over DIVPS is marginal, and case
2230     *   dependent
2231     *
2232     * We could still use it on certain processors if benchmarks show that the
2233     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2234     * particular uses that require fewer workarounds.
2235 */
2236
2237 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2238 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2239 const unsigned num_iterations = 0;
2240 LLVMValueRef res;
2241 unsigned i;
2242 const char *intrinsic = NULL;
2243
2244 if (type.length == 4) {
2245 intrinsic = "llvm.x86.sse.rcp.ps";
2246 }
2247 else {
2248 intrinsic = "llvm.x86.avx.rcp.ps.256";
2249 }
2250
2251 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2252
2253 for (i = 0; i < num_iterations; ++i) {
2254 res = lp_build_rcp_refine(bld, a, res);
2255 }
2256
2257 return res;
2258 }
2259
2260 return LLVMBuildFDiv(builder, bld->one, a, "");
2261 }
2262
2263
2264 /**
2265 * Do one Newton-Raphson step to improve rsqrt precision:
2266 *
2267 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2268 *
2269 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2270 */
2271 static INLINE LLVMValueRef
2272 lp_build_rsqrt_refine(struct lp_build_context *bld,
2273 LLVMValueRef a,
2274 LLVMValueRef rsqrt_a)
2275 {
2276 LLVMBuilderRef builder = bld->gallivm->builder;
2277 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2278 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2279 LLVMValueRef res;
2280
2281 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2282 res = LLVMBuildFMul(builder, a, res, "");
2283 res = LLVMBuildFSub(builder, three, res, "");
2284 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2285 res = LLVMBuildFMul(builder, half, res, "");
2286
2287 return res;
2288 }
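
/*
 * Rough numeric illustration: with a = 4.0 and an initial estimate
 * rsqrt_a = 0.51, one step gives 0.5 * 0.51 * (3 - 4.0 * 0.51 * 0.51)
 * = 0.5 * 0.51 * 1.9596 = 0.4997, already close to the exact 0.5.
 */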
2289
2290
2291 /**
2292 * Generate 1/sqrt(a).
2293 * Result is undefined for values < 0, infinity for +0.
2294 */
2295 LLVMValueRef
2296 lp_build_rsqrt(struct lp_build_context *bld,
2297 LLVMValueRef a)
2298 {
2299 LLVMBuilderRef builder = bld->gallivm->builder;
2300 const struct lp_type type = bld->type;
2301
2302 assert(lp_check_value(type, a));
2303
2304 assert(type.floating);
2305
2306 /*
2307 * This should be faster but all denormals will end up as infinity.
2308 */
2309 if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2310 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
2311 const unsigned num_iterations = 1;
2312 LLVMValueRef res;
2313 unsigned i;
2314 const char *intrinsic = NULL;
2315
2316 if (type.length == 4) {
2317 intrinsic = "llvm.x86.sse.rsqrt.ps";
2318 }
2319 else {
2320 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2321 }
2322 if (num_iterations) {
2323 /*
2324 * Newton-Raphson will result in NaN instead of infinity for zero,
2325 * and NaN instead of zero for infinity.
2326 * Also, need to ensure rsqrt(1.0) == 1.0.
2327 * All numbers smaller than FLT_MIN will result in +infinity
2328 * (rsqrtps treats all denormals as zero).
2329 */
2330 /*
2331          * Certain non-c99 compilers don't know INFINITY and might not support
2332          * hacks to evaluate it at compile time either.
2333 */
2334 const unsigned posinf_int = 0x7F800000;
2335 LLVMValueRef cmp;
2336 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2337 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2338
2339 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2340
2341 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2342
2343 for (i = 0; i < num_iterations; ++i) {
2344 res = lp_build_rsqrt_refine(bld, a, res);
2345 }
2346 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2347 res = lp_build_select(bld, cmp, inf, res);
2348 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2349 res = lp_build_select(bld, cmp, bld->zero, res);
2350 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2351 res = lp_build_select(bld, cmp, bld->one, res);
2352 }
2353 else {
2354 /* rsqrt(1.0) != 1.0 here */
2355 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2356
2357 }
2358
2359 return res;
2360 }
2361
2362 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2363 }
2364
2365
2366 /**
2367 * Generate sin(a) using SSE2
2368 */
2369 LLVMValueRef
2370 lp_build_sin(struct lp_build_context *bld,
2371 LLVMValueRef a)
2372 {
2373 struct gallivm_state *gallivm = bld->gallivm;
2374 LLVMBuilderRef builder = gallivm->builder;
2375 struct lp_type int_type = lp_int_type(bld->type);
2376 LLVMBuilderRef b = builder;
2377
2378 /*
2379 * take the absolute value,
2380 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2381 */
2382
2383 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2384 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2385
2386 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2387 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2388
2389 /*
2390 * extract the sign bit (upper one)
2391 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2392 */
2393 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2394 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2395
2396 /*
2397 * scale by 4/Pi
2398 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2399 */
2400
2401 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2402 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2403
2404 /*
2405 * store the integer part of y in mm0
2406 * emm2 = _mm_cvttps_epi32(y);
2407 */
2408
2409 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2410
2411 /*
2412 * j=(j+1) & (~1) (see the cephes sources)
2413 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2414 */
2415
2416 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2417 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2418 /*
2419 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2420 */
2421 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2422 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2423
2424 /*
2425 * y = _mm_cvtepi32_ps(emm2);
2426 */
2427 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2428
2429 /* get the swap sign flag
2430 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2431 */
2432 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2433 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2434
2435 /*
2436 * emm2 = _mm_slli_epi32(emm0, 29);
2437 */
2438 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2439 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2440
2441 /*
2442     * get the polynomial selection mask
2443     * there is one polynomial for 0 <= x <= Pi/4
2444     * and another one for Pi/4 < x <= Pi/2
2445 * Both branches will be computed.
2446 *
2447 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2448 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2449 */
2450
2451 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2452 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2453 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2454 int_type, PIPE_FUNC_EQUAL,
2455 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2456 /*
2457 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2458 */
2459 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2460
2461 /*
2462 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2463 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2464 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2465 */
2466 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2467 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2468 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2469
2470 /*
2471 * The magic pass: "Extended precision modular arithmetic"
2472 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2473 * xmm1 = _mm_mul_ps(y, xmm1);
2474 * xmm2 = _mm_mul_ps(y, xmm2);
2475 * xmm3 = _mm_mul_ps(y, xmm3);
2476 */
2477 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2478 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2479 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2480
2481 /*
2482 * x = _mm_add_ps(x, xmm1);
2483 * x = _mm_add_ps(x, xmm2);
2484 * x = _mm_add_ps(x, xmm3);
2485 */
2486
2487 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2488 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2489 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2490
2491 /*
2492     * Evaluate the first polynomial (0 <= x <= Pi/4)
2493 *
2494 * z = _mm_mul_ps(x,x);
2495 */
2496 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2497
2498 /*
2499 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2500 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2501 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2502 */
2503 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2504 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2505 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2506
2507 /*
2508 * y = *(v4sf*)_ps_coscof_p0;
2509 * y = _mm_mul_ps(y, z);
2510 */
2511 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2512 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2513 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2514 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2515 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2516 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2517
2518
2519 /*
2520 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2521 * y = _mm_sub_ps(y, tmp);
2522 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2523 */
2524 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2525 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2526 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2527 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2528 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2529
2530 /*
2531 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2532 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2533 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2534 */
2535 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2536 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2537 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2538
2539 /*
2540     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2541 *
2542 * y2 = *(v4sf*)_ps_sincof_p0;
2543 * y2 = _mm_mul_ps(y2, z);
2544 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2545 * y2 = _mm_mul_ps(y2, z);
2546 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2547 * y2 = _mm_mul_ps(y2, z);
2548 * y2 = _mm_mul_ps(y2, x);
2549 * y2 = _mm_add_ps(y2, x);
2550 */
2551
2552 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2553 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2554 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2555 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2556 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2557 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2558 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2559
2560 /*
2561     * select the correct result from the two polynomials
2562 * xmm3 = poly_mask;
2563 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2564 * y = _mm_andnot_ps(xmm3, y);
2565 * y = _mm_add_ps(y,y2);
2566 */
2567 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2568 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2569 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2570 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2571 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2572 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2573 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2574
2575 /*
2576 * update the sign
2577 * y = _mm_xor_ps(y, sign_bit);
2578 */
2579 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2580 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2581 return y_result;
2582 }
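
/*
 * Range-reduction walk-through for a = 1.0 (illustrative): y = 1.0 * 4/Pi
 * = 1.273, so j = (1 + 1) & ~1 = 2. Since j & 4 = 0 the sign is unchanged,
 * and j & 2 != 0 selects the cosine polynomial. The reduced argument is
 * x = 1.0 - 2 * Pi/4 = -0.5708, and cos(-0.5708) = 0.8415 = sin(1.0).
 */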
2583
2584
2585 /**
2586 * Generate cos(a) using SSE2
2587 */
2588 LLVMValueRef
2589 lp_build_cos(struct lp_build_context *bld,
2590 LLVMValueRef a)
2591 {
2592 struct gallivm_state *gallivm = bld->gallivm;
2593 LLVMBuilderRef builder = gallivm->builder;
2594 struct lp_type int_type = lp_int_type(bld->type);
2595 LLVMBuilderRef b = builder;
2596
2597 /*
2598 * take the absolute value,
2599 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2600 */
2601
2602 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2603 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2604
2605 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2606 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2607
2608 /*
2609 * scale by 4/Pi
2610 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2611 */
2612
2613 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2614 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2615
2616 /*
2617 * store the integer part of y in mm0
2618 * emm2 = _mm_cvttps_epi32(y);
2619 */
2620
2621 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2622
2623 /*
2624 * j=(j+1) & (~1) (see the cephes sources)
2625 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2626 */
2627
2628 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2629 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2630 /*
2631 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2632 */
2633 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2634 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2635
2636 /*
2637 * y = _mm_cvtepi32_ps(emm2);
2638 */
2639 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2640
2641
2642 /*
2643 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2644 */
2645 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2646 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2647
2648
2649 /* get the swap sign flag
2650 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2651 */
2652 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2653 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2654 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2655 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2656
2657 /*
2658 * emm2 = _mm_slli_epi32(emm0, 29);
2659 */
2660 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2661 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2662
2663 /*
2664     * get the polynomial selection mask
2665     * there is one polynomial for 0 <= x <= Pi/4
2666     * and another one for Pi/4 < x <= Pi/2
2667 * Both branches will be computed.
2668 *
2669 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2670 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2671 */
2672
2673 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2674 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2675 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2676 int_type, PIPE_FUNC_EQUAL,
2677 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2678
2679 /*
2680 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2681 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2682 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2683 */
2684 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2685 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2686 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2687
2688 /*
2689 * The magic pass: "Extended precision modular arithmetic"
2690 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2691 * xmm1 = _mm_mul_ps(y, xmm1);
2692 * xmm2 = _mm_mul_ps(y, xmm2);
2693 * xmm3 = _mm_mul_ps(y, xmm3);
2694 */
2695 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2696 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2697 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2698
2699 /*
2700 * x = _mm_add_ps(x, xmm1);
2701 * x = _mm_add_ps(x, xmm2);
2702 * x = _mm_add_ps(x, xmm3);
2703 */
2704
2705 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2706 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2707 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2708
2709 /*
2710     * Evaluate the first polynomial (0 <= x <= Pi/4)
2711 *
2712 * z = _mm_mul_ps(x,x);
2713 */
2714 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2715
2716 /*
2717 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2718 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2719 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2720 */
2721 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2722 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2723 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2724
2725 /*
2726 * y = *(v4sf*)_ps_coscof_p0;
2727 * y = _mm_mul_ps(y, z);
2728 */
2729 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2730 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2731 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2732 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2733 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2734 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2735
2736
2737 /*
2738 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2739 * y = _mm_sub_ps(y, tmp);
2740 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2741 */
2742 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2743 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2744 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2745 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2746 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2747
2748 /*
2749 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2750 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2751 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2752 */
2753 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2754 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2755 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2756
2757 /*
2758     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2759 *
2760 * y2 = *(v4sf*)_ps_sincof_p0;
2761 * y2 = _mm_mul_ps(y2, z);
2762 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2763 * y2 = _mm_mul_ps(y2, z);
2764 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2765 * y2 = _mm_mul_ps(y2, z);
2766 * y2 = _mm_mul_ps(y2, x);
2767 * y2 = _mm_add_ps(y2, x);
2768 */
2769
2770 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2771 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2772 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2773 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2774 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2775 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2776 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2777
2778 /*
2779     * select the correct result from the two polynomials
2780 * xmm3 = poly_mask;
2781 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2782 * y = _mm_andnot_ps(xmm3, y);
2783 * y = _mm_add_ps(y,y2);
2784 */
2785 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2786 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2787 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2788 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2789 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2790 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2791
2792 /*
2793 * update the sign
2794 * y = _mm_xor_ps(y, sign_bit);
2795 */
2796 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2797 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2798 return y_result;
2799 }
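
/*
 * Same walk-through for cos(1.0) (illustrative): j = 2 as for sin, then
 * j - 2 = 0, so ~(j - 2) & 4 = 4 sets the sign-flip bit and (j - 2) & 2 = 0
 * selects the sine polynomial. The reduced argument is again -0.5708,
 * sin(-0.5708) = -0.5403, and the sign flip gives cos(1.0) = 0.5403.
 */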
2800
2801
2802 /**
2803 * Generate pow(x, y)
2804 */
2805 LLVMValueRef
2806 lp_build_pow(struct lp_build_context *bld,
2807 LLVMValueRef x,
2808 LLVMValueRef y)
2809 {
2810 /* TODO: optimize the constant case */
2811 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2812 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2813 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2814 __FUNCTION__);
2815 }
2816
2817 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2818 }
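
/*
 * For example, pow(2.0, 10.0) = exp2(log2(2.0) * 10.0) = exp2(10.0) = 1024.
 * Since the identity goes through log2, the result is not meaningful for
 * x <= 0 (negative bases and zero are not handled).
 */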
2819
2820
2821 /**
2822 * Generate exp(x)
2823 */
2824 LLVMValueRef
2825 lp_build_exp(struct lp_build_context *bld,
2826 LLVMValueRef x)
2827 {
2828 /* log2(e) = 1/log(2) */
2829 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2830 1.4426950408889634);
2831
2832 assert(lp_check_value(bld->type, x));
2833
2834 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2835 }
2836
2837
2838 /**
2839 * Generate log(x)
2840 */
2841 LLVMValueRef
2842 lp_build_log(struct lp_build_context *bld,
2843 LLVMValueRef x)
2844 {
2845 /* log(2) */
2846 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2847 0.69314718055994529);
2848
2849 assert(lp_check_value(bld->type, x));
2850
2851 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2852 }
2853
2854
2855 /**
2856 * Generate polynomial.
2857 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2858 */
2859 static LLVMValueRef
2860 lp_build_polynomial(struct lp_build_context *bld,
2861 LLVMValueRef x,
2862 const double *coeffs,
2863 unsigned num_coeffs)
2864 {
2865 const struct lp_type type = bld->type;
2866 LLVMValueRef even = NULL, odd = NULL;
2867 LLVMValueRef x2;
2868 unsigned i;
2869
2870 assert(lp_check_value(bld->type, x));
2871
2872 /* TODO: optimize the constant case */
2873 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2874 LLVMIsConstant(x)) {
2875 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2876 __FUNCTION__);
2877 }
2878
2879 /*
2880     * Calculate odd and even terms separately to decrease data dependency
2881 * Ex:
2882 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2883 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2884 */
2885 x2 = lp_build_mul(bld, x, x);
2886
2887 for (i = num_coeffs; i--; ) {
2888 LLVMValueRef coeff;
2889
2890 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2891
2892 if (i % 2 == 0) {
2893 if (even)
2894 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2895 else
2896 even = coeff;
2897 } else {
2898 if (odd)
2899 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2900 else
2901 odd = coeff;
2902 }
2903 }
2904
2905 if (odd)
2906 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2907 else if (even)
2908 return even;
2909 else
2910 return bld->undef;
2911 }
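
/*
 * A scalar reference of the even/odd split above (illustrative sketch,
 * not part of the gallivm API): for num_coeffs == 4 it computes
 * (c[0] + c[2]*x^2) + x*(c[1] + c[3]*x^2).
 *
 *    double x2 = x * x, even = 0.0, odd = 0.0;
 *    for (int i = num_coeffs - 1; i >= 0; i--) {
 *       if (i % 2 == 0)
 *          even = coeffs[i] + x2 * even;
 *       else
 *          odd = coeffs[i] + x2 * odd;
 *    }
 *    return odd * x + even;
 */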
2912
2913
2914 /**
2915 * Minimax polynomial fit of 2**x, in range [0, 1[
2916 */
2917 const double lp_build_exp2_polynomial[] = {
2918 #if EXP_POLY_DEGREE == 5
2919 0.999999925063526176901,
2920 0.693153073200168932794,
2921 0.240153617044375388211,
2922 0.0558263180532956664775,
2923 0.00898934009049466391101,
2924 0.00187757667519147912699
2925 #elif EXP_POLY_DEGREE == 4
2926 1.00000259337069434683,
2927 0.693003834469974940458,
2928 0.24144275689150793076,
2929 0.0520114606103070150235,
2930 0.0135341679161270268764
2931 #elif EXP_POLY_DEGREE == 3
2932 0.999925218562710312959,
2933 0.695833540494823811697,
2934 0.226067155427249155588,
2935 0.0780245226406372992967
2936 #elif EXP_POLY_DEGREE == 2
2937 1.00172476321474503578,
2938 0.657636275736077639316,
2939 0.33718943461968720704
2940 #else
2941 #error
2942 #endif
2943 };
2944
2945
2946 void
2947 lp_build_exp2_approx(struct lp_build_context *bld,
2948 LLVMValueRef x,
2949 LLVMValueRef *p_exp2_int_part,
2950 LLVMValueRef *p_frac_part,
2951 LLVMValueRef *p_exp2)
2952 {
2953 LLVMBuilderRef builder = bld->gallivm->builder;
2954 const struct lp_type type = bld->type;
2955 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2956 LLVMValueRef ipart = NULL;
2957 LLVMValueRef fpart = NULL;
2958 LLVMValueRef expipart = NULL;
2959 LLVMValueRef expfpart = NULL;
2960 LLVMValueRef res = NULL;
2961
2962 assert(lp_check_value(bld->type, x));
2963
2964 if(p_exp2_int_part || p_frac_part || p_exp2) {
2965 /* TODO: optimize the constant case */
2966 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2967 LLVMIsConstant(x)) {
2968 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2969 __FUNCTION__);
2970 }
2971
2972 assert(type.floating && type.width == 32);
2973
2974 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
2975 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2976
2977 /* ipart = floor(x) */
2978 /* fpart = x - ipart */
2979 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2980 }
2981
2982 if(p_exp2_int_part || p_exp2) {
2983 /* expipart = (float) (1 << ipart) */
2984 expipart = LLVMBuildAdd(builder, ipart,
2985 lp_build_const_int_vec(bld->gallivm, type, 127), "");
2986 expipart = LLVMBuildShl(builder, expipart,
2987 lp_build_const_int_vec(bld->gallivm, type, 23), "");
2988 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2989 }
2990
2991 if(p_exp2) {
2992 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2993 Elements(lp_build_exp2_polynomial));
2994
2995 res = LLVMBuildFMul(builder, expipart, expfpart, "");
2996 }
2997
2998 if(p_exp2_int_part)
2999 *p_exp2_int_part = expipart;
3000
3001 if(p_frac_part)
3002 *p_frac_part = fpart;
3003
3004 if(p_exp2)
3005 *p_exp2 = res;
3006 }
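
/*
 * For example (illustrative): x = 2.5 splits into ipart = 2, fpart = 0.5.
 * The integer part becomes (2 + 127) << 23, which reinterpreted as a float
 * is 4.0, and the polynomial gives expfpart = 2^0.5 = 1.4142, so the
 * product is 5.657 = 2^2.5. The clamps above keep the biased exponent
 * within the representable range.
 */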
3007
3008
3009 LLVMValueRef
3010 lp_build_exp2(struct lp_build_context *bld,
3011 LLVMValueRef x)
3012 {
3013 LLVMValueRef res;
3014 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3015 return res;
3016 }
3017
3018
3019 /**
3020 * Extract the exponent of a IEEE-754 floating point value.
3021 *
3022 * Optionally apply an integer bias.
3023 *
3024 * Result is an integer value with
3025 *
3026 * ifloor(log2(x)) + bias
3027 */
3028 LLVMValueRef
3029 lp_build_extract_exponent(struct lp_build_context *bld,
3030 LLVMValueRef x,
3031 int bias)
3032 {
3033 LLVMBuilderRef builder = bld->gallivm->builder;
3034 const struct lp_type type = bld->type;
3035 unsigned mantissa = lp_mantissa(type);
3036 LLVMValueRef res;
3037
3038 assert(type.floating);
3039
3040 assert(lp_check_value(bld->type, x));
3041
3042 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3043
3044 res = LLVMBuildLShr(builder, x,
3045 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3046 res = LLVMBuildAnd(builder, res,
3047 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3048 res = LLVMBuildSub(builder, res,
3049 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3050
3051 return res;
3052 }
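
/*
 * For example: x = 8.0 has biased exponent bits 130, so the result is
 * 130 - 127 + bias = 3 + bias = ifloor(log2(8.0)) + bias.
 */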
3053
3054
3055 /**
3056  * Extract the mantissa of a floating point value.
3057  *
3058  * Result is a floating point value with
3059  *
3060  *   x / 2**floor(log2(x))
3061 */
3062 LLVMValueRef
3063 lp_build_extract_mantissa(struct lp_build_context *bld,
3064 LLVMValueRef x)
3065 {
3066 LLVMBuilderRef builder = bld->gallivm->builder;
3067 const struct lp_type type = bld->type;
3068 unsigned mantissa = lp_mantissa(type);
3069 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3070 (1ULL << mantissa) - 1);
3071 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3072 LLVMValueRef res;
3073
3074 assert(lp_check_value(bld->type, x));
3075
3076 assert(type.floating);
3077
3078 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3079
3080 /* res = x / 2**ipart */
3081 res = LLVMBuildAnd(builder, x, mantmask, "");
3082 res = LLVMBuildOr(builder, res, one, "");
3083 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3084
3085 return res;
3086 }
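
/*
 * For example: x = 12.0 = 1.5 * 2^3. Its mantissa bits OR'ed with the bit
 * pattern of 1.0 give 1.5, i.e. 12.0 / 2^ifloor(log2(12.0)).
 */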
3087
3088
3089
3090 /**
3091  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
3092  * These coefficients can be generated with
3093 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3094 */
3095 const double lp_build_log2_polynomial[] = {
3096 #if LOG_POLY_DEGREE == 5
3097 2.88539008148777786488L,
3098 0.961796878841293367824L,
3099 0.577058946784739859012L,
3100 0.412914355135828735411L,
3101 0.308591899232910175289L,
3102 0.352376952300281371868L,
3103 #elif LOG_POLY_DEGREE == 4
3104 2.88539009343309178325L,
3105 0.961791550404184197881L,
3106 0.577440339438736392009L,
3107 0.403343858251329912514L,
3108 0.406718052498846252698L,
3109 #elif LOG_POLY_DEGREE == 3
3110 2.88538959748872753838L,
3111 0.961932915889597772928L,
3112 0.571118517972136195241L,
3113 0.493997535084709500285L,
3114 #else
3115 #error
3116 #endif
3117 };
3118
3119 /**
3120 * See http://www.devmaster.net/forums/showthread.php?p=43580
3121 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3122 * http://www.nezumi.demon.co.uk/consult/logx.htm
3123 */
3124 void
3125 lp_build_log2_approx(struct lp_build_context *bld,
3126 LLVMValueRef x,
3127 LLVMValueRef *p_exp,
3128 LLVMValueRef *p_floor_log2,
3129 LLVMValueRef *p_log2)
3130 {
3131 LLVMBuilderRef builder = bld->gallivm->builder;
3132 const struct lp_type type = bld->type;
3133 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3134 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3135
3136 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3137 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3138 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3139
3140 LLVMValueRef i = NULL;
3141 LLVMValueRef y = NULL;
3142 LLVMValueRef z = NULL;
3143 LLVMValueRef exp = NULL;
3144 LLVMValueRef mant = NULL;
3145 LLVMValueRef logexp = NULL;
3146 LLVMValueRef logmant = NULL;
3147 LLVMValueRef res = NULL;
3148
3149 assert(lp_check_value(bld->type, x));
3150
3151 if(p_exp || p_floor_log2 || p_log2) {
3152 /* TODO: optimize the constant case */
3153 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3154 LLVMIsConstant(x)) {
3155 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3156 __FUNCTION__);
3157 }
3158
3159 assert(type.floating && type.width == 32);
3160
3161 /*
3162 * We don't explicitly handle denormalized numbers. They will yield a
3163        * result in the neighbourhood of -127, which appears to be
3164        * adequate.
3165 */
3166
3167 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3168
3169 /* exp = (float) exponent(x) */
3170 exp = LLVMBuildAnd(builder, i, expmask, "");
3171 }
3172
3173 if(p_floor_log2 || p_log2) {
3174 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3175 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3176 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3177 }
3178
3179 if(p_log2) {
3180 /* mant = 1 + (float) mantissa(x) */
3181 mant = LLVMBuildAnd(builder, i, mantmask, "");
3182 mant = LLVMBuildOr(builder, mant, one, "");
3183 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3184
3185 /* y = (mant - 1) / (mant + 1) */
3186 y = lp_build_div(bld,
3187 lp_build_sub(bld, mant, bld->one),
3188 lp_build_add(bld, mant, bld->one)
3189 );
3190
3191 /* z = y^2 */
3192 z = lp_build_mul(bld, y, y);
3193
3194 /* compute P(z) */
3195 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3196 Elements(lp_build_log2_polynomial));
3197
3198 /* logmant = y * P(z) */
3199 logmant = lp_build_mul(bld, y, logmant);
3200
3201 res = lp_build_add(bld, logmant, logexp);
3202 }
3203
3204 if(p_exp) {
3205 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3206 *p_exp = exp;
3207 }
3208
3209 if(p_floor_log2)
3210 *p_floor_log2 = logexp;
3211
3212 if(p_log2)
3213 *p_log2 = res;
3214 }
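
/*
 * Worked example (illustrative): x = 3.0 = 1.5 * 2^1, so logexp = 1 and
 * mant = 1.5. Then y = (1.5 - 1) / (1.5 + 1) = 0.2, z = 0.04, and
 * y * P(z) = 0.585, giving log2(3.0) = 1 + 0.585 = 1.585.
 */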
3215
3216
3217 LLVMValueRef
3218 lp_build_log2(struct lp_build_context *bld,
3219 LLVMValueRef x)
3220 {
3221 LLVMValueRef res;
3222 lp_build_log2_approx(bld, x, NULL, NULL, &res);
3223 return res;
3224 }
3225
3226
3227 /**
3228 * Faster (and less accurate) log2.
3229 *
3230 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3231 *
3232 * Piece-wise linear approximation, with exact results when x is a
3233 * power of two.
3234 *
3235 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3236 */
3237 LLVMValueRef
3238 lp_build_fast_log2(struct lp_build_context *bld,
3239 LLVMValueRef x)
3240 {
3241 LLVMBuilderRef builder = bld->gallivm->builder;
3242 LLVMValueRef ipart;
3243 LLVMValueRef fpart;
3244
3245 assert(lp_check_value(bld->type, x));
3246
3247 assert(bld->type.floating);
3248
3249 /* ipart = floor(log2(x)) - 1 */
3250 ipart = lp_build_extract_exponent(bld, x, -1);
3251 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3252
3253 /* fpart = x / 2**ipart */
3254 fpart = lp_build_extract_mantissa(bld, x);
3255
3256 /* ipart + fpart */
3257 return LLVMBuildFAdd(builder, ipart, fpart, "");
3258 }
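
/*
 * For example: x = 12.0 gives ipart = 3 - 1 = 2 and fpart = 12/2^3 = 1.5,
 * so fast_log2(12.0) = 3.5 (the exact value is 3.585). At powers of two,
 * e.g. x = 8.0, ipart = 2 and fpart = 1.0 give the exact result 3.0.
 */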
3259
3260
3261 /**
3262 * Fast implementation of iround(log2(x)).
3263 *
3264 * Not an approximation -- it should give accurate results all the time.
3265 */
3266 LLVMValueRef
3267 lp_build_ilog2(struct lp_build_context *bld,
3268 LLVMValueRef x)
3269 {
3270 LLVMBuilderRef builder = bld->gallivm->builder;
3271 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3272 LLVMValueRef ipart;
3273
3274 assert(bld->type.floating);
3275
3276 assert(lp_check_value(bld->type, x));
3277
3278 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3279 x = LLVMBuildFMul(builder, x, sqrt2, "");
3280
3281 /* ipart = floor(log2(x) + 0.5) */
3282 ipart = lp_build_extract_exponent(bld, x, 0);
3283
3284 return ipart;
3285 }
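
/*
 * For example: x = 5.0 scaled by sqrt(2) is 7.07, whose exponent is 2
 * = iround(log2(5.0)); x = 6.0 scaled is 8.49, whose exponent is 3
 * = iround(2.585). The sqrt(2) scaling turns the floor of the exponent
 * extraction into round-to-nearest.
 */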
3286
3287 LLVMValueRef
3288 lp_build_mod(struct lp_build_context *bld,
3289 LLVMValueRef x,
3290 LLVMValueRef y)
3291 {
3292 LLVMBuilderRef builder = bld->gallivm->builder;
3293 LLVMValueRef res;
3294 const struct lp_type type = bld->type;
3295
3296 assert(lp_check_value(type, x));
3297 assert(lp_check_value(type, y));
3298
3299 if (type.floating)
3300 res = LLVMBuildFRem(builder, x, y, "");
3301 else if (type.sign)
3302 res = LLVMBuildSRem(builder, x, y, "");
3303 else
3304 res = LLVMBuildURem(builder, x, y, "");
3305 return res;
3306 }