gallivm: (trivial) simplify lp_build_cos/lp_build_sin a tiny bit
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65 #include "lp_bld_flow.h"
66
67
68 #define EXP_POLY_DEGREE 5
69
70 #define LOG_POLY_DEGREE 4
71
72
73 /**
74 * Generate min(a, b)
75 * No checks are done for the special case values a or b = 1 or 0.
76 */
77 static LLVMValueRef
78 lp_build_min_simple(struct lp_build_context *bld,
79 LLVMValueRef a,
80 LLVMValueRef b)
81 {
82 const struct lp_type type = bld->type;
83 const char *intrinsic = NULL;
84 unsigned intr_size = 0;
85 LLVMValueRef cond;
86
87 assert(lp_check_value(type, a));
88 assert(lp_check_value(type, b));
89
90 /* TODO: optimize the constant case */
91
92 if (type.floating && util_cpu_caps.has_sse) {
93 if (type.width == 32) {
94 if (type.length == 1) {
95 intrinsic = "llvm.x86.sse.min.ss";
96 intr_size = 128;
97 }
98 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
99 intrinsic = "llvm.x86.sse.min.ps";
100 intr_size = 128;
101 }
102 else {
103 intrinsic = "llvm.x86.avx.min.ps.256";
104 intr_size = 256;
105 }
106 }
107 if (type.width == 64 && util_cpu_caps.has_sse2) {
108 if (type.length == 1) {
109 intrinsic = "llvm.x86.sse2.min.sd";
110 intr_size = 128;
111 }
112 else if (type.length == 2 || !util_cpu_caps.has_avx) {
113 intrinsic = "llvm.x86.sse2.min.pd";
114 intr_size = 128;
115 }
116 else {
117 intrinsic = "llvm.x86.avx.min.pd.256";
118 intr_size = 256;
119 }
120 }
121 }
122 else if (type.floating && util_cpu_caps.has_altivec) {
123 if (type.width == 32 && type.length == 4) {
124 intrinsic = "llvm.ppc.altivec.vminfp";
125 intr_size = 128;
126 }
127 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
128 intr_size = 128;
129 if ((type.width == 8 || type.width == 16) &&
130 (type.width * type.length <= 64) &&
131 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
132 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
133 __FUNCTION__);
134 }
135 if (type.width == 8 && !type.sign) {
136 intrinsic = "llvm.x86.sse2.pminu.b";
137 }
138 else if (type.width == 16 && type.sign) {
139 intrinsic = "llvm.x86.sse2.pmins.w";
140 }
141 if (util_cpu_caps.has_sse4_1) {
142 if (type.width == 8 && type.sign) {
143 intrinsic = "llvm.x86.sse41.pminsb";
144 }
145 if (type.width == 16 && !type.sign) {
146 intrinsic = "llvm.x86.sse41.pminuw";
147 }
148 if (type.width == 32 && !type.sign) {
149 intrinsic = "llvm.x86.sse41.pminud";
150 }
151 if (type.width == 32 && type.sign) {
152 intrinsic = "llvm.x86.sse41.pminsd";
153 }
154 }
155 } else if (util_cpu_caps.has_altivec) {
156 intr_size = 128;
157 if (type.width == 8) {
158 if (!type.sign) {
159 intrinsic = "llvm.ppc.altivec.vminub";
160 } else {
161 intrinsic = "llvm.ppc.altivec.vminsb";
162 }
163 } else if (type.width == 16) {
164 if (!type.sign) {
165 intrinsic = "llvm.ppc.altivec.vminuh";
166 } else {
167 intrinsic = "llvm.ppc.altivec.vminsh";
168 }
169 } else if (type.width == 32) {
170 if (!type.sign) {
171 intrinsic = "llvm.ppc.altivec.vminuw";
172 } else {
173 intrinsic = "llvm.ppc.altivec.vminsw";
174 }
175 }
176 }
177
178 if(intrinsic) {
179 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180 type,
181 intr_size, a, b);
182 }
183
184 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
185 return lp_build_select(bld, cond, a, b);
186 }
187
188
189 /**
190 * Generate max(a, b)
191 * No checks are done for the special case values a or b = 1 or 0.
192 */
193 static LLVMValueRef
194 lp_build_max_simple(struct lp_build_context *bld,
195 LLVMValueRef a,
196 LLVMValueRef b)
197 {
198 const struct lp_type type = bld->type;
199 const char *intrinsic = NULL;
200 unsigned intr_size = 0;
201 LLVMValueRef cond;
202
203 assert(lp_check_value(type, a));
204 assert(lp_check_value(type, b));
205
206 /* TODO: optimize the constant case */
207
208 if (type.floating && util_cpu_caps.has_sse) {
209 if (type.width == 32) {
210 if (type.length == 1) {
211 intrinsic = "llvm.x86.sse.max.ss";
212 intr_size = 128;
213 }
214 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
215 intrinsic = "llvm.x86.sse.max.ps";
216 intr_size = 128;
217 }
218 else {
219 intrinsic = "llvm.x86.avx.max.ps.256";
220 intr_size = 256;
221 }
222 }
223 if (type.width == 64 && util_cpu_caps.has_sse2) {
224 if (type.length == 1) {
225 intrinsic = "llvm.x86.sse2.max.sd";
226 intr_size = 128;
227 }
228 else if (type.length == 2 || !util_cpu_caps.has_avx) {
229 intrinsic = "llvm.x86.sse2.max.pd";
230 intr_size = 128;
231 }
232 else {
233 intrinsic = "llvm.x86.avx.max.pd.256";
234 intr_size = 256;
235 }
236 }
237 }
238 else if (type.floating && util_cpu_caps.has_altivec) {
239 if (type.width == 32 && type.length == 4) {
240 intrinsic = "llvm.ppc.altivec.vmaxfp";
241 intr_size = 128;
242 }
243 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
244 intr_size = 128;
245 if ((type.width == 8 || type.width == 16) &&
246 (type.width * type.length <= 64) &&
247 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
248 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
249 __FUNCTION__);
250 }
251 if (type.width == 8 && !type.sign) {
252 intrinsic = "llvm.x86.sse2.pmaxu.b";
253 intr_size = 128;
254 }
255 else if (type.width == 16 && type.sign) {
256 intrinsic = "llvm.x86.sse2.pmaxs.w";
257 }
258 if (util_cpu_caps.has_sse4_1) {
259 if (type.width == 8 && type.sign) {
260 intrinsic = "llvm.x86.sse41.pmaxsb";
261 }
262 if (type.width == 16 && !type.sign) {
263 intrinsic = "llvm.x86.sse41.pmaxuw";
264 }
265 if (type.width == 32 && !type.sign) {
266 intrinsic = "llvm.x86.sse41.pmaxud";
267 }
268 if (type.width == 32 && type.sign) {
269 intrinsic = "llvm.x86.sse41.pmaxsd";
270 }
271 }
272 } else if (util_cpu_caps.has_altivec) {
273 intr_size = 128;
274 if (type.width == 8) {
275 if (!type.sign) {
276 intrinsic = "llvm.ppc.altivec.vmaxub";
277 } else {
278 intrinsic = "llvm.ppc.altivec.vmaxsb";
279 }
280 } else if (type.width == 16) {
281 if (!type.sign) {
282 intrinsic = "llvm.ppc.altivec.vmaxuh";
283 } else {
284 intrinsic = "llvm.ppc.altivec.vmaxsh";
285 }
286 } else if (type.width == 32) {
287 if (!type.sign) {
288 intrinsic = "llvm.ppc.altivec.vmaxuw";
289 } else {
290 intrinsic = "llvm.ppc.altivec.vmaxsw";
291 }
292 }
293 }
294
295 if(intrinsic) {
296 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
297 type,
298 intr_size, a, b);
299 }
300
301 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
302 return lp_build_select(bld, cond, a, b);
303 }
304
305
306 /**
307 * Generate 1 - a, or ~a depending on bld->type.
308 */
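/*
 * For unsigned normalized integers the value 1.0 is represented as all
 * ones, so 1 - a and ~a are the same bit pattern; the bitwise-not path
 * below exploits that.
 */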
309 LLVMValueRef
310 lp_build_comp(struct lp_build_context *bld,
311 LLVMValueRef a)
312 {
313 LLVMBuilderRef builder = bld->gallivm->builder;
314 const struct lp_type type = bld->type;
315
316 assert(lp_check_value(type, a));
317
318 if(a == bld->one)
319 return bld->zero;
320 if(a == bld->zero)
321 return bld->one;
322
323 if(type.norm && !type.floating && !type.fixed && !type.sign) {
324 if(LLVMIsConstant(a))
325 return LLVMConstNot(a);
326 else
327 return LLVMBuildNot(builder, a, "");
328 }
329
330 if(LLVMIsConstant(a))
331 if (type.floating)
332 return LLVMConstFSub(bld->one, a);
333 else
334 return LLVMConstSub(bld->one, a);
335 else
336 if (type.floating)
337 return LLVMBuildFSub(builder, bld->one, a, "");
338 else
339 return LLVMBuildSub(builder, bld->one, a, "");
340 }
341
342
343 /**
344 * Generate a + b
345 */
346 LLVMValueRef
347 lp_build_add(struct lp_build_context *bld,
348 LLVMValueRef a,
349 LLVMValueRef b)
350 {
351 LLVMBuilderRef builder = bld->gallivm->builder;
352 const struct lp_type type = bld->type;
353 LLVMValueRef res;
354
355 assert(lp_check_value(type, a));
356 assert(lp_check_value(type, b));
357
358 if(a == bld->zero)
359 return b;
360 if(b == bld->zero)
361 return a;
362 if(a == bld->undef || b == bld->undef)
363 return bld->undef;
364
365 if(bld->type.norm) {
366 const char *intrinsic = NULL;
367
368 if(a == bld->one || b == bld->one)
369 return bld->one;
370
371 if (type.width * type.length == 128 &&
372 !type.floating && !type.fixed) {
373 if(util_cpu_caps.has_sse2) {
374 if(type.width == 8)
375 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
376 if(type.width == 16)
377 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
378 } else if (util_cpu_caps.has_altivec) {
379 if(type.width == 8)
380 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
381 if(type.width == 16)
382 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
383 }
384 }
385
386 if(intrinsic)
387 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
388 }
389
390 /* TODO: handle signed case */
391 if(type.norm && !type.floating && !type.fixed && !type.sign)
392 a = lp_build_min_simple(bld, a, lp_build_comp(bld, b));
393
394 if(LLVMIsConstant(a) && LLVMIsConstant(b))
395 if (type.floating)
396 res = LLVMConstFAdd(a, b);
397 else
398 res = LLVMConstAdd(a, b);
399 else
400 if (type.floating)
401 res = LLVMBuildFAdd(builder, a, b, "");
402 else
403 res = LLVMBuildAdd(builder, a, b, "");
404
405 /* clamp to ceiling of 1.0 */
406 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
407 res = lp_build_min_simple(bld, res, bld->one);
408
409 /* XXX clamp to floor of -1 or 0??? */
410
411 return res;
412 }
413
414
415 /** Return the scalar sum of the elements of a.
416 * Should avoid this operation whenever possible.
417 */
418 LLVMValueRef
419 lp_build_horizontal_add(struct lp_build_context *bld,
420 LLVMValueRef a)
421 {
422 LLVMBuilderRef builder = bld->gallivm->builder;
423 const struct lp_type type = bld->type;
424 LLVMValueRef index, res;
425 unsigned i, length;
426 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
427 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
428 LLVMValueRef vecres, elem2;
429
430 assert(lp_check_value(type, a));
431
432 if (type.length == 1) {
433 return a;
434 }
435
436 assert(!bld->type.norm);
437
438 /*
439 * For byte vectors we could do much better with psadbw.
440 * We use repeated shuffle/adds here. Note that with multiple vectors
441 * this can be done more efficiently as outlined in the intel
442 * optimization manual.
443 * Note: could cause data rearrangement if used with smaller element
444 * sizes.
445 */
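/*
 * For example, with a 4-element vector a = {a0, a1, a2, a3} the loop
 * below computes {a0, a1} + {a2, a3} = {a0+a2, a1+a3}, and the final
 * scalar (a0+a2) + (a1+a3) comes from the two extracts at the end.
 */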
446
447 vecres = a;
448 length = type.length / 2;
449 while (length > 1) {
450 LLVMValueRef vec1, vec2;
451 for (i = 0; i < length; i++) {
452 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
453 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
454 }
455 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
456 LLVMConstVector(shuffles1, length), "");
457 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
458 LLVMConstVector(shuffles2, length), "");
459 if (type.floating) {
460 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
461 }
462 else {
463 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
464 }
465 length = length >> 1;
466 }
467
468 /* always have vector of size 2 here */
469 assert(length == 1);
470
471 index = lp_build_const_int32(bld->gallivm, 0);
472 res = LLVMBuildExtractElement(builder, vecres, index, "");
473 index = lp_build_const_int32(bld->gallivm, 1);
474 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
475
476 if (type.floating)
477 res = LLVMBuildFAdd(builder, res, elem2, "");
478 else
479 res = LLVMBuildAdd(builder, res, elem2, "");
480
481 return res;
482 }
483
484 /**
485 * Return the horizontal sums of 4 float vectors as a float4 vector.
486 * This uses the technique as outlined in Intel Optimization Manual.
487 */
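/*
 * Data flow for inputs x, y, z, w (each a 4-wide float vector):
 *   tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
 *   tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
 *   sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *   sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 *   shuftmp[0] + shuftmp[1] = {sum(x), sum(y), sum(z), sum(w)}
 */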
488 static LLVMValueRef
489 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
490 LLVMValueRef src[4])
491 {
492 struct gallivm_state *gallivm = bld->gallivm;
493 LLVMBuilderRef builder = gallivm->builder;
494 LLVMValueRef shuffles[4];
495 LLVMValueRef tmp[4];
496 LLVMValueRef sumtmp[2], shuftmp[2];
497
498 /* lower half of regs */
499 shuffles[0] = lp_build_const_int32(gallivm, 0);
500 shuffles[1] = lp_build_const_int32(gallivm, 1);
501 shuffles[2] = lp_build_const_int32(gallivm, 4);
502 shuffles[3] = lp_build_const_int32(gallivm, 5);
503 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
504 LLVMConstVector(shuffles, 4), "");
505 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
506 LLVMConstVector(shuffles, 4), "");
507
508 /* upper half of regs */
509 shuffles[0] = lp_build_const_int32(gallivm, 2);
510 shuffles[1] = lp_build_const_int32(gallivm, 3);
511 shuffles[2] = lp_build_const_int32(gallivm, 6);
512 shuffles[3] = lp_build_const_int32(gallivm, 7);
513 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
514 LLVMConstVector(shuffles, 4), "");
515 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
516 LLVMConstVector(shuffles, 4), "");
517
518 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
519 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
520
521 shuffles[0] = lp_build_const_int32(gallivm, 0);
522 shuffles[1] = lp_build_const_int32(gallivm, 2);
523 shuffles[2] = lp_build_const_int32(gallivm, 4);
524 shuffles[3] = lp_build_const_int32(gallivm, 6);
525 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
526 LLVMConstVector(shuffles, 4), "");
527
528 shuffles[0] = lp_build_const_int32(gallivm, 1);
529 shuffles[1] = lp_build_const_int32(gallivm, 3);
530 shuffles[2] = lp_build_const_int32(gallivm, 5);
531 shuffles[3] = lp_build_const_int32(gallivm, 7);
532 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
533 LLVMConstVector(shuffles, 4), "");
534
535 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
536 }
537
538
539 /*
540 * partially horizontally add 2-4 float vectors with length nx4,
541 * i.e. only four adjacent values in each vector will be added,
542 * assuming values are really grouped in 4 which also determines
543 * output order.
544 *
545 * Return a vector of the same length as the initial vectors,
546 * with the excess elements (if any) being undefined.
547 * The element order is independent of number of input vectors.
548 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
549 * the output order thus will be
550 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
551 */
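/*
 * Illustrative call (hypothetical names), for num_vecs == 2 with 4-wide
 * vectors x = x0x1x2x3 and y = y0y1y2y3:
 *
 *   LLVMValueRef vecs[2] = { x, y };
 *   LLVMValueRef sums = lp_build_hadd_partial4(&bld, vecs, 2);
 *
 * yields { x0+x1+x2+x3, y0+y1+y2+y3, undef, undef }.
 */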
552 LLVMValueRef
553 lp_build_hadd_partial4(struct lp_build_context *bld,
554 LLVMValueRef vectors[],
555 unsigned num_vecs)
556 {
557 struct gallivm_state *gallivm = bld->gallivm;
558 LLVMBuilderRef builder = gallivm->builder;
559 LLVMValueRef ret_vec;
560 LLVMValueRef tmp[4];
561 const char *intrinsic = NULL;
562
563 assert(num_vecs >= 2 && num_vecs <= 4);
564 assert(bld->type.floating);
565
566 /* only use this with at least 2 vectors, as it is sort of expensive
567 * (depending on cpu) and we always need two horizontal adds anyway,
568 * so a shuffle/add approach might be better.
569 */
570
571 tmp[0] = vectors[0];
572 tmp[1] = vectors[1];
573
574 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
575 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
576
577 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
578 bld->type.length == 4) {
579 intrinsic = "llvm.x86.sse3.hadd.ps";
580 }
581 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
582 bld->type.length == 8) {
583 intrinsic = "llvm.x86.avx.hadd.ps.256";
584 }
585 if (intrinsic) {
586 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
587 lp_build_vec_type(gallivm, bld->type),
588 tmp[0], tmp[1]);
589 if (num_vecs > 2) {
590 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
591 lp_build_vec_type(gallivm, bld->type),
592 tmp[2], tmp[3]);
593 }
594 else {
595 tmp[1] = tmp[0];
596 }
597 return lp_build_intrinsic_binary(builder, intrinsic,
598 lp_build_vec_type(gallivm, bld->type),
599 tmp[0], tmp[1]);
600 }
601
602 if (bld->type.length == 4) {
603 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
604 }
605 else {
606 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
607 unsigned j;
608 unsigned num_iter = bld->type.length / 4;
609 struct lp_type parttype = bld->type;
610 parttype.length = 4;
611 for (j = 0; j < num_iter; j++) {
612 LLVMValueRef partsrc[4];
613 unsigned i;
614 for (i = 0; i < 4; i++) {
615 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
616 }
617 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
618 }
619 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
620 }
621 return ret_vec;
622 }
623
624 /**
625 * Generate a - b
626 */
627 LLVMValueRef
628 lp_build_sub(struct lp_build_context *bld,
629 LLVMValueRef a,
630 LLVMValueRef b)
631 {
632 LLVMBuilderRef builder = bld->gallivm->builder;
633 const struct lp_type type = bld->type;
634 LLVMValueRef res;
635
636 assert(lp_check_value(type, a));
637 assert(lp_check_value(type, b));
638
639 if(b == bld->zero)
640 return a;
641 if(a == bld->undef || b == bld->undef)
642 return bld->undef;
643 if(a == b)
644 return bld->zero;
645
646 if(bld->type.norm) {
647 const char *intrinsic = NULL;
648
649 if(b == bld->one)
650 return bld->zero;
651
652 if (type.width * type.length == 128 &&
653 !type.floating && !type.fixed) {
654 if (util_cpu_caps.has_sse2) {
655 if(type.width == 8)
656 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
657 if(type.width == 16)
658 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
659 } else if (util_cpu_caps.has_altivec) {
660 if(type.width == 8)
661 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
662 if(type.width == 16)
663 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
664 }
665 }
666
667 if(intrinsic)
668 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
669 }
670
671 /* TODO: handle signed case */
672 if(type.norm && !type.floating && !type.fixed && !type.sign)
673 a = lp_build_max_simple(bld, a, b);
674
675 if(LLVMIsConstant(a) && LLVMIsConstant(b))
676 if (type.floating)
677 res = LLVMConstFSub(a, b);
678 else
679 res = LLVMConstSub(a, b);
680 else
681 if (type.floating)
682 res = LLVMBuildFSub(builder, a, b, "");
683 else
684 res = LLVMBuildSub(builder, a, b, "");
685
686 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
687 res = lp_build_max_simple(bld, res, bld->zero);
688
689 return res;
690 }
691
692
693
694 /**
695 * Normalized multiplication.
696 *
697 * There are several approaches for (using 8-bit normalized multiplication as
698 * an example):
699 *
700 * - alpha plus one
701 *
702 * makes the following approximation to the division (Sree)
703 *
704 * a*b/255 ~= (a*(b + 1)) >> 8
705 *
706 * which is the fastest method that satisfies the following OpenGL criteria of
707 *
708 * 0*0 = 0 and 255*255 = 255
709 *
710 * - geometric series
711 *
712 * takes the geometric series approximation to the division
713 *
714 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
715 *
716 * in this case just the first two terms to fit in 16bit arithmetic
717 *
718 * t/255 ~= (t + (t >> 8)) >> 8
719 *
720 * note that just by itself it doesn't satisfy the OpenGL criteria, as
721 * 255*255 = 254, so the special case b = 255 must be accounted for, or
722 * roundoff must be used.
723 *
724 * - geometric series plus rounding
725 *
726 * when using a geometric series division, instead of truncating the result
727 * use roundoff in the approximation (Jim Blinn)
728 *
729 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
730 *
731 * achieving exact results.
732 *
733 *
734 *
735 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
736 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
737 * @sa Michael Herf, The "double blend trick", May 2000,
738 * http://www.stereopsis.com/doubleblend.html
739 */
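/*
 * Worked 8-bit example of the geometric series plus rounding variant
 * implemented below (n = 8):
 *
 *   a = b = 255:  t = 65025, (t + (t >> 8) + 0x80) >> 8
 *                          = (65025 + 254 + 128) >> 8 = 255   (exact)
 *   a = b = 128:  t = 16384, (16384 + 64 + 128) >> 8 = 64
 *                 (16384/255 = 64.25, correctly rounded)
 */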
740 static LLVMValueRef
741 lp_build_mul_norm(struct gallivm_state *gallivm,
742 struct lp_type wide_type,
743 LLVMValueRef a, LLVMValueRef b)
744 {
745 LLVMBuilderRef builder = gallivm->builder;
746 struct lp_build_context bld;
747 unsigned n;
748 LLVMValueRef half;
749 LLVMValueRef ab;
750
751 assert(!wide_type.floating);
752 assert(lp_check_value(wide_type, a));
753 assert(lp_check_value(wide_type, b));
754
755 lp_build_context_init(&bld, gallivm, wide_type);
756
757 n = wide_type.width / 2;
758 if (wide_type.sign) {
759 --n;
760 }
761
762 /*
763 * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
764 * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
765 */
766
767 /*
768 * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
769 */
770
771 ab = LLVMBuildMul(builder, a, b, "");
772 ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
773
774 /*
775 * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
776 */
777
778 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
779 if (wide_type.sign) {
780 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
781 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
782 half = lp_build_select(&bld, sign, minus_half, half);
783 }
784 ab = LLVMBuildAdd(builder, ab, half, "");
785
786 /* Final division */
787 ab = lp_build_shr_imm(&bld, ab, n);
788
789 return ab;
790 }
791
792 /**
793 * Generate a * b
794 */
795 LLVMValueRef
796 lp_build_mul(struct lp_build_context *bld,
797 LLVMValueRef a,
798 LLVMValueRef b)
799 {
800 LLVMBuilderRef builder = bld->gallivm->builder;
801 const struct lp_type type = bld->type;
802 LLVMValueRef shift;
803 LLVMValueRef res;
804
805 assert(lp_check_value(type, a));
806 assert(lp_check_value(type, b));
807
808 if(a == bld->zero)
809 return bld->zero;
810 if(a == bld->one)
811 return b;
812 if(b == bld->zero)
813 return bld->zero;
814 if(b == bld->one)
815 return a;
816 if(a == bld->undef || b == bld->undef)
817 return bld->undef;
818
819 if (!type.floating && !type.fixed && type.norm) {
820 struct lp_type wide_type = lp_wider_type(type);
821 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
822
823 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
824 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
825
826 /* PMULLW, PSRLW, PADDW */
827 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
828 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
829
830 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
831
832 return ab;
833 }
834
835 if(type.fixed)
836 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
837 else
838 shift = NULL;
839
840 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
841 if (type.floating)
842 res = LLVMConstFMul(a, b);
843 else
844 res = LLVMConstMul(a, b);
845 if(shift) {
846 if(type.sign)
847 res = LLVMConstAShr(res, shift);
848 else
849 res = LLVMConstLShr(res, shift);
850 }
851 }
852 else {
853 if (type.floating)
854 res = LLVMBuildFMul(builder, a, b, "");
855 else
856 res = LLVMBuildMul(builder, a, b, "");
857 if(shift) {
858 if(type.sign)
859 res = LLVMBuildAShr(builder, res, shift, "");
860 else
861 res = LLVMBuildLShr(builder, res, shift, "");
862 }
863 }
864
865 return res;
866 }
867
868
869 /**
870 * Small vector x scale multiplication optimization.
871 */
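/*
 * For integer types the power-of-two case below becomes a plain shift,
 * e.g. lp_build_mul_imm(bld, a, 8) (hypothetical arguments) emits a
 * left shift by 3, since ffs(8) - 1 == 3.
 */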
872 LLVMValueRef
873 lp_build_mul_imm(struct lp_build_context *bld,
874 LLVMValueRef a,
875 int b)
876 {
877 LLVMBuilderRef builder = bld->gallivm->builder;
878 LLVMValueRef factor;
879
880 assert(lp_check_value(bld->type, a));
881
882 if(b == 0)
883 return bld->zero;
884
885 if(b == 1)
886 return a;
887
888 if(b == -1)
889 return lp_build_negate(bld, a);
890
891 if(b == 2 && bld->type.floating)
892 return lp_build_add(bld, a, a);
893
894 if(util_is_power_of_two(b)) {
895 unsigned shift = ffs(b) - 1;
896
897 if(bld->type.floating) {
898 #if 0
899 /*
900 * Power of two multiplication by directly manipulating the exponent.
901 *
902 * XXX: This might not always be faster; it will introduce a small error
903 * for multiplication by zero, and it will produce wrong results
904 * for Inf and NaN.
905 */
906 unsigned mantissa = lp_mantissa(bld->type);
907 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
908 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
909 a = LLVMBuildAdd(builder, a, factor, "");
910 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
911 return a;
912 #endif
913 }
914 else {
915 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
916 return LLVMBuildShl(builder, a, factor, "");
917 }
918 }
919
920 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
921 return lp_build_mul(bld, a, factor);
922 }
923
924
925 /**
926 * Generate a / b
927 */
928 LLVMValueRef
929 lp_build_div(struct lp_build_context *bld,
930 LLVMValueRef a,
931 LLVMValueRef b)
932 {
933 LLVMBuilderRef builder = bld->gallivm->builder;
934 const struct lp_type type = bld->type;
935
936 assert(lp_check_value(type, a));
937 assert(lp_check_value(type, b));
938
939 if(a == bld->zero)
940 return bld->zero;
941 if(a == bld->one)
942 return lp_build_rcp(bld, b);
943 if(b == bld->zero)
944 return bld->undef;
945 if(b == bld->one)
946 return a;
947 if(a == bld->undef || b == bld->undef)
948 return bld->undef;
949
950 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
951 if (type.floating)
952 return LLVMConstFDiv(a, b);
953 else if (type.sign)
954 return LLVMConstSDiv(a, b);
955 else
956 return LLVMConstUDiv(a, b);
957 }
958
959 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
960 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
961 type.floating)
962 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
963
964 if (type.floating)
965 return LLVMBuildFDiv(builder, a, b, "");
966 else if (type.sign)
967 return LLVMBuildSDiv(builder, a, b, "");
968 else
969 return LLVMBuildUDiv(builder, a, b, "");
970 }
971
972
973 /**
974 * Linear interpolation helper.
975 *
976 * @param normalized whether we are interpolating normalized values,
977 * encoded in normalized integers, twice as wide.
978 *
979 * @sa http://www.stereopsis.com/doubleblend.html
980 */
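/*
 * Worked example of the unsigned LP_BLD_LERP_WIDE_NORMALIZED path, with
 * 8-bit values held in 16-bit lanes (half_width = 8):
 *
 *   v0 = 0, v1 = 255, x = 128:
 *     x     -> 128 + (128 >> 7) = 129     (rescale [0,255] to [0,256])
 *     delta -> 255
 *     res   -> (129 * 255) >> 8 = 128,  then v0 + res = 128
 */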
981 static INLINE LLVMValueRef
982 lp_build_lerp_simple(struct lp_build_context *bld,
983 LLVMValueRef x,
984 LLVMValueRef v0,
985 LLVMValueRef v1,
986 unsigned flags)
987 {
988 unsigned half_width = bld->type.width/2;
989 LLVMBuilderRef builder = bld->gallivm->builder;
990 LLVMValueRef delta;
991 LLVMValueRef res;
992
993 assert(lp_check_value(bld->type, x));
994 assert(lp_check_value(bld->type, v0));
995 assert(lp_check_value(bld->type, v1));
996
997 delta = lp_build_sub(bld, v1, v0);
998
999 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1000 if (!bld->type.sign) {
1001 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1002 /*
1003 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1004 * most-significant-bit to the least-significant-bit, so that
1005 * later we can just divide by 2**n instead of 2**n - 1.
1006 */
1007
1008 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1009 }
1010
1011 /* (x * delta) >> n */
1012 res = lp_build_mul(bld, x, delta);
1013 res = lp_build_shr_imm(bld, res, half_width);
1014 } else {
1015 /*
1016 * The rescaling trick above doesn't work for signed numbers, so
1017 * use the 2**n - 1 division approximation in lp_build_mul_norm
1018 * instead.
1019 */
1020 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1021 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1022 }
1023 } else {
1024 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1025 res = lp_build_mul(bld, x, delta);
1026 }
1027
1028 res = lp_build_add(bld, v0, res);
1029
1030 if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
1031 bld->type.fixed) {
1032 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1033 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1034 * but it will be wrong for true fixed point use cases. Basically we need
1035 * a more powerful lp_type, capable of further distinguishing the values
1036 * interpretation from the value storage. */
1037 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1038 }
1039
1040 return res;
1041 }
1042
1043
1044 /**
1045 * Linear interpolation.
1046 */
1047 LLVMValueRef
1048 lp_build_lerp(struct lp_build_context *bld,
1049 LLVMValueRef x,
1050 LLVMValueRef v0,
1051 LLVMValueRef v1,
1052 unsigned flags)
1053 {
1054 const struct lp_type type = bld->type;
1055 LLVMValueRef res;
1056
1057 assert(lp_check_value(type, x));
1058 assert(lp_check_value(type, v0));
1059 assert(lp_check_value(type, v1));
1060
1061 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1062
1063 if (type.norm) {
1064 struct lp_type wide_type;
1065 struct lp_build_context wide_bld;
1066 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1067
1068 assert(type.length >= 2);
1069
1070 /*
1071 * Create a wider integer type, enough to hold the
1072 * intermediate result of the multiplication.
1073 */
1074 memset(&wide_type, 0, sizeof wide_type);
1075 wide_type.sign = type.sign;
1076 wide_type.width = type.width*2;
1077 wide_type.length = type.length/2;
1078
1079 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1080
1081 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1082 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1083 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1084
1085 /*
1086 * Lerp both halves.
1087 */
1088
1089 flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1090
1091 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1092 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1093
1094 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1095 } else {
1096 res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1097 }
1098
1099 return res;
1100 }
1101
1102
1103 /**
1104 * Bilinear interpolation.
1105 *
1106 * Value indices are in v_{yx}.
1107 */
1108 LLVMValueRef
1109 lp_build_lerp_2d(struct lp_build_context *bld,
1110 LLVMValueRef x,
1111 LLVMValueRef y,
1112 LLVMValueRef v00,
1113 LLVMValueRef v01,
1114 LLVMValueRef v10,
1115 LLVMValueRef v11,
1116 unsigned flags)
1117 {
1118 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1119 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1120 return lp_build_lerp(bld, y, v0, v1, flags);
1121 }
1122
1123
1124 LLVMValueRef
1125 lp_build_lerp_3d(struct lp_build_context *bld,
1126 LLVMValueRef x,
1127 LLVMValueRef y,
1128 LLVMValueRef z,
1129 LLVMValueRef v000,
1130 LLVMValueRef v001,
1131 LLVMValueRef v010,
1132 LLVMValueRef v011,
1133 LLVMValueRef v100,
1134 LLVMValueRef v101,
1135 LLVMValueRef v110,
1136 LLVMValueRef v111,
1137 unsigned flags)
1138 {
1139 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1140 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1141 return lp_build_lerp(bld, z, v0, v1, flags);
1142 }
1143
1144
1145 /**
1146 * Generate min(a, b)
1147 * Do checks for special cases.
1148 */
1149 LLVMValueRef
1150 lp_build_min(struct lp_build_context *bld,
1151 LLVMValueRef a,
1152 LLVMValueRef b)
1153 {
1154 assert(lp_check_value(bld->type, a));
1155 assert(lp_check_value(bld->type, b));
1156
1157 if(a == bld->undef || b == bld->undef)
1158 return bld->undef;
1159
1160 if(a == b)
1161 return a;
1162
1163 if (bld->type.norm) {
1164 if (!bld->type.sign) {
1165 if (a == bld->zero || b == bld->zero) {
1166 return bld->zero;
1167 }
1168 }
1169 if(a == bld->one)
1170 return b;
1171 if(b == bld->one)
1172 return a;
1173 }
1174
1175 return lp_build_min_simple(bld, a, b);
1176 }
1177
1178
1179 /**
1180 * Generate max(a, b)
1181 * Do checks for special cases.
1182 */
1183 LLVMValueRef
1184 lp_build_max(struct lp_build_context *bld,
1185 LLVMValueRef a,
1186 LLVMValueRef b)
1187 {
1188 assert(lp_check_value(bld->type, a));
1189 assert(lp_check_value(bld->type, b));
1190
1191 if(a == bld->undef || b == bld->undef)
1192 return bld->undef;
1193
1194 if(a == b)
1195 return a;
1196
1197 if(bld->type.norm) {
1198 if(a == bld->one || b == bld->one)
1199 return bld->one;
1200 if (!bld->type.sign) {
1201 if (a == bld->zero) {
1202 return b;
1203 }
1204 if (b == bld->zero) {
1205 return a;
1206 }
1207 }
1208 }
1209
1210 return lp_build_max_simple(bld, a, b);
1211 }
1212
1213
1214 /**
1215 * Generate clamp(a, min, max)
1216 * Do checks for special cases.
1217 */
1218 LLVMValueRef
1219 lp_build_clamp(struct lp_build_context *bld,
1220 LLVMValueRef a,
1221 LLVMValueRef min,
1222 LLVMValueRef max)
1223 {
1224 assert(lp_check_value(bld->type, a));
1225 assert(lp_check_value(bld->type, min));
1226 assert(lp_check_value(bld->type, max));
1227
1228 a = lp_build_min(bld, a, max);
1229 a = lp_build_max(bld, a, min);
1230 return a;
1231 }
1232
1233
1234 /**
1235 * Generate abs(a)
1236 */
1237 LLVMValueRef
1238 lp_build_abs(struct lp_build_context *bld,
1239 LLVMValueRef a)
1240 {
1241 LLVMBuilderRef builder = bld->gallivm->builder;
1242 const struct lp_type type = bld->type;
1243 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1244
1245 assert(lp_check_value(type, a));
1246
1247 if(!type.sign)
1248 return a;
1249
1250 if(type.floating) {
1251 /* Mask out the sign bit */
1252 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1253 unsigned long long absMask = ~(1ULL << (type.width - 1));
1254 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1255 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1256 a = LLVMBuildAnd(builder, a, mask, "");
1257 a = LLVMBuildBitCast(builder, a, vec_type, "");
1258 return a;
1259 }
1260
1261 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1262 switch(type.width) {
1263 case 8:
1264 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1265 case 16:
1266 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1267 case 32:
1268 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1269 }
1270 }
1271 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1272 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1273 (type.width == 8 || type.width == 16 || type.width == 32)) {
1274 debug_printf("%s: inefficient code, should split vectors manually\n",
1275 __FUNCTION__);
1276 }
1277
1278 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1279 }
1280
1281
1282 LLVMValueRef
1283 lp_build_negate(struct lp_build_context *bld,
1284 LLVMValueRef a)
1285 {
1286 LLVMBuilderRef builder = bld->gallivm->builder;
1287
1288 assert(lp_check_value(bld->type, a));
1289
1290 #if HAVE_LLVM >= 0x0207
1291 if (bld->type.floating)
1292 a = LLVMBuildFNeg(builder, a, "");
1293 else
1294 #endif
1295 a = LLVMBuildNeg(builder, a, "");
1296
1297 return a;
1298 }
1299
1300
1301 /** Return -1, 0 or +1 depending on the sign of a */
1302 LLVMValueRef
1303 lp_build_sgn(struct lp_build_context *bld,
1304 LLVMValueRef a)
1305 {
1306 LLVMBuilderRef builder = bld->gallivm->builder;
1307 const struct lp_type type = bld->type;
1308 LLVMValueRef cond;
1309 LLVMValueRef res;
1310
1311 assert(lp_check_value(type, a));
1312
1313 /* Handle non-zero case */
1314 if(!type.sign) {
1315 /* if not zero then sign must be positive */
1316 res = bld->one;
1317 }
1318 else if(type.floating) {
1319 LLVMTypeRef vec_type;
1320 LLVMTypeRef int_type;
1321 LLVMValueRef mask;
1322 LLVMValueRef sign;
1323 LLVMValueRef one;
1324 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1325
1326 int_type = lp_build_int_vec_type(bld->gallivm, type);
1327 vec_type = lp_build_vec_type(bld->gallivm, type);
1328 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1329
1330 /* Take the sign bit and OR it into the constant 1.0 */
1331 sign = LLVMBuildBitCast(builder, a, int_type, "");
1332 sign = LLVMBuildAnd(builder, sign, mask, "");
1333 one = LLVMConstBitCast(bld->one, int_type);
1334 res = LLVMBuildOr(builder, sign, one, "");
1335 res = LLVMBuildBitCast(builder, res, vec_type, "");
1336 }
1337 else
1338 {
1339 /* signed int/norm/fixed point */
1340 /* could use psign with sse3 and appropriate vectors here */
1341 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1342 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1343 res = lp_build_select(bld, cond, bld->one, minus_one);
1344 }
1345
1346 /* Handle zero */
1347 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1348 res = lp_build_select(bld, cond, bld->zero, res);
1349
1350 return res;
1351 }
1352
1353
1354 /**
1355 * Set the sign of float vector 'a' according to 'sign'.
1356 * If sign==0, return abs(a).
1357 * If sign==1, return -abs(a);
1358 * Other values for sign produce undefined results.
1359 */
1360 LLVMValueRef
1361 lp_build_set_sign(struct lp_build_context *bld,
1362 LLVMValueRef a, LLVMValueRef sign)
1363 {
1364 LLVMBuilderRef builder = bld->gallivm->builder;
1365 const struct lp_type type = bld->type;
1366 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1367 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1368 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1369 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1370 ~((unsigned long long) 1 << (type.width - 1)));
1371 LLVMValueRef val, res;
1372
1373 assert(type.floating);
1374 assert(lp_check_value(type, a));
1375
1376 /* val = reinterpret_cast<int>(a) */
1377 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1378 /* val = val & mask */
1379 val = LLVMBuildAnd(builder, val, mask, "");
1380 /* sign = sign << shift */
1381 sign = LLVMBuildShl(builder, sign, shift, "");
1382 /* res = val | sign */
1383 res = LLVMBuildOr(builder, val, sign, "");
1384 /* res = reinterpret_cast<float>(res) */
1385 res = LLVMBuildBitCast(builder, res, vec_type, "");
1386
1387 return res;
1388 }
1389
1390
1391 /**
1392 * Convert vector of (or scalar) int to vector of (or scalar) float.
1393 */
1394 LLVMValueRef
1395 lp_build_int_to_float(struct lp_build_context *bld,
1396 LLVMValueRef a)
1397 {
1398 LLVMBuilderRef builder = bld->gallivm->builder;
1399 const struct lp_type type = bld->type;
1400 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1401
1402 assert(type.floating);
1403
1404 return LLVMBuildSIToFP(builder, a, vec_type, "");
1405 }
1406
1407 static boolean
1408 arch_rounding_available(const struct lp_type type)
1409 {
1410 if ((util_cpu_caps.has_sse4_1 &&
1411 (type.length == 1 || type.width*type.length == 128)) ||
1412 (util_cpu_caps.has_avx && type.width*type.length == 256))
1413 return TRUE;
1414 else if ((util_cpu_caps.has_altivec &&
1415 (type.width == 32 && type.length == 4)))
1416 return TRUE;
1417
1418 return FALSE;
1419 }
1420
1421 enum lp_build_round_mode
1422 {
1423 LP_BUILD_ROUND_NEAREST = 0,
1424 LP_BUILD_ROUND_FLOOR = 1,
1425 LP_BUILD_ROUND_CEIL = 2,
1426 LP_BUILD_ROUND_TRUNCATE = 3
1427 };
1428
1429 /**
1430 * Helper for SSE4.1's ROUNDxx instructions.
1431 *
1432 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1433 * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0.
1434 */
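/*
 * E.g. with LP_BUILD_ROUND_NEAREST: 0.5 -> 0.0, 1.5 -> 2.0, 2.5 -> 2.0,
 * 3.5 -> 4.0 (round half to even).
 */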
1435 static INLINE LLVMValueRef
1436 lp_build_round_sse41(struct lp_build_context *bld,
1437 LLVMValueRef a,
1438 enum lp_build_round_mode mode)
1439 {
1440 LLVMBuilderRef builder = bld->gallivm->builder;
1441 const struct lp_type type = bld->type;
1442 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1443 const char *intrinsic;
1444 LLVMValueRef res;
1445
1446 assert(type.floating);
1447
1448 assert(lp_check_value(type, a));
1449 assert(util_cpu_caps.has_sse4_1);
1450
1451 if (type.length == 1) {
1452 LLVMTypeRef vec_type;
1453 LLVMValueRef undef;
1454 LLVMValueRef args[3];
1455 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1456
1457 switch(type.width) {
1458 case 32:
1459 intrinsic = "llvm.x86.sse41.round.ss";
1460 break;
1461 case 64:
1462 intrinsic = "llvm.x86.sse41.round.sd";
1463 break;
1464 default:
1465 assert(0);
1466 return bld->undef;
1467 }
1468
1469 vec_type = LLVMVectorType(bld->elem_type, 4);
1470
1471 undef = LLVMGetUndef(vec_type);
1472
1473 args[0] = undef;
1474 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1475 args[2] = LLVMConstInt(i32t, mode, 0);
1476
1477 res = lp_build_intrinsic(builder, intrinsic,
1478 vec_type, args, Elements(args));
1479
1480 res = LLVMBuildExtractElement(builder, res, index0, "");
1481 }
1482 else {
1483 if (type.width * type.length == 128) {
1484 switch(type.width) {
1485 case 32:
1486 intrinsic = "llvm.x86.sse41.round.ps";
1487 break;
1488 case 64:
1489 intrinsic = "llvm.x86.sse41.round.pd";
1490 break;
1491 default:
1492 assert(0);
1493 return bld->undef;
1494 }
1495 }
1496 else {
1497 assert(type.width * type.length == 256);
1498 assert(util_cpu_caps.has_avx);
1499
1500 switch(type.width) {
1501 case 32:
1502 intrinsic = "llvm.x86.avx.round.ps.256";
1503 break;
1504 case 64:
1505 intrinsic = "llvm.x86.avx.round.pd.256";
1506 break;
1507 default:
1508 assert(0);
1509 return bld->undef;
1510 }
1511 }
1512
1513 res = lp_build_intrinsic_binary(builder, intrinsic,
1514 bld->vec_type, a,
1515 LLVMConstInt(i32t, mode, 0));
1516 }
1517
1518 return res;
1519 }
1520
1521
1522 static INLINE LLVMValueRef
1523 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1524 LLVMValueRef a)
1525 {
1526 LLVMBuilderRef builder = bld->gallivm->builder;
1527 const struct lp_type type = bld->type;
1528 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1529 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1530 const char *intrinsic;
1531 LLVMValueRef res;
1532
1533 assert(type.floating);
1534 /* using the double precision conversions is a bit more complicated */
1535 assert(type.width == 32);
1536
1537 assert(lp_check_value(type, a));
1538 assert(util_cpu_caps.has_sse2);
1539
1540 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1541 if (type.length == 1) {
1542 LLVMTypeRef vec_type;
1543 LLVMValueRef undef;
1544 LLVMValueRef arg;
1545 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1546
1547 vec_type = LLVMVectorType(bld->elem_type, 4);
1548
1549 intrinsic = "llvm.x86.sse.cvtss2si";
1550
1551 undef = LLVMGetUndef(vec_type);
1552
1553 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1554
1555 res = lp_build_intrinsic_unary(builder, intrinsic,
1556 ret_type, arg);
1557 }
1558 else {
1559 if (type.width* type.length == 128) {
1560 intrinsic = "llvm.x86.sse2.cvtps2dq";
1561 }
1562 else {
1563 assert(type.width*type.length == 256);
1564 assert(util_cpu_caps.has_avx);
1565
1566 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1567 }
1568 res = lp_build_intrinsic_unary(builder, intrinsic,
1569 ret_type, a);
1570 }
1571
1572 return res;
1573 }
1574
1575
1576 /*
1577 */
1578 static INLINE LLVMValueRef
1579 lp_build_round_altivec(struct lp_build_context *bld,
1580 LLVMValueRef a,
1581 enum lp_build_round_mode mode)
1582 {
1583 LLVMBuilderRef builder = bld->gallivm->builder;
1584 const struct lp_type type = bld->type;
1585 const char *intrinsic = NULL;
1586
1587 assert(type.floating);
1588
1589 assert(lp_check_value(type, a));
1590 assert(util_cpu_caps.has_altivec);
1591
1592 switch (mode) {
1593 case LP_BUILD_ROUND_NEAREST:
1594 intrinsic = "llvm.ppc.altivec.vrfin";
1595 break;
1596 case LP_BUILD_ROUND_FLOOR:
1597 intrinsic = "llvm.ppc.altivec.vrfim";
1598 break;
1599 case LP_BUILD_ROUND_CEIL:
1600 intrinsic = "llvm.ppc.altivec.vrfip";
1601 break;
1602 case LP_BUILD_ROUND_TRUNCATE:
1603 intrinsic = "llvm.ppc.altivec.vrfiz";
1604 break;
1605 }
1606
1607 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1608 }
1609
1610 static INLINE LLVMValueRef
1611 lp_build_round_arch(struct lp_build_context *bld,
1612 LLVMValueRef a,
1613 enum lp_build_round_mode mode)
1614 {
1615 if (util_cpu_caps.has_sse4_1)
1616 return lp_build_round_sse41(bld, a, mode);
1617 else /* (util_cpu_caps.has_altivec) */
1618 return lp_build_round_altivec(bld, a, mode);
1619 }
1620
1621 /**
1622 * Return the integer part of a float (vector) value (== round toward zero).
1623 * The returned value is a float (vector).
1624 * Ex: trunc(-1.5) = -1.0
1625 */
1626 LLVMValueRef
1627 lp_build_trunc(struct lp_build_context *bld,
1628 LLVMValueRef a)
1629 {
1630 LLVMBuilderRef builder = bld->gallivm->builder;
1631 const struct lp_type type = bld->type;
1632
1633 assert(type.floating);
1634 assert(lp_check_value(type, a));
1635
1636 if (arch_rounding_available(type)) {
1637 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1638 }
1639 else {
1640 const struct lp_type type = bld->type;
1641 struct lp_type inttype;
1642 struct lp_build_context intbld;
1643 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1644 LLVMValueRef trunc, res, anosign, mask;
1645 LLVMTypeRef int_vec_type = bld->int_vec_type;
1646 LLVMTypeRef vec_type = bld->vec_type;
1647
1648 assert(type.width == 32); /* might want to handle doubles at some point */
1649
1650 inttype = type;
1651 inttype.floating = 0;
1652 lp_build_context_init(&intbld, bld->gallivm, inttype);
1653
1654 /* round by truncation */
1655 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1656 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1657
1658 /* mask out sign bit */
1659 anosign = lp_build_abs(bld, a);
1660 /*
1661 * mask out all values if anosign > 2^24
1662 * This should work both for large ints (all rounding is no-op for them
1663 * because such floats are always exact) as well as special cases like
1664 * NaNs, Infs (taking advantage of the fact they use max exponent).
1665 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1666 */
1667 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1668 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1669 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1670 return lp_build_select(bld, mask, a, res);
1671 }
1672 }
1673
1674
1675 /**
1676 * Return float (vector) rounded to nearest integer (vector). The returned
1677 * value is a float (vector).
1678 * Ex: round(0.9) = 1.0
1679 * Ex: round(-1.5) = -2.0
1680 */
1681 LLVMValueRef
1682 lp_build_round(struct lp_build_context *bld,
1683 LLVMValueRef a)
1684 {
1685 LLVMBuilderRef builder = bld->gallivm->builder;
1686 const struct lp_type type = bld->type;
1687
1688 assert(type.floating);
1689 assert(lp_check_value(type, a));
1690
1691 if (arch_rounding_available(type)) {
1692 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1693 }
1694 else {
1695 const struct lp_type type = bld->type;
1696 struct lp_type inttype;
1697 struct lp_build_context intbld;
1698 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1699 LLVMValueRef res, anosign, mask;
1700 LLVMTypeRef int_vec_type = bld->int_vec_type;
1701 LLVMTypeRef vec_type = bld->vec_type;
1702
1703 assert(type.width == 32); /* might want to handle doubles at some point */
1704
1705 inttype = type;
1706 inttype.floating = 0;
1707 lp_build_context_init(&intbld, bld->gallivm, inttype);
1708
1709 res = lp_build_iround(bld, a);
1710 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1711
1712 /* mask out sign bit */
1713 anosign = lp_build_abs(bld, a);
1714 /*
1715 * mask out all values if anosign > 2^24
1716 * This should work both for large ints (all rounding is no-op for them
1717 * because such floats are always exact) as well as special cases like
1718 * NaNs, Infs (taking advantage of the fact they use max exponent).
1719 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1720 */
1721 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1722 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1723 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1724 return lp_build_select(bld, mask, a, res);
1725 }
1726 }
1727
1728
1729 /**
1730 * Return floor of float (vector), result is a float (vector)
1731 * Ex: floor(1.1) = 1.0
1732 * Ex: floor(-1.1) = -2.0
1733 */
1734 LLVMValueRef
1735 lp_build_floor(struct lp_build_context *bld,
1736 LLVMValueRef a)
1737 {
1738 LLVMBuilderRef builder = bld->gallivm->builder;
1739 const struct lp_type type = bld->type;
1740
1741 assert(type.floating);
1742 assert(lp_check_value(type, a));
1743
1744 if (arch_rounding_available(type)) {
1745 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1746 }
1747 else {
1748 const struct lp_type type = bld->type;
1749 struct lp_type inttype;
1750 struct lp_build_context intbld;
1751 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1752 LLVMValueRef trunc, res, anosign, mask;
1753 LLVMTypeRef int_vec_type = bld->int_vec_type;
1754 LLVMTypeRef vec_type = bld->vec_type;
1755
1756 assert(type.width == 32); /* might want to handle doubles at some point */
1757
1758 inttype = type;
1759 inttype.floating = 0;
1760 lp_build_context_init(&intbld, bld->gallivm, inttype);
1761
1762 /* round by truncation */
1763 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1764 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
1765
1766 if (type.sign) {
1767 LLVMValueRef tmp;
1768
1769 /*
1770 * fix values if rounding is wrong (for non-special cases)
1771 * - this is the case if trunc > a
1772 */
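/*
 * E.g. a = -1.1: truncation gives -1.0, which is greater than a, so 1.0
 * is subtracted below, yielding -2.0.  For a = 1.9 truncation gives 1.0,
 * which is not greater than a, so the result stays 1.0.
 */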
1773 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
1774 /* tmp = trunc > a ? 1.0 : 0.0 */
1775 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1776 tmp = lp_build_and(&intbld, mask, tmp);
1777 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1778 res = lp_build_sub(bld, res, tmp);
1779 }
1780
1781 /* mask out sign bit */
1782 anosign = lp_build_abs(bld, a);
1783 /*
1784 * mask out all values if anosign > 2^24
1785 * This should work both for large ints (all rounding is no-op for them
1786 * because such floats are always exact) as well as special cases like
1787 * NaNs, Infs (taking advantage of the fact they use max exponent).
1788 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1789 */
1790 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1791 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1792 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1793 return lp_build_select(bld, mask, a, res);
1794 }
1795 }
1796
1797
1798 /**
1799 * Return ceiling of float (vector), returning float (vector).
1800 * Ex: ceil( 1.1) = 2.0
1801 * Ex: ceil(-1.1) = -1.0
1802 */
1803 LLVMValueRef
1804 lp_build_ceil(struct lp_build_context *bld,
1805 LLVMValueRef a)
1806 {
1807 LLVMBuilderRef builder = bld->gallivm->builder;
1808 const struct lp_type type = bld->type;
1809
1810 assert(type.floating);
1811 assert(lp_check_value(type, a));
1812
1813 if (arch_rounding_available(type)) {
1814 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1815 }
1816 else {
1817 const struct lp_type type = bld->type;
1818 struct lp_type inttype;
1819 struct lp_build_context intbld;
1820 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
1821 LLVMValueRef trunc, res, anosign, mask, tmp;
1822 LLVMTypeRef int_vec_type = bld->int_vec_type;
1823 LLVMTypeRef vec_type = bld->vec_type;
1824
1825 assert(type.width == 32); /* might want to handle doubles at some point */
1826
1827 inttype = type;
1828 inttype.floating = 0;
1829 lp_build_context_init(&intbld, bld->gallivm, inttype);
1830
1831 /* round by truncation */
1832 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1833 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
1834
1835 /*
1836 * fix values if rounding is wrong (for non-special cases)
1837 * - this is the case if trunc < a
1838 */
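/*
 * E.g. a = 1.1: truncation gives 1.0, which is less than a, so 1.0 is
 * added below, yielding 2.0.  For a = -1.1 truncation gives -1.0, which
 * is not less than a, so the result stays -1.0.
 */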
1839 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
1840 /* tmp = trunc < a ? 1.0 : 0.0 */
1841 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
1842 tmp = lp_build_and(&intbld, mask, tmp);
1843 tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
1844 res = lp_build_add(bld, trunc, tmp);
1845
1846 /* mask out sign bit */
1847 anosign = lp_build_abs(bld, a);
1848 /*
1849 * mask out all values if anosign > 2^24
1850 * This should work both for large ints (all rounding is no-op for them
1851 * because such floats are always exact) as well as special cases like
1852 * NaNs, Infs (taking advantage of the fact they use max exponent).
1853 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.)
1854 */
1855 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
1856 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
1857 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
1858 return lp_build_select(bld, mask, a, res);
1859 }
1860 }
1861
1862
1863 /**
1864 * Return fractional part of 'a' computed as a - floor(a)
1865 * Typically used in texture coord arithmetic.
1866 */
1867 LLVMValueRef
1868 lp_build_fract(struct lp_build_context *bld,
1869 LLVMValueRef a)
1870 {
1871 assert(bld->type.floating);
1872 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1873 }
1874
1875
1876 /**
1877 * Prevent returning a fractional part of 1.0 for very small negative values of
1878 * 'a' by clamping against 0.99999(9).
1879 */
1880 static inline LLVMValueRef
1881 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
1882 {
1883 LLVMValueRef max;
1884
1885 /* this is the largest number smaller than 1.0 representable as float */
1886 max = lp_build_const_vec(bld->gallivm, bld->type,
1887 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
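/* E.g. for 32-bit floats lp_mantissa() is 23, so max becomes
 * 1.0 - 2^-24 = 0.99999994, the largest float strictly below 1.0. */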
1888 return lp_build_min(bld, fract, max);
1889 }
1890
1891
1892 /**
1893 * Same as lp_build_fract, but guarantees that the result is always smaller
1894 * than one.
1895 */
1896 LLVMValueRef
1897 lp_build_fract_safe(struct lp_build_context *bld,
1898 LLVMValueRef a)
1899 {
1900 return clamp_fract(bld, lp_build_fract(bld, a));
1901 }
1902
1903
1904 /**
1905 * Return the integer part of a float (vector) value (== round toward zero).
1906 * The returned value is an integer (vector).
1907 * Ex: itrunc(-1.5) = -1
1908 */
1909 LLVMValueRef
1910 lp_build_itrunc(struct lp_build_context *bld,
1911 LLVMValueRef a)
1912 {
1913 LLVMBuilderRef builder = bld->gallivm->builder;
1914 const struct lp_type type = bld->type;
1915 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1916
1917 assert(type.floating);
1918 assert(lp_check_value(type, a));
1919
1920 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1921 }
1922
1923
1924 /**
1925 * Return float (vector) rounded to nearest integer (vector). The returned
1926 * value is an integer (vector).
1927 * Ex: iround(0.9) = 1
1928 * Ex: iround(-1.5) = -2
1929 */
1930 LLVMValueRef
1931 lp_build_iround(struct lp_build_context *bld,
1932 LLVMValueRef a)
1933 {
1934 LLVMBuilderRef builder = bld->gallivm->builder;
1935 const struct lp_type type = bld->type;
1936 LLVMTypeRef int_vec_type = bld->int_vec_type;
1937 LLVMValueRef res;
1938
1939 assert(type.floating);
1940
1941 assert(lp_check_value(type, a));
1942
1943 if ((util_cpu_caps.has_sse2 &&
1944 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
1945 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1946 return lp_build_iround_nearest_sse2(bld, a);
1947 }
1948 if (arch_rounding_available(type)) {
1949 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1950 }
1951 else {
1952 LLVMValueRef half;
1953
1954 half = lp_build_const_vec(bld->gallivm, type, 0.5);
1955
1956 if (type.sign) {
1957 LLVMTypeRef vec_type = bld->vec_type;
1958 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1959 (unsigned long long)1 << (type.width - 1));
1960 LLVMValueRef sign;
1961
1962 /* get sign bit */
1963 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1964 sign = LLVMBuildAnd(builder, sign, mask, "");
1965
1966 /* sign * 0.5 */
1967 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1968 half = LLVMBuildOr(builder, sign, half, "");
1969 half = LLVMBuildBitCast(builder, half, vec_type, "");
1970 }
1971
1972 res = LLVMBuildFAdd(builder, a, half, "");
1973 }
1974
1975 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1976
1977 return res;
1978 }
1979
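/*
 * Editor's note (illustrative addition, not part of the original source):
 * in the fallback above the sign bit of 'a' is OR'ed into the 0.5 constant,
 * so half is +0.5 for non-negative inputs and -0.5 for negative ones, e.g.
 *   a =  0.9 ->  0.9 + 0.5 =  1.4 -> fptosi ->  1
 *   a = -1.5 -> -1.5 - 0.5 = -2.0 -> fptosi -> -2
 * matching the iround(-1.5) = -2 example in the function comment.
 */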
1980
1981 /**
1982 * Return floor of float (vector), result is an int (vector)
1983  * Ex: ifloor(1.1) = 1
1984  * Ex: ifloor(-1.1) = -2
1985 */
1986 LLVMValueRef
1987 lp_build_ifloor(struct lp_build_context *bld,
1988 LLVMValueRef a)
1989 {
1990 LLVMBuilderRef builder = bld->gallivm->builder;
1991 const struct lp_type type = bld->type;
1992 LLVMTypeRef int_vec_type = bld->int_vec_type;
1993 LLVMValueRef res;
1994
1995 assert(type.floating);
1996 assert(lp_check_value(type, a));
1997
1998 res = a;
1999 if (type.sign) {
2000 if (arch_rounding_available(type)) {
2001 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2002 }
2003 else {
2004 struct lp_type inttype;
2005 struct lp_build_context intbld;
2006 LLVMValueRef trunc, itrunc, mask;
2007
2008 assert(type.floating);
2009 assert(lp_check_value(type, a));
2010
2011 inttype = type;
2012 inttype.floating = 0;
2013 lp_build_context_init(&intbld, bld->gallivm, inttype);
2014
2015 /* round by truncation */
2016 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2017 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2018
2019 /*
2020 * fix values if rounding is wrong (for non-special cases)
2021 * - this is the case if trunc > a
2022 * The results of doing this with NaNs, very large values etc.
2023 * are undefined but this seems to be the case anyway.
2024 */
2025 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2026 /* cheapie minus one with mask since the mask is minus one / zero */
2027 return lp_build_add(&intbld, itrunc, mask);
2028 }
2029 }
2030
2031    /* convert to int, rounding toward zero */
2032 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2033
2034 return res;
2035 }
2036
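/*
 * Editor's note (illustrative addition, not part of the original source):
 * the fallback above relies on lp_build_cmp() producing an all-ones integer
 * mask (i.e. -1) where the comparison holds and 0 elsewhere, so adding the
 * mask subtracts one exactly where trunc > a, e.g.
 *   a = -1.1 -> itrunc = -1, trunc (-1.0) > a, mask = -1, result = -2
 *   a =  1.1 -> itrunc =  1, trunc ( 1.0) > a is false, mask = 0, result = 1
 */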
2037
2038 /**
2039 * Return ceiling of float (vector), returning int (vector).
2040 * Ex: iceil( 1.1) = 2
2041 * Ex: iceil(-1.1) = -1
2042 */
2043 LLVMValueRef
2044 lp_build_iceil(struct lp_build_context *bld,
2045 LLVMValueRef a)
2046 {
2047 LLVMBuilderRef builder = bld->gallivm->builder;
2048 const struct lp_type type = bld->type;
2049 LLVMTypeRef int_vec_type = bld->int_vec_type;
2050 LLVMValueRef res;
2051
2052 assert(type.floating);
2053 assert(lp_check_value(type, a));
2054
2055 if (arch_rounding_available(type)) {
2056 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2057 }
2058 else {
2059 struct lp_type inttype;
2060 struct lp_build_context intbld;
2061 LLVMValueRef trunc, itrunc, mask;
2062
2063 assert(type.floating);
2064 assert(lp_check_value(type, a));
2065
2066 inttype = type;
2067 inttype.floating = 0;
2068 lp_build_context_init(&intbld, bld->gallivm, inttype);
2069
2070 /* round by truncation */
2071 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2072 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2073
2074 /*
2075 * fix values if rounding is wrong (for non-special cases)
2076 * - this is the case if trunc < a
2077 * The results of doing this with NaNs, very large values etc.
2078 * are undefined but this seems to be the case anyway.
2079 */
2080 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2081 /* cheapie plus one with mask since the mask is minus one / zero */
2082 return lp_build_sub(&intbld, itrunc, mask);
2083 }
2084
2085    /* convert to int, rounding toward zero */
2086 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2087
2088 return res;
2089 }
2090
2091
2092 /**
2093 * Combined ifloor() & fract().
2094 *
2095 * Preferred to calling the functions separately, as it will ensure that the
2096 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2097 */
2098 void
2099 lp_build_ifloor_fract(struct lp_build_context *bld,
2100 LLVMValueRef a,
2101 LLVMValueRef *out_ipart,
2102 LLVMValueRef *out_fpart)
2103 {
2104 LLVMBuilderRef builder = bld->gallivm->builder;
2105 const struct lp_type type = bld->type;
2106 LLVMValueRef ipart;
2107
2108 assert(type.floating);
2109 assert(lp_check_value(type, a));
2110
2111 if (arch_rounding_available(type)) {
2112 /*
2113 * floor() is easier.
2114 */
2115
2116 ipart = lp_build_floor(bld, a);
2117 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2118 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2119 }
2120 else {
2121 /*
2122 * ifloor() is easier.
2123 */
2124
2125 *out_ipart = lp_build_ifloor(bld, a);
2126 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2127 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2128 }
2129 }
2130
2131
2132 /**
2133 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2134 * always smaller than one.
2135 */
2136 void
2137 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2138 LLVMValueRef a,
2139 LLVMValueRef *out_ipart,
2140 LLVMValueRef *out_fpart)
2141 {
2142 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2143 *out_fpart = clamp_fract(bld, *out_fpart);
2144 }
2145
2146
2147 LLVMValueRef
2148 lp_build_sqrt(struct lp_build_context *bld,
2149 LLVMValueRef a)
2150 {
2151 LLVMBuilderRef builder = bld->gallivm->builder;
2152 const struct lp_type type = bld->type;
2153 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2154 char intrinsic[32];
2155
2156 assert(lp_check_value(type, a));
2157
2158 /* TODO: optimize the constant case */
2159
2160 assert(type.floating);
2161 if (type.length == 1) {
2162 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2163 }
2164 else {
2165 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2166 }
2167
2168 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2169 }
2170
2171
2172 /**
2173  * Do one Newton-Raphson step to improve reciprocal precision:
2174 *
2175 * x_{i+1} = x_i * (2 - a * x_i)
2176 *
2177 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2178 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
2179  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2180 * halo. It would be necessary to clamp the argument to prevent this.
2181 *
2182 * See also:
2183 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2184 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2185 */
2186 static INLINE LLVMValueRef
2187 lp_build_rcp_refine(struct lp_build_context *bld,
2188 LLVMValueRef a,
2189 LLVMValueRef rcp_a)
2190 {
2191 LLVMBuilderRef builder = bld->gallivm->builder;
2192 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2193 LLVMValueRef res;
2194
2195 res = LLVMBuildFMul(builder, a, rcp_a, "");
2196 res = LLVMBuildFSub(builder, two, res, "");
2197 res = LLVMBuildFMul(builder, rcp_a, res, "");
2198
2199 return res;
2200 }
2201
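/*
 * Editor's note (illustrative addition, not part of the original source):
 * each refinement step roughly doubles the number of correct bits.  E.g. for
 * a = 3.0 with an initial estimate x_0 = 0.33 (about RCPPS accuracy):
 *   x_1 = 0.33 * (2 - 3.0 * 0.33) = 0.33 * 1.01 = 0.3333
 * versus the exact 0.33333...  For a = 0 the estimate x_0 = Inf gives
 *   x_1 = Inf * (2 - 0 * Inf) = Inf * NaN = NaN,
 * which is exactly the non-conformant case described above.
 */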
2202
2203 LLVMValueRef
2204 lp_build_rcp(struct lp_build_context *bld,
2205 LLVMValueRef a)
2206 {
2207 LLVMBuilderRef builder = bld->gallivm->builder;
2208 const struct lp_type type = bld->type;
2209
2210 assert(lp_check_value(type, a));
2211
2212 if(a == bld->zero)
2213 return bld->undef;
2214 if(a == bld->one)
2215 return bld->one;
2216 if(a == bld->undef)
2217 return bld->undef;
2218
2219 assert(type.floating);
2220
2221 if(LLVMIsConstant(a))
2222 return LLVMConstFDiv(bld->one, a);
2223
2224 /*
2225 * We don't use RCPPS because:
2226     * - it only has 10 bits of precision
2227     * - it doesn't even get the reciprocal of 1.0 exactly
2228     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2229     * - for recent processors the benefit over DIVPS is marginal and
2230     *   case dependent
2231 *
2232 * We could still use it on certain processors if benchmarks show that the
2233     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2234 * particular uses that require less workarounds.
2235 */
2236
2237 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2238 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2239 const unsigned num_iterations = 0;
2240 LLVMValueRef res;
2241 unsigned i;
2242 const char *intrinsic = NULL;
2243
2244 if (type.length == 4) {
2245 intrinsic = "llvm.x86.sse.rcp.ps";
2246 }
2247 else {
2248 intrinsic = "llvm.x86.avx.rcp.ps.256";
2249 }
2250
2251 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2252
2253 for (i = 0; i < num_iterations; ++i) {
2254 res = lp_build_rcp_refine(bld, a, res);
2255 }
2256
2257 return res;
2258 }
2259
2260 return LLVMBuildFDiv(builder, bld->one, a, "");
2261 }
2262
2263
2264 /**
2265 * Do one Newton-Raphson step to improve rsqrt precision:
2266 *
2267 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2268 *
2269 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2270 */
2271 static INLINE LLVMValueRef
2272 lp_build_rsqrt_refine(struct lp_build_context *bld,
2273 LLVMValueRef a,
2274 LLVMValueRef rsqrt_a)
2275 {
2276 LLVMBuilderRef builder = bld->gallivm->builder;
2277 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2278 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2279 LLVMValueRef res;
2280
2281 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2282 res = LLVMBuildFMul(builder, a, res, "");
2283 res = LLVMBuildFSub(builder, three, res, "");
2284 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2285 res = LLVMBuildFMul(builder, half, res, "");
2286
2287 return res;
2288 }
2289
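/*
 * Editor's note (illustrative addition, not part of the original source):
 * e.g. for a = 4.0 with an initial estimate x_0 = 0.49 (roughly RSQRTPS
 * accuracy):
 *   x_1 = 0.5 * 0.49 * (3 - 4.0 * 0.49 * 0.49) = 0.5 * 0.49 * 2.0396 = 0.49970
 * converging toward the exact 0.5.
 */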
2290
2291 /**
2292 * Generate 1/sqrt(a).
2293 * Result is undefined for values < 0, infinity for +0.
2294 */
2295 LLVMValueRef
2296 lp_build_rsqrt(struct lp_build_context *bld,
2297 LLVMValueRef a)
2298 {
2299 LLVMBuilderRef builder = bld->gallivm->builder;
2300 const struct lp_type type = bld->type;
2301
2302 assert(lp_check_value(type, a));
2303
2304 assert(type.floating);
2305
2306 /*
2307 * This should be faster but all denormals will end up as infinity.
2308 */
2309 if (0 && lp_build_fast_rsqrt_available(type)) {
2310 const unsigned num_iterations = 1;
2311 LLVMValueRef res;
2312 unsigned i;
2313
2314 /* rsqrt(1.0) != 1.0 here */
2315 res = lp_build_fast_rsqrt(bld, a);
2316
2317 if (num_iterations) {
2318 /*
2319 * Newton-Raphson will result in NaN instead of infinity for zero,
2320 * and NaN instead of zero for infinity.
2321 * Also, need to ensure rsqrt(1.0) == 1.0.
2322 * All numbers smaller than FLT_MIN will result in +infinity
2323 * (rsqrtps treats all denormals as zero).
2324 */
2325 /*
2326 * Certain non-c99 compilers don't know INFINITY and might not support
2327 * hacks to evaluate it at compile time neither.
2328          * hacks to evaluate it at compile time either.
2329 const unsigned posinf_int = 0x7F800000;
2330 LLVMValueRef cmp;
2331 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2332 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2333
2334 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2335
2336 for (i = 0; i < num_iterations; ++i) {
2337 res = lp_build_rsqrt_refine(bld, a, res);
2338 }
2339 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2340 res = lp_build_select(bld, cmp, inf, res);
2341 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2342 res = lp_build_select(bld, cmp, bld->zero, res);
2343 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2344 res = lp_build_select(bld, cmp, bld->one, res);
2345 }
2346
2347 return res;
2348 }
2349
2350 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2351 }
2352
2353 /**
2354  * Returns true if a fast (but inaccurate) rsqrt instruction is available.
2355  * (The caller may want to avoid calling rsqrt_fast if it's unavailable:
2356  * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but when that
2357  * is unavailable the expansion becomes sqrt/div/mul, so it's obviously
2358  * better to just call sqrt directly, skipping both the div and the mul.)
2359 */
2360 boolean
2361 lp_build_fast_rsqrt_available(struct lp_type type)
2362 {
2363 assert(type.floating);
2364
2365 if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2366 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2367 return true;
2368 }
2369 return false;
2370 }
2371
2372
2373 /**
2374 * Generate 1/sqrt(a).
2375 * Result is undefined for values < 0, infinity for +0.
2376 * Precision is limited, only ~10 bits guaranteed
2377 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
2378 */
2379 LLVMValueRef
2380 lp_build_fast_rsqrt(struct lp_build_context *bld,
2381 LLVMValueRef a)
2382 {
2383 LLVMBuilderRef builder = bld->gallivm->builder;
2384 const struct lp_type type = bld->type;
2385
2386 assert(lp_check_value(type, a));
2387
2388 if (lp_build_fast_rsqrt_available(type)) {
2389 const char *intrinsic = NULL;
2390
2391 if (type.length == 4) {
2392 intrinsic = "llvm.x86.sse.rsqrt.ps";
2393 }
2394 else {
2395 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2396 }
2397 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2398 }
2399 else {
2400 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2401 }
2402 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2403 }
2404
2405
2406 /**
2407 * Generate sin(a) using SSE2
2408 */
2409 LLVMValueRef
2410 lp_build_sin(struct lp_build_context *bld,
2411 LLVMValueRef a)
2412 {
2413 struct gallivm_state *gallivm = bld->gallivm;
2414 LLVMBuilderRef builder = gallivm->builder;
2415 struct lp_type int_type = lp_int_type(bld->type);
2416 LLVMBuilderRef b = builder;
2417
2418 /*
2419 * take the absolute value,
2420 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2421 */
2422
2423 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2424 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2425
2426 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2427 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2428
2429 /*
2430 * extract the sign bit (upper one)
2431 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2432 */
2433 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2434 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2435
2436 /*
2437 * scale by 4/Pi
2438 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2439 */
2440
2441 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2442 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2443
2444 /*
2445 * store the integer part of y in mm0
2446 * emm2 = _mm_cvttps_epi32(y);
2447 */
2448
2449 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2450
2451 /*
2452 * j=(j+1) & (~1) (see the cephes sources)
2453 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2454 */
2455
2456 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2457 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2458 /*
2459 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2460 */
2461 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2462 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2463
2464 /*
2465 * y = _mm_cvtepi32_ps(emm2);
2466 */
2467 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2468
2469 /* get the swap sign flag
2470 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2471 */
2472 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2473 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2474
2475 /*
2476 * emm2 = _mm_slli_epi32(emm0, 29);
2477 */
2478 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2479 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2480
2481 /*
2482     * get the polynomial selection mask
2483     * there is one polynomial for 0 <= x <= Pi/4
2484     * and another one for Pi/4 < x <= Pi/2
2485 * Both branches will be computed.
2486 *
2487 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2488 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2489 */
2490
2491 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2492 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2493 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2494 int_type, PIPE_FUNC_EQUAL,
2495 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2496 /*
2497 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2498 */
2499 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2500
2501 /*
2502 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2503 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2504 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2505 */
2506 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2507 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2508 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2509
2510 /*
2511 * The magic pass: "Extended precision modular arithmetic"
2512 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2513 * xmm1 = _mm_mul_ps(y, xmm1);
2514 * xmm2 = _mm_mul_ps(y, xmm2);
2515 * xmm3 = _mm_mul_ps(y, xmm3);
2516 */
2517 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2518 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2519 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2520
2521 /*
2522 * x = _mm_add_ps(x, xmm1);
2523 * x = _mm_add_ps(x, xmm2);
2524 * x = _mm_add_ps(x, xmm3);
2525 */
2526
2527 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2528 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2529 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2530
2531 /*
2532     * Evaluate the first polynomial (0 <= x <= Pi/4)
2533 *
2534 * z = _mm_mul_ps(x,x);
2535 */
2536 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2537
2538 /*
2539 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2540 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2541 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2542 */
2543 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2544 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2545 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2546
2547 /*
2548 * y = *(v4sf*)_ps_coscof_p0;
2549 * y = _mm_mul_ps(y, z);
2550 */
2551 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2552 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2553 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2554 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2555 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2556 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2557
2558
2559 /*
2560 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2561 * y = _mm_sub_ps(y, tmp);
2562 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2563 */
2564 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2565 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2566 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2567 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2568 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2569
2570 /*
2571 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2572 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2573 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2574 */
2575 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2576 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2577 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2578
2579 /*
2580     * Evaluate the second polynomial (Pi/4 <= x <= 0)
2581 *
2582 * y2 = *(v4sf*)_ps_sincof_p0;
2583 * y2 = _mm_mul_ps(y2, z);
2584 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2585 * y2 = _mm_mul_ps(y2, z);
2586 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2587 * y2 = _mm_mul_ps(y2, z);
2588 * y2 = _mm_mul_ps(y2, x);
2589 * y2 = _mm_add_ps(y2, x);
2590 */
2591
2592 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2593 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2594 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2595 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2596 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2597 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2598 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2599
2600 /*
2601     * select the correct result from the two polynomials
2602 * xmm3 = poly_mask;
2603 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2604 * y = _mm_andnot_ps(xmm3, y);
2605 * y = _mm_or_ps(y,y2);
2606 */
2607 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2608 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2609 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2610 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2611 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2612 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2613
2614 /*
2615 * update the sign
2616 * y = _mm_xor_ps(y, sign_bit);
2617 */
2618 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2619 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2620 return y_result;
2621 }
2622
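/*
 * Editor's note (illustrative addition, not part of the original source):
 * a compact summary of the range reduction performed above, in the spirit of
 * the cephes code it mirrors:
 *   j = (int)(|x| * 4/Pi);  j = (j + 1) & ~1;      -- snap to an even octant
 *   x = |x| - j * Pi/4      -- done in three parts (DP1..DP3) for precision
 *   j & 4 -> flip the sign of the result
 *   j & 2 -> pick the sine-style polynomial (y2) vs the cosine-style one (y)
 * and the argument's original sign bit is xor'ed back in at the very end.
 */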
2623
2624 /**
2625 * Generate cos(a) using SSE2
2626 */
2627 LLVMValueRef
2628 lp_build_cos(struct lp_build_context *bld,
2629 LLVMValueRef a)
2630 {
2631 struct gallivm_state *gallivm = bld->gallivm;
2632 LLVMBuilderRef builder = gallivm->builder;
2633 struct lp_type int_type = lp_int_type(bld->type);
2634 LLVMBuilderRef b = builder;
2635
2636 /*
2637 * take the absolute value,
2638 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2639 */
2640
2641 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2642 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2643
2644 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2645 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2646
2647 /*
2648 * scale by 4/Pi
2649 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2650 */
2651
2652 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2653 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2654
2655 /*
2656 * store the integer part of y in mm0
2657 * emm2 = _mm_cvttps_epi32(y);
2658 */
2659
2660 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2661
2662 /*
2663 * j=(j+1) & (~1) (see the cephes sources)
2664 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2665 */
2666
2667 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2668 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2669 /*
2670 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2671 */
2672 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2673 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2674
2675 /*
2676 * y = _mm_cvtepi32_ps(emm2);
2677 */
2678 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2679
2680
2681 /*
2682 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2683 */
2684 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2685 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2686
2687
2688 /* get the swap sign flag
2689 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2690 */
2691 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2692 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2693 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2694 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2695
2696 /*
2697 * emm2 = _mm_slli_epi32(emm0, 29);
2698 */
2699 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2700 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2701
2702 /*
2703     * get the polynomial selection mask
2704     * there is one polynomial for 0 <= x <= Pi/4
2705     * and another one for Pi/4 < x <= Pi/2
2706 * Both branches will be computed.
2707 *
2708 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2709 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2710 */
2711
2712 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2713 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2714 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2715 int_type, PIPE_FUNC_EQUAL,
2716 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2717
2718 /*
2719 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2720 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2721 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2722 */
2723 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2724 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2725 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2726
2727 /*
2728 * The magic pass: "Extended precision modular arithmetic"
2729 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2730 * xmm1 = _mm_mul_ps(y, xmm1);
2731 * xmm2 = _mm_mul_ps(y, xmm2);
2732 * xmm3 = _mm_mul_ps(y, xmm3);
2733 */
2734 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2735 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2736 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2737
2738 /*
2739 * x = _mm_add_ps(x, xmm1);
2740 * x = _mm_add_ps(x, xmm2);
2741 * x = _mm_add_ps(x, xmm3);
2742 */
2743
2744 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2745 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2746 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2747
2748 /*
2749     * Evaluate the first polynomial (0 <= x <= Pi/4)
2750 *
2751 * z = _mm_mul_ps(x,x);
2752 */
2753 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2754
2755 /*
2756 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2757 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2758 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2759 */
2760 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2761 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2762 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2763
2764 /*
2765 * y = *(v4sf*)_ps_coscof_p0;
2766 * y = _mm_mul_ps(y, z);
2767 */
2768 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2769 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2770 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2771 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2772 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2773 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2774
2775
2776 /*
2777 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2778 * y = _mm_sub_ps(y, tmp);
2779 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2780 */
2781 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2782 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2783 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2784 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2785 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2786
2787 /*
2788 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2789 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2790 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2791 */
2792 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2793 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2794 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2795
2796 /*
2797     * Evaluate the second polynomial (Pi/4 <= x <= 0)
2798 *
2799 * y2 = *(v4sf*)_ps_sincof_p0;
2800 * y2 = _mm_mul_ps(y2, z);
2801 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2802 * y2 = _mm_mul_ps(y2, z);
2803 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2804 * y2 = _mm_mul_ps(y2, z);
2805 * y2 = _mm_mul_ps(y2, x);
2806 * y2 = _mm_add_ps(y2, x);
2807 */
2808
2809 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2810 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2811 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2812 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2813 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2814 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2815 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2816
2817 /*
2818 * select the correct result from the two polynoms
2819     * select the correct result from the two polynomials
2820 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2821 * y = _mm_andnot_ps(xmm3, y);
2822 * y = _mm_or_ps(y,y2);
2823 */
2824 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2825 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2826 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2827 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
2828 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2829 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
2830
2831 /*
2832 * update the sign
2833 * y = _mm_xor_ps(y, sign_bit);
2834 */
2835 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2836 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2837 return y_result;
2838 }
2839
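/*
 * Editor's note (illustrative addition, not part of the original source):
 * compared to lp_build_sin() above, the differences are confined to the
 * range reduction: the octant index is shifted by two (emm2_2 = j - 2,
 * corresponding to cos(x) = sin(x + Pi/2)), the sign flip is taken from
 * ~j & 4 instead of j & 4, and the argument's own sign bit is ignored
 * since cosine is an even function.
 */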
2840
2841 /**
2842 * Generate pow(x, y)
2843 */
2844 LLVMValueRef
2845 lp_build_pow(struct lp_build_context *bld,
2846 LLVMValueRef x,
2847 LLVMValueRef y)
2848 {
2849 /* TODO: optimize the constant case */
2850 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2851 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2852 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2853 __FUNCTION__);
2854 }
2855
2856 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2857 }
2858
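/*
 * Editor's note (illustrative addition, not part of the original source):
 * since pow is built as exp2(log2(x) * y) it inherits log2's domain, i.e.
 * the result is only meaningful for x > 0.  A minimal usage sketch, assuming
 * a caller that already has a float-vector context 'bld'; the helper name
 * and the 2.2 exponent are made up for illustration:
 */
#if 0   /* illustrative only */
static LLVMValueRef
example_pow_const_exponent(struct lp_build_context *bld, LLVMValueRef x)
{
   /* x^2.2, e.g. a gamma-style curve */
   LLVMValueRef y = lp_build_const_vec(bld->gallivm, bld->type, 2.2);
   return lp_build_pow(bld, x, y);
}
#endif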
2859
2860 /**
2861 * Generate exp(x)
2862 */
2863 LLVMValueRef
2864 lp_build_exp(struct lp_build_context *bld,
2865 LLVMValueRef x)
2866 {
2867 /* log2(e) = 1/log(2) */
2868 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2869 1.4426950408889634);
2870
2871 assert(lp_check_value(bld->type, x));
2872
2873 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2874 }
2875
2876
2877 /**
2878 * Generate log(x)
2879 */
2880 LLVMValueRef
2881 lp_build_log(struct lp_build_context *bld,
2882 LLVMValueRef x)
2883 {
2884 /* log(2) */
2885 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2886 0.69314718055994529);
2887
2888 assert(lp_check_value(bld->type, x));
2889
2890 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2891 }
2892
2893
2894 /**
2895 * Generate polynomial.
2896 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2897 */
2898 LLVMValueRef
2899 lp_build_polynomial(struct lp_build_context *bld,
2900 LLVMValueRef x,
2901 const double *coeffs,
2902 unsigned num_coeffs)
2903 {
2904 const struct lp_type type = bld->type;
2905 LLVMValueRef even = NULL, odd = NULL;
2906 LLVMValueRef x2;
2907 unsigned i;
2908
2909 assert(lp_check_value(bld->type, x));
2910
2911 /* TODO: optimize the constant case */
2912 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2913 LLVMIsConstant(x)) {
2914 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2915 __FUNCTION__);
2916 }
2917
2918 /*
2919    * Calculate odd and even terms separately to decrease data dependency
2920 * Ex:
2921 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2922 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2923 */
2924 x2 = lp_build_mul(bld, x, x);
2925
2926 for (i = num_coeffs; i--; ) {
2927 LLVMValueRef coeff;
2928
2929 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2930
2931 if (i % 2 == 0) {
2932 if (even)
2933 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2934 else
2935 even = coeff;
2936 } else {
2937 if (odd)
2938 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2939 else
2940 odd = coeff;
2941 }
2942 }
2943
2944 if (odd)
2945 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2946 else if (even)
2947 return even;
2948 else
2949 return bld->undef;
2950 }
2951
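/*
 * Editor's note (illustrative addition, not part of the original source):
 * for four coefficients the loop above effectively builds
 *   even = c[0] + x2 * c[2]
 *   odd  = c[1] + x2 * c[3]
 *   result = even + x * odd
 * which equals c[0] + x*c[1] + x^2*c[2] + x^3*c[3], but lets the two Horner
 * chains be evaluated independently, as the data-dependency comment in the
 * function explains.
 */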
2952
2953 /**
2954 * Minimax polynomial fit of 2**x, in range [0, 1[
2955 */
2956 const double lp_build_exp2_polynomial[] = {
2957 #if EXP_POLY_DEGREE == 5
2958 0.999999925063526176901,
2959 0.693153073200168932794,
2960 0.240153617044375388211,
2961 0.0558263180532956664775,
2962 0.00898934009049466391101,
2963 0.00187757667519147912699
2964 #elif EXP_POLY_DEGREE == 4
2965 1.00000259337069434683,
2966 0.693003834469974940458,
2967 0.24144275689150793076,
2968 0.0520114606103070150235,
2969 0.0135341679161270268764
2970 #elif EXP_POLY_DEGREE == 3
2971 0.999925218562710312959,
2972 0.695833540494823811697,
2973 0.226067155427249155588,
2974 0.0780245226406372992967
2975 #elif EXP_POLY_DEGREE == 2
2976 1.00172476321474503578,
2977 0.657636275736077639316,
2978 0.33718943461968720704
2979 #else
2980 #error
2981 #endif
2982 };
2983
2984
2985 void
2986 lp_build_exp2_approx(struct lp_build_context *bld,
2987 LLVMValueRef x,
2988 LLVMValueRef *p_exp2_int_part,
2989 LLVMValueRef *p_frac_part,
2990 LLVMValueRef *p_exp2)
2991 {
2992 LLVMBuilderRef builder = bld->gallivm->builder;
2993 const struct lp_type type = bld->type;
2994 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2995 LLVMValueRef ipart = NULL;
2996 LLVMValueRef fpart = NULL;
2997 LLVMValueRef expipart = NULL;
2998 LLVMValueRef expfpart = NULL;
2999 LLVMValueRef res = NULL;
3000
3001 assert(lp_check_value(bld->type, x));
3002
3003 if(p_exp2_int_part || p_frac_part || p_exp2) {
3004 /* TODO: optimize the constant case */
3005 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3006 LLVMIsConstant(x)) {
3007 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3008 __FUNCTION__);
3009 }
3010
3011 assert(type.floating && type.width == 32);
3012
3013 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
3014 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
3015
3016 /* ipart = floor(x) */
3017 /* fpart = x - ipart */
3018 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3019 }
3020
3021 if(p_exp2_int_part || p_exp2) {
3022 /* expipart = (float) (1 << ipart) */
3023 expipart = LLVMBuildAdd(builder, ipart,
3024 lp_build_const_int_vec(bld->gallivm, type, 127), "");
3025 expipart = LLVMBuildShl(builder, expipart,
3026 lp_build_const_int_vec(bld->gallivm, type, 23), "");
3027 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3028 }
3029
3030 if(p_exp2) {
3031 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3032 Elements(lp_build_exp2_polynomial));
3033
3034 res = LLVMBuildFMul(builder, expipart, expfpart, "");
3035 }
3036
3037 if(p_exp2_int_part)
3038 *p_exp2_int_part = expipart;
3039
3040 if(p_frac_part)
3041 *p_frac_part = fpart;
3042
3043 if(p_exp2)
3044 *p_exp2 = res;
3045 }
3046
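/*
 * Editor's note (illustrative addition, not part of the original source):
 * a worked example of the exponent-field trick above for x = 3.5:
 *   ipart = 3, fpart = 0.5
 *   expipart = bitcast((3 + 127) << 23) = 2^3 = 8.0
 *   expfpart = polynomial(0.5) ~= 2^0.5 ~= 1.41421
 *   result  ~= 8.0 * 1.41421 = 11.3137   (exact 2^3.5 = 11.3137...)
 * The clamps to [-126.99999, 129.0] keep the biased exponent within the
 * representable single-precision range.
 */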
3047
3048 LLVMValueRef
3049 lp_build_exp2(struct lp_build_context *bld,
3050 LLVMValueRef x)
3051 {
3052 LLVMValueRef res;
3053 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3054 return res;
3055 }
3056
3057
3058 /**
3059  * Extract the exponent of an IEEE-754 floating point value.
3060 *
3061 * Optionally apply an integer bias.
3062 *
3063 * Result is an integer value with
3064 *
3065 * ifloor(log2(x)) + bias
3066 */
3067 LLVMValueRef
3068 lp_build_extract_exponent(struct lp_build_context *bld,
3069 LLVMValueRef x,
3070 int bias)
3071 {
3072 LLVMBuilderRef builder = bld->gallivm->builder;
3073 const struct lp_type type = bld->type;
3074 unsigned mantissa = lp_mantissa(type);
3075 LLVMValueRef res;
3076
3077 assert(type.floating);
3078
3079 assert(lp_check_value(bld->type, x));
3080
3081 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3082
3083 res = LLVMBuildLShr(builder, x,
3084 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3085 res = LLVMBuildAnd(builder, res,
3086 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3087 res = LLVMBuildSub(builder, res,
3088 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3089
3090 return res;
3091 }
3092
3093
3094 /**
3095  * Extract the mantissa of a floating point value.
3096 *
3097 * Result is a floating point value with
3098 *
3099  *   x / 2**floor(log2(x))
3100 */
3101 LLVMValueRef
3102 lp_build_extract_mantissa(struct lp_build_context *bld,
3103 LLVMValueRef x)
3104 {
3105 LLVMBuilderRef builder = bld->gallivm->builder;
3106 const struct lp_type type = bld->type;
3107 unsigned mantissa = lp_mantissa(type);
3108 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3109 (1ULL << mantissa) - 1);
3110 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3111 LLVMValueRef res;
3112
3113 assert(lp_check_value(bld->type, x));
3114
3115 assert(type.floating);
3116
3117 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3118
3119 /* res = x / 2**ipart */
3120 res = LLVMBuildAnd(builder, x, mantmask, "");
3121 res = LLVMBuildOr(builder, res, one, "");
3122 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3123
3124 return res;
3125 }
3126
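/*
 * Editor's note (illustrative addition, not part of the original source):
 * e.g. x = 6.0 has the bit pattern 0x40C00000, so
 *   lp_build_extract_exponent(bld, x, 0) = ((0x40C00000 >> 23) & 255) - 127
 *                                        = 129 - 127 = 2
 *   lp_build_extract_mantissa(bld, x)    = (bits(6.0) & 0x7fffff) | bits(1.0)
 *                                        = 1.5 = 6.0 / 2^2
 */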
3127
3128
3129 /**
3130  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3131  * These coefficients can be generated with
3132 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3133 */
3134 const double lp_build_log2_polynomial[] = {
3135 #if LOG_POLY_DEGREE == 5
3136 2.88539008148777786488L,
3137 0.961796878841293367824L,
3138 0.577058946784739859012L,
3139 0.412914355135828735411L,
3140 0.308591899232910175289L,
3141 0.352376952300281371868L,
3142 #elif LOG_POLY_DEGREE == 4
3143 2.88539009343309178325L,
3144 0.961791550404184197881L,
3145 0.577440339438736392009L,
3146 0.403343858251329912514L,
3147 0.406718052498846252698L,
3148 #elif LOG_POLY_DEGREE == 3
3149 2.88538959748872753838L,
3150 0.961932915889597772928L,
3151 0.571118517972136195241L,
3152 0.493997535084709500285L,
3153 #else
3154 #error
3155 #endif
3156 };
3157
3158 /**
3159 * See http://www.devmaster.net/forums/showthread.php?p=43580
3160 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3161 * http://www.nezumi.demon.co.uk/consult/logx.htm
3162 */
3163 void
3164 lp_build_log2_approx(struct lp_build_context *bld,
3165 LLVMValueRef x,
3166 LLVMValueRef *p_exp,
3167 LLVMValueRef *p_floor_log2,
3168 LLVMValueRef *p_log2)
3169 {
3170 LLVMBuilderRef builder = bld->gallivm->builder;
3171 const struct lp_type type = bld->type;
3172 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3173 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3174
3175 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3176 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3177 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3178
3179 LLVMValueRef i = NULL;
3180 LLVMValueRef y = NULL;
3181 LLVMValueRef z = NULL;
3182 LLVMValueRef exp = NULL;
3183 LLVMValueRef mant = NULL;
3184 LLVMValueRef logexp = NULL;
3185 LLVMValueRef logmant = NULL;
3186 LLVMValueRef res = NULL;
3187
3188 assert(lp_check_value(bld->type, x));
3189
3190 if(p_exp || p_floor_log2 || p_log2) {
3191 /* TODO: optimize the constant case */
3192 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3193 LLVMIsConstant(x)) {
3194 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3195 __FUNCTION__);
3196 }
3197
3198 assert(type.floating && type.width == 32);
3199
3200 /*
3201 * We don't explicitly handle denormalized numbers. They will yield a
3202       * result in the neighbourhood of -127, which appears to be
3203       * adequate.
3204 */
3205
3206 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3207
3208 /* exp = (float) exponent(x) */
3209 exp = LLVMBuildAnd(builder, i, expmask, "");
3210 }
3211
3212 if(p_floor_log2 || p_log2) {
3213 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3214 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3215 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3216 }
3217
3218 if(p_log2) {
3219 /* mant = 1 + (float) mantissa(x) */
3220 mant = LLVMBuildAnd(builder, i, mantmask, "");
3221 mant = LLVMBuildOr(builder, mant, one, "");
3222 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3223
3224 /* y = (mant - 1) / (mant + 1) */
3225 y = lp_build_div(bld,
3226 lp_build_sub(bld, mant, bld->one),
3227 lp_build_add(bld, mant, bld->one)
3228 );
3229
3230 /* z = y^2 */
3231 z = lp_build_mul(bld, y, y);
3232
3233 /* compute P(z) */
3234 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3235 Elements(lp_build_log2_polynomial));
3236
3237 /* logmant = y * P(z) */
3238 logmant = lp_build_mul(bld, y, logmant);
3239
3240 res = lp_build_add(bld, logmant, logexp);
3241 }
3242
3243 if(p_exp) {
3244 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3245 *p_exp = exp;
3246 }
3247
3248 if(p_floor_log2)
3249 *p_floor_log2 = logexp;
3250
3251 if(p_log2)
3252 *p_log2 = res;
3253 }
3254
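/*
 * Editor's note (illustrative addition, not part of the original source):
 * the mantissa term above uses the identity
 *   log2(mant) = (2/ln(2)) * atanh((mant - 1)/(mant + 1))
 * so with y = (mant - 1)/(mant + 1) and z = y^2 we get
 *   log2(mant) = y * (2/ln(2)) * (1 + z/3 + z^2/5 + ...)
 * which is what y * P(z) approximates; note that 2/ln(2) = 2.88539...
 * matches the leading coefficient of lp_build_log2_polynomial above.
 */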
3255
3256 LLVMValueRef
3257 lp_build_log2(struct lp_build_context *bld,
3258 LLVMValueRef x)
3259 {
3260 LLVMValueRef res;
3261 lp_build_log2_approx(bld, x, NULL, NULL, &res);
3262 return res;
3263 }
3264
3265
3266 /**
3267 * Faster (and less accurate) log2.
3268 *
3269 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3270 *
3271 * Piece-wise linear approximation, with exact results when x is a
3272 * power of two.
3273 *
3274 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3275 */
3276 LLVMValueRef
3277 lp_build_fast_log2(struct lp_build_context *bld,
3278 LLVMValueRef x)
3279 {
3280 LLVMBuilderRef builder = bld->gallivm->builder;
3281 LLVMValueRef ipart;
3282 LLVMValueRef fpart;
3283
3284 assert(lp_check_value(bld->type, x));
3285
3286 assert(bld->type.floating);
3287
3288 /* ipart = floor(log2(x)) - 1 */
3289 ipart = lp_build_extract_exponent(bld, x, -1);
3290 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3291
3292 /* fpart = x / 2**ipart */
3293 fpart = lp_build_extract_mantissa(bld, x);
3294
3295 /* ipart + fpart */
3296 return LLVMBuildFAdd(builder, ipart, fpart, "");
3297 }
3298
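/*
 * Editor's note (illustrative addition, not part of the original source):
 * e.g. for x = 6.0:
 *   ipart = floor(log2(6)) - 1 = 2 - 1 = 1
 *   fpart = 6.0 / 2^floor(log2(6)) = 6.0 / 4.0 = 1.5
 *   result = 1 + 1.5 = 2.5   (exact log2(6) = 2.585)
 * For any power of two the fpart is exactly 1.0 and the result is exact.
 */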
3299
3300 /**
3301 * Fast implementation of iround(log2(x)).
3302 *
3303 * Not an approximation -- it should give accurate results all the time.
3304 */
3305 LLVMValueRef
3306 lp_build_ilog2(struct lp_build_context *bld,
3307 LLVMValueRef x)
3308 {
3309 LLVMBuilderRef builder = bld->gallivm->builder;
3310 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3311 LLVMValueRef ipart;
3312
3313 assert(bld->type.floating);
3314
3315 assert(lp_check_value(bld->type, x));
3316
3317 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3318 x = LLVMBuildFMul(builder, x, sqrt2, "");
3319
3320 /* ipart = floor(log2(x) + 0.5) */
3321 ipart = lp_build_extract_exponent(bld, x, 0);
3322
3323 return ipart;
3324 }
3325
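/*
 * Editor's note (illustrative addition, not part of the original source):
 * the sqrt(2) pre-multiply shifts the exponent boundary by half an octave:
 *   x = 5.0 -> 5.0 * 1.414 = 7.07 -> exponent 2   (round(log2(5)) = 2)
 *   x = 6.0 -> 6.0 * 1.414 = 8.49 -> exponent 3   (round(log2(6)) = 3)
 * so the result flips from 2 to 3 at x = 2^2.5 ~= 5.66, exactly where
 * round(log2(x)) flips.
 */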
3326 LLVMValueRef
3327 lp_build_mod(struct lp_build_context *bld,
3328 LLVMValueRef x,
3329 LLVMValueRef y)
3330 {
3331 LLVMBuilderRef builder = bld->gallivm->builder;
3332 LLVMValueRef res;
3333 const struct lp_type type = bld->type;
3334
3335 assert(lp_check_value(type, x));
3336 assert(lp_check_value(type, y));
3337
3338 if (type.floating)
3339 res = LLVMBuildFRem(builder, x, y, "");
3340 else if (type.sign)
3341 res = LLVMBuildSRem(builder, x, y, "");
3342 else
3343 res = LLVMBuildURem(builder, x, y, "");
3344 return res;
3345 }
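/*
 * Editor's note (illustrative addition, not part of the original source):
 * frem/srem follow the sign of the dividend, so e.g. lp_build_mod() of
 * -3 and 2 yields -1 rather than the floor-based 1; callers that need
 * GLSL-style mod(x, y) = x - y * floor(x/y) have to build that themselves.
 */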