/* mesa.git: src/gallium/auxiliary/gallivm/lp_bld_arit.c */
/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for the special-case values 0 or 1 of a or b are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for the special-case values 0 or 1 of a or b are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
         intr_size = 128;
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
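/*
 * For unsigned normalized integers the bitwise complement is exact; e.g.
 * for 8-bit unorm values, ~0x40 = 0xbf, i.e. (255 - 64)/255 = 1.0 - 64/255.
 */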
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (a == bld->one)
      return bld->zero;
   if (a == bld->zero)
      return bld->one;

   if (type.norm && !type.floating && !type.fixed && !type.sign) {
      if (LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if (LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return b;
   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (bld->type.norm) {
      const char *intrinsic = NULL;

      if (a == bld->one || b == bld->one)
         return bld->one;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
         } else if (util_cpu_caps.has_altivec) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * Avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * We use repeated shuffles/adds here instead. Note that with multiple
    * vectors this can be done more efficiently, as outlined in the Intel
    * optimization manual.
    * Note: this could cause data rearrangement if used with smaller element
    * sizes.
    */
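   /*
    * E.g. for a 4-wide vector the reduction below proceeds as
    *   [a b c d] -> [a+c b+d] -> (a+c) + (b+d).
    */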

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
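/*
 * Given rows x, y, z, w the result lanes are
 *   { sum(x[0..3]), sum(y[0..3]), sum(z[0..3]), sum(w[0..3]) }.
 */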
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (b == bld->zero)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;
   if (a == b)
      return bld->zero;

   if (bld->type.norm) {
      const char *intrinsic = NULL;

      if (b == bld->one)
         return bld->zero;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
         } else if (util_cpu_caps.has_altivec) {
            if (type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
            if (type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
         }
      }

      if (intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if (LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if (bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ...
 *
 *     in this case just the first two terms to fit in 16-bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as it
 *     gives 255*255 = 254, so either the special case b = 255 must be
 *     accounted for, or rounding must be used.
 *
 * - geometric series plus rounding
 *
 *     when using the geometric series division, instead of truncating the
 *     result, use rounding in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     which achieves exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
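/*
 * Worked example of the rounded series for n = 8, a = b = 255:
 *   t = a*b = 65025
 *   (t + (t >> 8) + 0x80) >> 8 = (65025 + 254 + 128) >> 8 = 65407 >> 8 = 255
 * whereas the unrounded series (65025 + 254) >> 8 = 254 falls one short.
 */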
static LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16-bit normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return b;
   if (b == bld->zero)
      return bld->zero;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if (type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if (shift) {
         if (type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if (shift) {
         if (type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if (b == 0)
      return bld->zero;

   if (b == 1)
      return a;

   if (b == -1)
      return lp_build_negate(bld, a);

   if (b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if (util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if (bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return lp_build_rcp(bld, b);
   if (b == bld->zero)
      return bld->undef;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if (((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
       type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param flags  LP_BLD_LERP_* flags. With LP_BLD_LERP_WIDE_NORMALIZED we are
 *               interpolating normalized values, encoded in integers twice as
 *               wide as the original type.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit into the least significant bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */
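            /*
             * E.g. with 8-bit weights in 16-bit lanes (n = 8): x = 255
             * becomes 255 + (255 >> 7) = 256, so a weight of 255 yields
             * (256 * delta) >> 8 = delta, selecting v1 exactly.
             */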

            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   res = lp_build_add(bld, v0, res);

   if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
       bld->type.fixed) {
      /* We need to mask out the high order bits when lerping 8-bit
       * normalized colors stored in 16-bit lanes. */
      /* XXX: This step is necessary for lerping 8-bit colors stored in
       * 16 bits, but it will be wrong for true fixed point use cases.
       * Basically we need a more powerful lp_type, capable of further
       * distinguishing the values interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign = type.sign;
      wide_type.width = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if (a == bld->one)
         return b;
      if (b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (a == b)
      return a;

   if (bld->type.norm) {
      if (a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if (!type.sign)
      return a;

   if (type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if (type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if (!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if (type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and OR it into the constant 1.0 */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with sse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                              ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
        (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;

   return FALSE;
}

enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};
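/*
 * These values match the rounding-control encoding of the SSE4.1 ROUNDxx
 * immediate (0 = nearest-even, 1 = down, 2 = up, 3 = toward zero), so the
 * mode can be passed straight through as the intrinsic's last argument.
 */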

/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value. That is, rounding 2.5 yields 2.0, not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This is relying on MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      if (type.width * type.length == 128) {
         intrinsic = "llvm.x86.sse2.cvtps2dq";
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
      }
      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/*
 * Helper for the Altivec vector rounding instructions.
 */
static INLINE LLVMValueRef
lp_build_round_altivec(struct lp_build_context *bld,
                       LLVMValueRef a,
                       enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_altivec);

   switch (mode) {
   case LP_BUILD_ROUND_NEAREST:
      intrinsic = "llvm.ppc.altivec.vrfin";
      break;
   case LP_BUILD_ROUND_FLOOR:
      intrinsic = "llvm.ppc.altivec.vrfim";
      break;
   case LP_BUILD_ROUND_CEIL:
      intrinsic = "llvm.ppc.altivec.vrfip";
      break;
   case LP_BUILD_ROUND_TRUNCATE:
      intrinsic = "llvm.ppc.altivec.vrfiz";
      break;
   }

   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
}

static INLINE LLVMValueRef
lp_build_round_arch(struct lp_build_context *bld,
                    LLVMValueRef a,
                    enum lp_build_round_mode mode)
{
   if (util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, mode);
   else /* (util_cpu_caps.has_altivec) */
      return lp_build_round_altivec(bld, a, mode);
}

/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
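      /*
       * (Above 2^24 a 32-bit float's ulp is at least 2, so every
       * representable value there is already an integer; the select also
       * keeps NaNs and Infs intact, for which FPToSI is undefined.)
       */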
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");

      if (type.sign) {
         LLVMValueRef tmp;

         /*
          * fix values if rounding is wrong (for non-special cases)
          * - this is the case if trunc > a
          */
         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
         /* tmp = trunc > a ? 1.0 : 0.0 */
         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
         tmp = lp_build_and(&intbld, mask, tmp);
         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
         res = lp_build_sub(bld, res, tmp);
      }

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (arch_rounding_available(type)) {
      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
   }
   else {
      const struct lp_type type = bld->type;
      struct lp_type inttype;
      struct lp_build_context intbld;
      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
      LLVMValueRef trunc, res, anosign, mask, tmp;
      LLVMTypeRef int_vec_type = bld->int_vec_type;
      LLVMTypeRef vec_type = bld->vec_type;

      assert(type.width == 32); /* might want to handle doubles at some point */

      inttype = type;
      inttype.floating = 0;
      lp_build_context_init(&intbld, bld->gallivm, inttype);

      /* round by truncation */
      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");

      /*
       * fix values if rounding is wrong (for non-special cases)
       * - this is the case if trunc < a
       */
      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
      /* tmp = trunc < a ? 1.0 : 0.0 */
      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
      tmp = lp_build_and(&intbld, mask, tmp);
      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
      res = lp_build_add(bld, trunc, tmp);

      /* mask out sign bit */
      anosign = lp_build_abs(bld, a);
      /*
       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
       */
      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
      return lp_build_select(bld, mask, a, res);
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a).
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Prevent returning a fractional part of 1.0 for very small negative values of
 * 'a' by clamping against 0.99999(9).
 */
static inline LLVMValueRef
clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
{
   LLVMValueRef max;

   /* this is the largest number smaller than 1.0 representable as float */
   max = lp_build_const_vec(bld->gallivm, bld->type,
                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
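   /* e.g. for 32-bit floats (23-bit mantissa): 1.0 - 2^-24 = 0.99999994 */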
   return lp_build_min(bld, fract, max);
}


/**
 * Same as lp_build_fract, but guarantees that the result is always smaller
 * than one.
 */
LLVMValueRef
lp_build_fract_safe(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   return clamp_fract(bld, lp_build_fract(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if ((util_cpu_caps.has_sse2 &&
        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   if (arch_rounding_available(type)) {
      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

1963 res = LLVMBuildFAdd(builder, a, half, "");
1964 }
1965
1966 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1967
1968 return res;
1969 }
1970
1971
1972 /**
1973 * Return floor of float (vector), result is an int (vector)
1974 * Ex: ifloor(1.1) = 1
1975 * Ex: ifloor(-1.1) = -2
1976 */
1977 LLVMValueRef
1978 lp_build_ifloor(struct lp_build_context *bld,
1979 LLVMValueRef a)
1980 {
1981 LLVMBuilderRef builder = bld->gallivm->builder;
1982 const struct lp_type type = bld->type;
1983 LLVMTypeRef int_vec_type = bld->int_vec_type;
1984 LLVMValueRef res;
1985
1986 assert(type.floating);
1987 assert(lp_check_value(type, a));
1988
1989 res = a;
1990 if (type.sign) {
1991 if (arch_rounding_available(type)) {
1992 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1993 }
1994 else {
1995 struct lp_type inttype;
1996 struct lp_build_context intbld;
1997 LLVMValueRef trunc, itrunc, mask;
1998
1999 assert(type.floating);
2000 assert(lp_check_value(type, a));
2001
2002 inttype = type;
2003 inttype.floating = 0;
2004 lp_build_context_init(&intbld, bld->gallivm, inttype);
2005
2006 /* round by truncation */
2007 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2008 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2009
2010 /*
2011 * fix values if rounding is wrong (for non-special cases)
2012 * - this is the case if trunc > a
2013 * The results of doing this with NaNs, very large values etc.
2014 * are undefined but this seems to be the case anyway.
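* e.g. a = -1.3: itrunc = -1, trunc = -1.0 > -1.3, so the mask is
* all ones (-1) and itrunc + mask = -2 = floor(-1.3)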
2015 */
2016 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2017 /* the mask is minus one / zero, so adding it is a cheap minus one */
2018 return lp_build_add(&intbld, itrunc, mask);
2019 }
2020 }
2021
2022 /* truncate towards zero (res is non-negative or already floored here) */
2023 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2024
2025 return res;
2026 }
2027
2028
2029 /**
2030 * Return ceiling of float (vector), returning int (vector).
2031 * Ex: iceil( 1.1) = 2
2032 * Ex: iceil(-1.1) = -1
2033 */
2034 LLVMValueRef
2035 lp_build_iceil(struct lp_build_context *bld,
2036 LLVMValueRef a)
2037 {
2038 LLVMBuilderRef builder = bld->gallivm->builder;
2039 const struct lp_type type = bld->type;
2040 LLVMTypeRef int_vec_type = bld->int_vec_type;
2041 LLVMValueRef res;
2042
2043 assert(type.floating);
2044 assert(lp_check_value(type, a));
2045
2046 if (arch_rounding_available(type)) {
2047 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2048 }
2049 else {
2050 struct lp_type inttype;
2051 struct lp_build_context intbld;
2052 LLVMValueRef trunc, itrunc, mask;
2053
2054 assert(type.floating);
2055 assert(lp_check_value(type, a));
2056
2057 inttype = type;
2058 inttype.floating = 0;
2059 lp_build_context_init(&intbld, bld->gallivm, inttype);
2060
2061 /* round by truncation */
2062 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2063 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2064
2065 /*
2066 * fix values if rounding is wrong (for non-special cases)
2067 * - this is the case if trunc < a
2068 * The results of doing this with NaNs, very large values etc.
2069 * are undefined but this seems to be the case anyway.
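* e.g. a = 1.3: itrunc = 1, trunc = 1.0 < 1.3, so the mask is
* all ones (-1) and itrunc - mask = 2 = ceil(1.3)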
2070 */
2071 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2072 /* the mask is minus one / zero, so subtracting it is a cheap plus one */
2073 return lp_build_sub(&intbld, itrunc, mask);
2074 }
2075
2076 /* truncate towards zero (res is already rounded here) */
2077 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2078
2079 return res;
2080 }
2081
2082
2083 /**
2084 * Combined ifloor() & fract().
2085 *
2086 * Preferred over calling the functions separately, as it ensures that the
2087 * strategy (floor() vs ifloor()) resulting in less redundant work is used.
2088 */
2089 void
2090 lp_build_ifloor_fract(struct lp_build_context *bld,
2091 LLVMValueRef a,
2092 LLVMValueRef *out_ipart,
2093 LLVMValueRef *out_fpart)
2094 {
2095 LLVMBuilderRef builder = bld->gallivm->builder;
2096 const struct lp_type type = bld->type;
2097 LLVMValueRef ipart;
2098
2099 assert(type.floating);
2100 assert(lp_check_value(type, a));
2101
2102 if (arch_rounding_available(type)) {
2103 /*
2104 * floor() is easier.
2105 */
2106
2107 ipart = lp_build_floor(bld, a);
2108 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2109 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2110 }
2111 else {
2112 /*
2113 * ifloor() is easier.
2114 */
2115
2116 *out_ipart = lp_build_ifloor(bld, a);
2117 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2118 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2119 }
2120 }
2121
2122
2123 /**
2124 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2125 * always smaller than one.
2126 */
2127 void
2128 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2129 LLVMValueRef a,
2130 LLVMValueRef *out_ipart,
2131 LLVMValueRef *out_fpart)
2132 {
2133 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2134 *out_fpart = clamp_fract(bld, *out_fpart);
2135 }
2136
2137
2138 LLVMValueRef
2139 lp_build_sqrt(struct lp_build_context *bld,
2140 LLVMValueRef a)
2141 {
2142 LLVMBuilderRef builder = bld->gallivm->builder;
2143 const struct lp_type type = bld->type;
2144 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2145 char intrinsic[32];
2146
2147 assert(lp_check_value(type, a));
2148
2149 /* TODO: optimize the constant case */
2150
2151 assert(type.floating);
2152 if (type.length == 1) {
2153 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2154 }
2155 else {
2156 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2157 }
2158
2159 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2160 }
2161
2162
2163 /**
2164 * Do one Newton-Raphson step to improve reciprocal precision:
2165 *
2166 * x_{i+1} = x_i * (2 - a * x_i)
2167 *
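* e.g. with a = 3 and a rough guess x_0 = 0.3:
*
*    x_1 = 0.3  * (2 - 3 * 0.3)  = 0.33
*    x_2 = 0.33 * (2 - 3 * 0.33) = 0.3333
*
* i.e. the number of correct digits roughly doubles with each step.
*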
2168 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2169 * +/-Inf, yielding NaN instead. Certain applications rely on the correct
2170 * behavior, e.g. Google Earth does RCP(RSQRT(0.0)) when drawing the Earth's
2171 * halo. It would be necessary to clamp the argument to prevent this.
2172 *
2173 * See also:
2174 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2175 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2176 */
2177 static INLINE LLVMValueRef
2178 lp_build_rcp_refine(struct lp_build_context *bld,
2179 LLVMValueRef a,
2180 LLVMValueRef rcp_a)
2181 {
2182 LLVMBuilderRef builder = bld->gallivm->builder;
2183 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2184 LLVMValueRef res;
2185
2186 res = LLVMBuildFMul(builder, a, rcp_a, "");
2187 res = LLVMBuildFSub(builder, two, res, "");
2188 res = LLVMBuildFMul(builder, rcp_a, res, "");
2189
2190 return res;
2191 }
2192
2193
2194 LLVMValueRef
2195 lp_build_rcp(struct lp_build_context *bld,
2196 LLVMValueRef a)
2197 {
2198 LLVMBuilderRef builder = bld->gallivm->builder;
2199 const struct lp_type type = bld->type;
2200
2201 assert(lp_check_value(type, a));
2202
2203 if(a == bld->zero)
2204 return bld->undef;
2205 if(a == bld->one)
2206 return bld->one;
2207 if(a == bld->undef)
2208 return bld->undef;
2209
2210 assert(type.floating);
2211
2212 if(LLVMIsConstant(a))
2213 return LLVMConstFDiv(bld->one, a);
2214
2215 /*
2216 * We don't use RCPPS because:
2217 * - it only has 10 bits of precision
2218 * - it doesn't even get the reciprocal of 1.0 exactly
2219 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2220 * - for recent processors the benefit over DIVPS is marginal, and case
2221 *   dependent
2222 *
2223 * We could still use it on certain processors if benchmarks show that
2224 * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
2225 * particular uses that require fewer workarounds.
2226 */
2227
2228 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2229 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2230 const unsigned num_iterations = 0;
2231 LLVMValueRef res;
2232 unsigned i;
2233 const char *intrinsic = NULL;
2234
2235 if (type.length == 4) {
2236 intrinsic = "llvm.x86.sse.rcp.ps";
2237 }
2238 else {
2239 intrinsic = "llvm.x86.avx.rcp.ps.256";
2240 }
2241
2242 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2243
2244 for (i = 0; i < num_iterations; ++i) {
2245 res = lp_build_rcp_refine(bld, a, res);
2246 }
2247
2248 return res;
2249 }
2250
2251 return LLVMBuildFDiv(builder, bld->one, a, "");
2252 }
2253
2254
2255 /**
2256 * Do one Newton-Raphson step to improve rsqrt precision:
2257 *
2258 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2259 *
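* e.g. with a = 2 and x_0 = 0.7:
*
*    x_1 = 0.5 * 0.7   * (3 - 2 * 0.7^2)   = 0.707
*    x_2 = 0.5 * 0.707 * (3 - 2 * 0.707^2) = 0.70710676  (1/sqrt(2) = 0.70710678)
*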
2260 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2261 */
2262 static INLINE LLVMValueRef
2263 lp_build_rsqrt_refine(struct lp_build_context *bld,
2264 LLVMValueRef a,
2265 LLVMValueRef rsqrt_a)
2266 {
2267 LLVMBuilderRef builder = bld->gallivm->builder;
2268 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2269 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2270 LLVMValueRef res;
2271
2272 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2273 res = LLVMBuildFMul(builder, a, res, "");
2274 res = LLVMBuildFSub(builder, three, res, "");
2275 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2276 res = LLVMBuildFMul(builder, half, res, "");
2277
2278 return res;
2279 }
2280
2281
2282 /**
2283 * Generate 1/sqrt(a).
2284 * Result is undefined for values < 0, infinity for +0.
2285 */
2286 LLVMValueRef
2287 lp_build_rsqrt(struct lp_build_context *bld,
2288 LLVMValueRef a)
2289 {
2290 LLVMBuilderRef builder = bld->gallivm->builder;
2291 const struct lp_type type = bld->type;
2292
2293 assert(lp_check_value(type, a));
2294
2295 assert(type.floating);
2296
2297 /*
2298 * This should be faster but all denormals will end up as infinity.
2299 */
2300 if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2301 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
2302 const unsigned num_iterations = 1;
2303 LLVMValueRef res;
2304 unsigned i;
2305 const char *intrinsic = NULL;
2306
2307 if (type.length == 4) {
2308 intrinsic = "llvm.x86.sse.rsqrt.ps";
2309 }
2310 else {
2311 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2312 }
2313 if (num_iterations) {
2314 /*
2315 * Newton-Raphson will result in NaN instead of infinity for zero,
2316 * and NaN instead of zero for infinity.
2317 * Also, need to ensure rsqrt(1.0) == 1.0.
2318 * All numbers smaller than FLT_MIN will result in +infinity
2319 * (rsqrtps treats all denormals as zero).
2320 */
2321 /*
2322 * Certain non-C99 compilers don't know INFINITY, and might not support
2323 * hacks to evaluate it at compile time either.
2324 */
2325 const unsigned posinf_int = 0x7F800000;
2326 LLVMValueRef cmp;
2327 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2328 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2329
2330 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2331
2332 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2333
2334 for (i = 0; i < num_iterations; ++i) {
2335 res = lp_build_rsqrt_refine(bld, a, res);
2336 }
2337 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2338 res = lp_build_select(bld, cmp, inf, res);
2339 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2340 res = lp_build_select(bld, cmp, bld->zero, res);
2341 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2342 res = lp_build_select(bld, cmp, bld->one, res);
2343 }
2344 else {
2345 /* rsqrt(1.0) != 1.0 here */
2346 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2347
2348 }
2349
2350 return res;
2351 }
2352
2353 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2354 }
2355
2356
2357 /**
2358 * Generate sin(a) using an SSE2-style cephes polynomial approximation.
2359 */
2360 LLVMValueRef
2361 lp_build_sin(struct lp_build_context *bld,
2362 LLVMValueRef a)
2363 {
2364 struct gallivm_state *gallivm = bld->gallivm;
2365 LLVMBuilderRef builder = gallivm->builder;
2366 struct lp_type int_type = lp_int_type(bld->type);
2367 LLVMBuilderRef b = builder;
2368
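/*
 * Overview: this is a cephes-style vectorized evaluation (apparently
 * following Julien Pommier's sse_mathfun sin_ps): reduce x to
 * [-Pi/4, Pi/4] by the nearest multiple of Pi/2, evaluate both the sin
 * and cos polynomials, select per quadrant, and fix up the sign last.
 */
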
2369 /*
2370 * take the absolute value,
2371 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2372 */
2373
2374 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2375 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2376
2377 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2378 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2379
2380 /*
2381 * extract the sign bit (upper one)
2382 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2383 */
2384 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2385 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2386
2387 /*
2388 * scale by 4/Pi
2389 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2390 */
2391
2392 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2393 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2394
2395 /*
2396 * store the integer part of y in mm0
2397 * emm2 = _mm_cvttps_epi32(y);
2398 */
2399
2400 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2401
2402 /*
2403 * j=(j+1) & (~1) (see the cephes sources)
2404 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2405 */
2406
2407 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2408 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2409 /*
2410 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2411 */
2412 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2413 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2414
2415 /*
2416 * y = _mm_cvtepi32_ps(emm2);
2417 */
2418 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2419
2420 /* get the swap sign flag
2421 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2422 */
2423 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2424 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2425
2426 /*
2427 * emm2 = _mm_slli_epi32(emm0, 29);
2428 */
2429 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2430 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2431
2432 /*
2433 * get the polynomial selection mask
2434 * there is one polynomial for 0 <= x <= Pi/4
2435 * and another one for Pi/4 < x <= Pi/2
2436 * Both branches will be computed.
2437 *
2438 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2439 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2440 */
2441
2442 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2443 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2444 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2445 int_type, PIPE_FUNC_EQUAL,
2446 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2447 /*
2448 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2449 */
2450 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2451
2452 /*
2453 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2454 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2455 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2456 */
2457 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2458 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2459 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2460
2461 /*
2462 * The magic pass: "Extended precision modular arithmetic"
2463 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2464 * xmm1 = _mm_mul_ps(y, xmm1);
2465 * xmm2 = _mm_mul_ps(y, xmm2);
2466 * xmm3 = _mm_mul_ps(y, xmm3);
2467 */
2468 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2469 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2470 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2471
2472 /*
2473 * x = _mm_add_ps(x, xmm1);
2474 * x = _mm_add_ps(x, xmm2);
2475 * x = _mm_add_ps(x, xmm3);
2476 */
2477
2478 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2479 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2480 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2481
2482 /*
2483 * Evaluate the first polynomial (0 <= x <= Pi/4)
2484 *
2485 * z = _mm_mul_ps(x,x);
2486 */
2487 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2488
2489 /*
2490 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2491 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2492 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2493 */
2494 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2495 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2496 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2497
2498 /*
2499 * y = *(v4sf*)_ps_coscof_p0;
2500 * y = _mm_mul_ps(y, z);
2501 */
2502 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2503 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2504 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2505 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2506 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2507 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2508
2509
2510 /*
2511 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2512 * y = _mm_sub_ps(y, tmp);
2513 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2514 */
2515 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2516 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2517 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2518 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2519 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2520
2521 /*
2522 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2523 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2524 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2525 */
2526 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2527 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2528 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2529
2530 /*
2531 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2532 *
2533 * y2 = *(v4sf*)_ps_sincof_p0;
2534 * y2 = _mm_mul_ps(y2, z);
2535 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2536 * y2 = _mm_mul_ps(y2, z);
2537 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2538 * y2 = _mm_mul_ps(y2, z);
2539 * y2 = _mm_mul_ps(y2, x);
2540 * y2 = _mm_add_ps(y2, x);
2541 */
2542
2543 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2544 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2545 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2546 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2547 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2548 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2549 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2550
2551 /*
2552 * select the correct result from the two polynomials
2553 * xmm3 = poly_mask;
2554 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2555 * y = _mm_andnot_ps(xmm3, y);
2556 * y = _mm_add_ps(y,y2);
2557 */
2558 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2559 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2560 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2561 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2562 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2563 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2564 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2565
2566 /*
2567 * update the sign
2568 * y = _mm_xor_ps(y, sign_bit);
2569 */
2570 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2571 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2572 return y_result;
2573 }
2574
2575
2576 /**
2577 * Generate cos(a) using an SSE2-style cephes polynomial approximation.
2578 */
2579 LLVMValueRef
2580 lp_build_cos(struct lp_build_context *bld,
2581 LLVMValueRef a)
2582 {
2583 struct gallivm_state *gallivm = bld->gallivm;
2584 LLVMBuilderRef builder = gallivm->builder;
2585 struct lp_type int_type = lp_int_type(bld->type);
2586 LLVMBuilderRef b = builder;
2587
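/*
 * Same range reduction as lp_build_sin() above, with the quadrant index
 * shifted by 2 (cos(x) = sin(x + Pi/2)) and the sign derived with an
 * andnot; the polynomial evaluation and selection are otherwise identical.
 */
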
2588 /*
2589 * take the absolute value,
2590 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2591 */
2592
2593 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2594 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2595
2596 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2597 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2598
2599 /*
2600 * scale by 4/Pi
2601 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2602 */
2603
2604 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2605 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2606
2607 /*
2608 * store the integer part of y in mm0
2609 * emm2 = _mm_cvttps_epi32(y);
2610 */
2611
2612 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2613
2614 /*
2615 * j=(j+1) & (~1) (see the cephes sources)
2616 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2617 */
2618
2619 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2620 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2621 /*
2622 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2623 */
2624 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2625 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2626
2627 /*
2628 * y = _mm_cvtepi32_ps(emm2);
2629 */
2630 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2631
2632
2633 /*
2634 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2635 */
2636 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2637 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2638
2639
2640 /* get the swap sign flag
2641 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2642 */
2643 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2644 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2645 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2646 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2647
2648 /*
2649 * emm2 = _mm_slli_epi32(emm0, 29);
2650 */
2651 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2652 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2653
2654 /*
2655 * get the polynomial selection mask
2656 * there is one polynomial for 0 <= x <= Pi/4
2657 * and another one for Pi/4 < x <= Pi/2
2658 * Both branches will be computed.
2659 *
2660 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2661 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2662 */
2663
2664 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2665 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2666 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2667 int_type, PIPE_FUNC_EQUAL,
2668 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2669
2670 /*
2671 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2672 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2673 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2674 */
2675 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2676 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2677 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2678
2679 /*
2680 * The magic pass: "Extended precision modular arithmetic"
2681 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2682 * xmm1 = _mm_mul_ps(y, xmm1);
2683 * xmm2 = _mm_mul_ps(y, xmm2);
2684 * xmm3 = _mm_mul_ps(y, xmm3);
2685 */
2686 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2687 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2688 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2689
2690 /*
2691 * x = _mm_add_ps(x, xmm1);
2692 * x = _mm_add_ps(x, xmm2);
2693 * x = _mm_add_ps(x, xmm3);
2694 */
2695
2696 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2697 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2698 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2699
2700 /*
2701 * Evaluate the first polynomial (0 <= x <= Pi/4)
2702 *
2703 * z = _mm_mul_ps(x,x);
2704 */
2705 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2706
2707 /*
2708 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2709 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2710 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2711 */
2712 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2713 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2714 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2715
2716 /*
2717 * y = *(v4sf*)_ps_coscof_p0;
2718 * y = _mm_mul_ps(y, z);
2719 */
2720 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2721 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2722 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2723 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2724 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2725 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2726
2727
2728 /*
2729 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2730 * y = _mm_sub_ps(y, tmp);
2731 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2732 */
2733 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2734 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2735 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2736 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2737 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2738
2739 /*
2740 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2741 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2742 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2743 */
2744 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2745 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2746 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2747
2748 /*
2749 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
2750 *
2751 * y2 = *(v4sf*)_ps_sincof_p0;
2752 * y2 = _mm_mul_ps(y2, z);
2753 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2754 * y2 = _mm_mul_ps(y2, z);
2755 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2756 * y2 = _mm_mul_ps(y2, z);
2757 * y2 = _mm_mul_ps(y2, x);
2758 * y2 = _mm_add_ps(y2, x);
2759 */
2760
2761 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2762 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2763 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2764 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2765 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2766 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2767 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2768
2769 /*
2770 * select the correct result from the two polynomials
2771 * xmm3 = poly_mask;
2772 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2773 * y = _mm_andnot_ps(xmm3, y);
2774 * y = _mm_add_ps(y,y2);
2775 */
2776 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2777 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2778 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2779 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2780 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2781 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2782
2783 /*
2784 * update the sign
2785 * y = _mm_xor_ps(y, sign_bit);
2786 */
2787 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_cos");
2788 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2789 return y_result;
2790 }
2791
2792
2793 /**
2794 * Generate pow(x, y)
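*
* Computed via the identity pow(x, y) = exp2(y * log2(x)); the usual
* caveats for non-positive x apply.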
2795 */
2796 LLVMValueRef
2797 lp_build_pow(struct lp_build_context *bld,
2798 LLVMValueRef x,
2799 LLVMValueRef y)
2800 {
2801 /* TODO: optimize the constant case */
2802 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2803 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2804 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2805 __FUNCTION__);
2806 }
2807
2808 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2809 }
2810
2811
2812 /**
2813 * Generate exp(x)
2814 */
2815 LLVMValueRef
2816 lp_build_exp(struct lp_build_context *bld,
2817 LLVMValueRef x)
2818 {
2819 /* log2(e) = 1/log(2) */
2820 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2821 1.4426950408889634);
2822
2823 assert(lp_check_value(bld->type, x));
2824
2825 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2826 }
2827
2828
2829 /**
2830 * Generate log(x)
2831 */
2832 LLVMValueRef
2833 lp_build_log(struct lp_build_context *bld,
2834 LLVMValueRef x)
2835 {
2836 /* log(2) */
2837 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2838 0.69314718055994529);
2839
2840 assert(lp_check_value(bld->type, x));
2841
2842 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2843 }
2844
2845
2846 /**
2847 * Generate polynomial.
2848 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2849 */
2850 static LLVMValueRef
2851 lp_build_polynomial(struct lp_build_context *bld,
2852 LLVMValueRef x,
2853 const double *coeffs,
2854 unsigned num_coeffs)
2855 {
2856 const struct lp_type type = bld->type;
2857 LLVMValueRef even = NULL, odd = NULL;
2858 LLVMValueRef x2;
2859 unsigned i;
2860
2861 assert(lp_check_value(bld->type, x));
2862
2863 /* TODO: optimize the constant case */
2864 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2865 LLVMIsConstant(x)) {
2866 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2867 __FUNCTION__);
2868 }
2869
2870 /*
2871 * Calculate odd and even terms separately to decrease data dependency
2872 * Ex:
2873 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2874 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
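* Each half is a Horner evaluation in x^2, so the two dependency chains
* can execute in parallel and are merged with a final multiply-add.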
2875 */
2876 x2 = lp_build_mul(bld, x, x);
2877
2878 for (i = num_coeffs; i--; ) {
2879 LLVMValueRef coeff;
2880
2881 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2882
2883 if (i % 2 == 0) {
2884 if (even)
2885 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2886 else
2887 even = coeff;
2888 } else {
2889 if (odd)
2890 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2891 else
2892 odd = coeff;
2893 }
2894 }
2895
2896 if (odd)
2897 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2898 else if (even)
2899 return even;
2900 else
2901 return bld->undef;
2902 }
2903
2904
2905 /**
2906 * Minimax polynomial fit of 2**x, in range [0, 1[
2907 */
2908 const double lp_build_exp2_polynomial[] = {
2909 #if EXP_POLY_DEGREE == 5
2910 0.999999925063526176901,
2911 0.693153073200168932794,
2912 0.240153617044375388211,
2913 0.0558263180532956664775,
2914 0.00898934009049466391101,
2915 0.00187757667519147912699
2916 #elif EXP_POLY_DEGREE == 4
2917 1.00000259337069434683,
2918 0.693003834469974940458,
2919 0.24144275689150793076,
2920 0.0520114606103070150235,
2921 0.0135341679161270268764
2922 #elif EXP_POLY_DEGREE == 3
2923 0.999925218562710312959,
2924 0.695833540494823811697,
2925 0.226067155427249155588,
2926 0.0780245226406372992967
2927 #elif EXP_POLY_DEGREE == 2
2928 1.00172476321474503578,
2929 0.657636275736077639316,
2930 0.33718943461968720704
2931 #else
2932 #error
2933 #endif
2934 };
2935
2936
2937 void
2938 lp_build_exp2_approx(struct lp_build_context *bld,
2939 LLVMValueRef x,
2940 LLVMValueRef *p_exp2_int_part,
2941 LLVMValueRef *p_frac_part,
2942 LLVMValueRef *p_exp2)
2943 {
2944 LLVMBuilderRef builder = bld->gallivm->builder;
2945 const struct lp_type type = bld->type;
2946 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2947 LLVMValueRef ipart = NULL;
2948 LLVMValueRef fpart = NULL;
2949 LLVMValueRef expipart = NULL;
2950 LLVMValueRef expfpart = NULL;
2951 LLVMValueRef res = NULL;
2952
2953 assert(lp_check_value(bld->type, x));
2954
2955 if(p_exp2_int_part || p_frac_part || p_exp2) {
2956 /* TODO: optimize the constant case */
2957 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2958 LLVMIsConstant(x)) {
2959 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2960 __FUNCTION__);
2961 }
2962
2963 assert(type.floating && type.width == 32);
2964
2965 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
2966 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2967
2968 /* ipart = floor(x) */
2969 /* fpart = x - ipart */
2970 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2971 }
2972
2973 if(p_exp2_int_part || p_exp2) {
2974 /* expipart = (float) (1 << ipart) */
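/* i.e. put the biased exponent (ipart + 127) into the exponent field
 * of an IEEE-754 single, constructing the float 2^ipart exactly */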
2975 expipart = LLVMBuildAdd(builder, ipart,
2976 lp_build_const_int_vec(bld->gallivm, type, 127), "");
2977 expipart = LLVMBuildShl(builder, expipart,
2978 lp_build_const_int_vec(bld->gallivm, type, 23), "");
2979 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2980 }
2981
2982 if(p_exp2) {
2983 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2984 Elements(lp_build_exp2_polynomial));
2985
2986 res = LLVMBuildFMul(builder, expipart, expfpart, "");
2987 }
2988
2989 if(p_exp2_int_part)
2990 *p_exp2_int_part = expipart;
2991
2992 if(p_frac_part)
2993 *p_frac_part = fpart;
2994
2995 if(p_exp2)
2996 *p_exp2 = res;
2997 }
2998
2999
3000 LLVMValueRef
3001 lp_build_exp2(struct lp_build_context *bld,
3002 LLVMValueRef x)
3003 {
3004 LLVMValueRef res;
3005 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
3006 return res;
3007 }
3008
3009
3010 /**
3011 * Extract the exponent of an IEEE-754 floating point value.
3012 *
3013 * Optionally apply an integer bias.
3014 *
3015 * Result is an integer value with
3016 *
3017 * ifloor(log2(x)) + bias
3018 */
3019 LLVMValueRef
3020 lp_build_extract_exponent(struct lp_build_context *bld,
3021 LLVMValueRef x,
3022 int bias)
3023 {
3024 LLVMBuilderRef builder = bld->gallivm->builder;
3025 const struct lp_type type = bld->type;
3026 unsigned mantissa = lp_mantissa(type);
3027 LLVMValueRef res;
3028
3029 assert(type.floating);
3030
3031 assert(lp_check_value(bld->type, x));
3032
3033 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3034
3035 res = LLVMBuildLShr(builder, x,
3036 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3037 res = LLVMBuildAnd(builder, res,
3038 lp_build_const_int_vec(bld->gallivm, type, 255), "");
3039 res = LLVMBuildSub(builder, res,
3040 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3041
3042 return res;
3043 }
3044
3045
3046 /**
3047 * Extract the mantissa of a floating point value.
3048 *
3049 * Result is a floating point value with
3050 *
3051 *    x / 2**ifloor(log2(x))
3052 */
3053 LLVMValueRef
3054 lp_build_extract_mantissa(struct lp_build_context *bld,
3055 LLVMValueRef x)
3056 {
3057 LLVMBuilderRef builder = bld->gallivm->builder;
3058 const struct lp_type type = bld->type;
3059 unsigned mantissa = lp_mantissa(type);
3060 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3061 (1ULL << mantissa) - 1);
3062 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3063 LLVMValueRef res;
3064
3065 assert(lp_check_value(bld->type, x));
3066
3067 assert(type.floating);
3068
3069 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3070
3071 /* res = x / 2**ipart */
3072 res = LLVMBuildAnd(builder, x, mantmask, "");
3073 res = LLVMBuildOr(builder, res, one, "");
3074 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3075
3076 return res;
3077 }
3078
3079
3080
3081 /**
3082 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
3083 * These coefficients can be generated with
3084 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3085 */
3086 const double lp_build_log2_polynomial[] = {
3087 #if LOG_POLY_DEGREE == 5
3088 2.88539008148777786488L,
3089 0.961796878841293367824L,
3090 0.577058946784739859012L,
3091 0.412914355135828735411L,
3092 0.308591899232910175289L,
3093 0.352376952300281371868L,
3094 #elif LOG_POLY_DEGREE == 4
3095 2.88539009343309178325L,
3096 0.961791550404184197881L,
3097 0.577440339438736392009L,
3098 0.403343858251329912514L,
3099 0.406718052498846252698L,
3100 #elif LOG_POLY_DEGREE == 3
3101 2.88538959748872753838L,
3102 0.961932915889597772928L,
3103 0.571118517972136195241L,
3104 0.493997535084709500285L,
3105 #else
3106 #error
3107 #endif
3108 };
3109
3110 /**
3111 * See http://www.devmaster.net/forums/showthread.php?p=43580
3112 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3113 * http://www.nezumi.demon.co.uk/consult/logx.htm
3114 */
3115 void
3116 lp_build_log2_approx(struct lp_build_context *bld,
3117 LLVMValueRef x,
3118 LLVMValueRef *p_exp,
3119 LLVMValueRef *p_floor_log2,
3120 LLVMValueRef *p_log2)
3121 {
3122 LLVMBuilderRef builder = bld->gallivm->builder;
3123 const struct lp_type type = bld->type;
3124 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3125 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3126
3127 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3128 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3129 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3130
3131 LLVMValueRef i = NULL;
3132 LLVMValueRef y = NULL;
3133 LLVMValueRef z = NULL;
3134 LLVMValueRef exp = NULL;
3135 LLVMValueRef mant = NULL;
3136 LLVMValueRef logexp = NULL;
3137 LLVMValueRef logmant = NULL;
3138 LLVMValueRef res = NULL;
3139
3140 assert(lp_check_value(bld->type, x));
3141
3142 if(p_exp || p_floor_log2 || p_log2) {
3143 /* TODO: optimize the constant case */
3144 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3145 LLVMIsConstant(x)) {
3146 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3147 __FUNCTION__);
3148 }
3149
3150 assert(type.floating && type.width == 32);
3151
3152 /*
3153 * We don't explicitly handle denormalized numbers. They will yield a
3154 * result in the neighbourhood of -127, which appears to be adequate.
3156 */
3157
3158 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3159
3160 /* exp = (float) exponent(x) */
3161 exp = LLVMBuildAnd(builder, i, expmask, "");
3162 }
3163
3164 if(p_floor_log2 || p_log2) {
3165 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3166 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3167 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3168 }
3169
3170 if(p_log2) {
3171 /* mant = 1 + (float) mantissa(x) */
3172 mant = LLVMBuildAnd(builder, i, mantmask, "");
3173 mant = LLVMBuildOr(builder, mant, one, "");
3174 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3175
3176 /* y = (mant - 1) / (mant + 1) */
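/* with y = (mant - 1)/(mant + 1) we have mant = (1 + y)/(1 - y), so
 * log2(mant) = (2/ln(2)) * atanh(y), an odd series in y; the minimax
 * polynomial approximates it as y * P(y^2) (note its leading
 * coefficient 2.8853... = 2/ln(2)) */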
3177 y = lp_build_div(bld,
3178 lp_build_sub(bld, mant, bld->one),
3179 lp_build_add(bld, mant, bld->one)
3180 );
3181
3182 /* z = y^2 */
3183 z = lp_build_mul(bld, y, y);
3184
3185 /* compute P(z) */
3186 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3187 Elements(lp_build_log2_polynomial));
3188
3189 /* logmant = y * P(z) */
3190 logmant = lp_build_mul(bld, y, logmant);
3191
3192 res = lp_build_add(bld, logmant, logexp);
3193 }
3194
3195 if(p_exp) {
3196 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3197 *p_exp = exp;
3198 }
3199
3200 if(p_floor_log2)
3201 *p_floor_log2 = logexp;
3202
3203 if(p_log2)
3204 *p_log2 = res;
3205 }
3206
3207
3208 LLVMValueRef
3209 lp_build_log2(struct lp_build_context *bld,
3210 LLVMValueRef x)
3211 {
3212 LLVMValueRef res;
3213 lp_build_log2_approx(bld, x, NULL, NULL, &res);
3214 return res;
3215 }
3216
3217
3218 /**
3219 * Faster (and less accurate) log2.
3220 *
3221 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3222 *
3223 * Piece-wise linear approximation, with exact results when x is a
3224 * power of two.
3225 *
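* e.g. fast_log2(8.0) = (3 - 1) + 8/2^3 = 3    (exact, power of two)
*      fast_log2(6.0) = (2 - 1) + 6/2^2 = 2.5  (vs. log2(6) = 2.585)
*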
3226 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3227 */
3228 LLVMValueRef
3229 lp_build_fast_log2(struct lp_build_context *bld,
3230 LLVMValueRef x)
3231 {
3232 LLVMBuilderRef builder = bld->gallivm->builder;
3233 LLVMValueRef ipart;
3234 LLVMValueRef fpart;
3235
3236 assert(lp_check_value(bld->type, x));
3237
3238 assert(bld->type.floating);
3239
3240 /* ipart = floor(log2(x)) - 1 */
3241 ipart = lp_build_extract_exponent(bld, x, -1);
3242 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3243
3244 /* fpart = x / 2**ipart */
3245 fpart = lp_build_extract_mantissa(bld, x);
3246
3247 /* ipart + fpart */
3248 return LLVMBuildFAdd(builder, ipart, fpart, "");
3249 }
3250
3251
3252 /**
3253 * Fast implementation of iround(log2(x)).
3254 *
3255 * Not an approximation -- it should give accurate results all the time.
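*
* e.g. for x = 6: 6 * sqrt(2) = 8.49... has exponent 3, matching
* iround(log2(6)) = iround(2.585) = 3; without the sqrt(2) scaling the
* extracted exponent alone would floor to 2.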
3256 */
3257 LLVMValueRef
3258 lp_build_ilog2(struct lp_build_context *bld,
3259 LLVMValueRef x)
3260 {
3261 LLVMBuilderRef builder = bld->gallivm->builder;
3262 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3263 LLVMValueRef ipart;
3264
3265 assert(bld->type.floating);
3266
3267 assert(lp_check_value(bld->type, x));
3268
3269 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */
3270 x = LLVMBuildFMul(builder, x, sqrt2, "");
3271
3272 /* ipart = floor(log2(x) + 0.5) */
3273 ipart = lp_build_extract_exponent(bld, x, 0);
3274
3275 return ipart;
3276 }
3277
3278 LLVMValueRef
3279 lp_build_mod(struct lp_build_context *bld,
3280 LLVMValueRef x,
3281 LLVMValueRef y)
3282 {
3283 LLVMBuilderRef builder = bld->gallivm->builder;
3284 LLVMValueRef res;
3285 const struct lp_type type = bld->type;
3286
3287 assert(lp_check_value(type, x));
3288 assert(lp_check_value(type, y));
3289
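/* note: like C's % and fmod(), LLVM's frem/srem results take the sign
 * of the dividend, which differs from GLSL's mod() for negative x */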
3290 if (type.floating)
3291 res = LLVMBuildFRem(builder, x, y, "");
3292 else if (type.sign)
3293 res = LLVMBuildSRem(builder, x, y, "");
3294 else
3295 res = LLVMBuildURem(builder, x, y, "");
3296 return res;
3297 }