[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include <float.h>
49
50 #include "util/u_memory.h"
51 #include "util/u_debug.h"
52 #include "util/u_math.h"
53 #include "util/u_string.h"
54 #include "util/u_cpu_detect.h"
55
56 #include "lp_bld_type.h"
57 #include "lp_bld_const.h"
58 #include "lp_bld_init.h"
59 #include "lp_bld_intr.h"
60 #include "lp_bld_logic.h"
61 #include "lp_bld_pack.h"
62 #include "lp_bld_debug.h"
63 #include "lp_bld_bitarit.h"
64 #include "lp_bld_arit.h"
65
66
67 #define EXP_POLY_DEGREE 5
68
69 #define LOG_POLY_DEGREE 4
70
71
72 /**
73 * Generate min(a, b)
74 * No checks for the special-case values of a or b being 0 or 1 are done.
75 */
76 static LLVMValueRef
77 lp_build_min_simple(struct lp_build_context *bld,
78 LLVMValueRef a,
79 LLVMValueRef b)
80 {
81 const struct lp_type type = bld->type;
82 const char *intrinsic = NULL;
83 unsigned intr_size = 0;
84 LLVMValueRef cond;
85
86 assert(lp_check_value(type, a));
87 assert(lp_check_value(type, b));
88
89 /* TODO: optimize the constant case */
90
91 if (type.floating && util_cpu_caps.has_sse) {
92 if (type.width == 32) {
93 if (type.length == 1) {
94 intrinsic = "llvm.x86.sse.min.ss";
95 intr_size = 128;
96 }
97 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
98 intrinsic = "llvm.x86.sse.min.ps";
99 intr_size = 128;
100 }
101 else {
102 intrinsic = "llvm.x86.avx.min.ps.256";
103 intr_size = 256;
104 }
105 }
106 if (type.width == 64 && util_cpu_caps.has_sse2) {
107 if (type.length == 1) {
108 intrinsic = "llvm.x86.sse2.min.sd";
109 intr_size = 128;
110 }
111 else if (type.length == 2 || !util_cpu_caps.has_avx) {
112 intrinsic = "llvm.x86.sse2.min.pd";
113 intr_size = 128;
114 }
115 else {
116 intrinsic = "llvm.x86.avx.min.pd.256";
117 intr_size = 256;
118 }
119 }
120 }
121 else if (type.floating && util_cpu_caps.has_altivec) {
122 if (type.width == 32 && type.length == 4) {
123 intrinsic = "llvm.ppc.altivec.vminfp";
124 intr_size = 128;
125 }
126 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
127 intr_size = 128;
128 if ((type.width == 8 || type.width == 16) &&
129 (type.width * type.length <= 64) &&
130 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
131 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
132 __FUNCTION__);
133 }
134 if (type.width == 8 && !type.sign) {
135 intrinsic = "llvm.x86.sse2.pminu.b";
136 }
137 else if (type.width == 16 && type.sign) {
138 intrinsic = "llvm.x86.sse2.pmins.w";
139 }
140 if (util_cpu_caps.has_sse4_1) {
141 if (type.width == 8 && type.sign) {
142 intrinsic = "llvm.x86.sse41.pminsb";
143 }
144 if (type.width == 16 && !type.sign) {
145 intrinsic = "llvm.x86.sse41.pminuw";
146 }
147 if (type.width == 32 && !type.sign) {
148 intrinsic = "llvm.x86.sse41.pminud";
149 }
150 if (type.width == 32 && type.sign) {
151 intrinsic = "llvm.x86.sse41.pminsd";
152 }
153 }
154 } else if (util_cpu_caps.has_altivec) {
155 intr_size = 128;
156 if (type.width == 8) {
157 if (!type.sign) {
158 intrinsic = "llvm.ppc.altivec.vminub";
159 } else {
160 intrinsic = "llvm.ppc.altivec.vminsb";
161 }
162 } else if (type.width == 16) {
163 if (!type.sign) {
164 intrinsic = "llvm.ppc.altivec.vminuh";
165 } else {
166 intrinsic = "llvm.ppc.altivec.vminsh";
167 }
168 } else if (type.width == 32) {
169 if (!type.sign) {
170 intrinsic = "llvm.ppc.altivec.vminuw";
171 } else {
172 intrinsic = "llvm.ppc.altivec.vminsw";
173 }
174 }
175 }
176
177 if(intrinsic) {
178 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
179 type,
180 intr_size, a, b);
181 }
182
183 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
184 return lp_build_select(bld, cond, a, b);
185 }
186
187
188 /**
189 * Generate max(a, b)
190 * No checks for the special-case values of a or b being 0 or 1 are done.
191 */
192 static LLVMValueRef
193 lp_build_max_simple(struct lp_build_context *bld,
194 LLVMValueRef a,
195 LLVMValueRef b)
196 {
197 const struct lp_type type = bld->type;
198 const char *intrinsic = NULL;
199 unsigned intr_size = 0;
200 LLVMValueRef cond;
201
202 assert(lp_check_value(type, a));
203 assert(lp_check_value(type, b));
204
205 /* TODO: optimize the constant case */
206
207 if (type.floating && util_cpu_caps.has_sse) {
208 if (type.width == 32) {
209 if (type.length == 1) {
210 intrinsic = "llvm.x86.sse.max.ss";
211 intr_size = 128;
212 }
213 else if (type.length <= 4 || !util_cpu_caps.has_avx) {
214 intrinsic = "llvm.x86.sse.max.ps";
215 intr_size = 128;
216 }
217 else {
218 intrinsic = "llvm.x86.avx.max.ps.256";
219 intr_size = 256;
220 }
221 }
222 if (type.width == 64 && util_cpu_caps.has_sse2) {
223 if (type.length == 1) {
224 intrinsic = "llvm.x86.sse2.max.sd";
225 intr_size = 128;
226 }
227 else if (type.length == 2 || !util_cpu_caps.has_avx) {
228 intrinsic = "llvm.x86.sse2.max.pd";
229 intr_size = 128;
230 }
231 else {
232 intrinsic = "llvm.x86.avx.max.pd.256";
233 intr_size = 256;
234 }
235 }
236 }
237 else if (type.floating && util_cpu_caps.has_altivec) {
238 if (type.width == 32 && type.length == 4) {
239 intrinsic = "llvm.ppc.altivec.vmaxfp";
240 intr_size = 128;
241 }
242 } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
243 intr_size = 128;
244 if ((type.width == 8 || type.width == 16) &&
245 (type.width * type.length <= 64) &&
246 (gallivm_debug & GALLIVM_DEBUG_PERF)) {
247 debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
248 __FUNCTION__);
249 }
250 if (type.width == 8 && !type.sign) {
251 intrinsic = "llvm.x86.sse2.pmaxu.b";
252 intr_size = 128;
253 }
254 else if (type.width == 16 && type.sign) {
255 intrinsic = "llvm.x86.sse2.pmaxs.w";
256 }
257 if (util_cpu_caps.has_sse4_1) {
258 if (type.width == 8 && type.sign) {
259 intrinsic = "llvm.x86.sse41.pmaxsb";
260 }
261 if (type.width == 16 && !type.sign) {
262 intrinsic = "llvm.x86.sse41.pmaxuw";
263 }
264 if (type.width == 32 && !type.sign) {
265 intrinsic = "llvm.x86.sse41.pmaxud";
266 }
267 if (type.width == 32 && type.sign) {
268 intrinsic = "llvm.x86.sse41.pmaxsd";
269 }
270 }
271 } else if (util_cpu_caps.has_altivec) {
272 intr_size = 128;
273 if (type.width == 8) {
274 if (!type.sign) {
275 intrinsic = "llvm.ppc.altivec.vmaxub";
276 } else {
277 intrinsic = "llvm.ppc.altivec.vmaxsb";
278 }
279 } else if (type.width == 16) {
280 if (!type.sign) {
281 intrinsic = "llvm.ppc.altivec.vmaxuh";
282 } else {
283 intrinsic = "llvm.ppc.altivec.vmaxsh";
284 }
285 } else if (type.width == 32) {
286 if (!type.sign) {
287 intrinsic = "llvm.ppc.altivec.vmaxuw";
288 } else {
289 intrinsic = "llvm.ppc.altivec.vmaxsw";
290 }
291 }
292 }
293
294 if(intrinsic) {
295 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
296 type,
297 intr_size, a, b);
298 }
299
300 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
301 return lp_build_select(bld, cond, a, b);
302 }
303
304
305 /**
306 * Generate 1 - a, or ~a depending on bld->type.
307 */
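/*
 * For illustration: with unsigned normalized 8-bit values 1.0 is stored as
 * 0xff, and 0xff - a == ~a for every a, which is why the norm/unsigned case
 * below can use a plain bitwise NOT instead of a subtraction.
 */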
308 LLVMValueRef
309 lp_build_comp(struct lp_build_context *bld,
310 LLVMValueRef a)
311 {
312 LLVMBuilderRef builder = bld->gallivm->builder;
313 const struct lp_type type = bld->type;
314
315 assert(lp_check_value(type, a));
316
317 if(a == bld->one)
318 return bld->zero;
319 if(a == bld->zero)
320 return bld->one;
321
322 if(type.norm && !type.floating && !type.fixed && !type.sign) {
323 if(LLVMIsConstant(a))
324 return LLVMConstNot(a);
325 else
326 return LLVMBuildNot(builder, a, "");
327 }
328
329 if(LLVMIsConstant(a))
330 if (type.floating)
331 return LLVMConstFSub(bld->one, a);
332 else
333 return LLVMConstSub(bld->one, a);
334 else
335 if (type.floating)
336 return LLVMBuildFSub(builder, bld->one, a, "");
337 else
338 return LLVMBuildSub(builder, bld->one, a, "");
339 }
340
341
342 /**
343 * Generate a + b
344 */
345 LLVMValueRef
346 lp_build_add(struct lp_build_context *bld,
347 LLVMValueRef a,
348 LLVMValueRef b)
349 {
350 LLVMBuilderRef builder = bld->gallivm->builder;
351 const struct lp_type type = bld->type;
352 LLVMValueRef res;
353
354 assert(lp_check_value(type, a));
355 assert(lp_check_value(type, b));
356
357 if(a == bld->zero)
358 return b;
359 if(b == bld->zero)
360 return a;
361 if(a == bld->undef || b == bld->undef)
362 return bld->undef;
363
364 if(bld->type.norm) {
365 const char *intrinsic = NULL;
366
367 if(a == bld->one || b == bld->one)
368 return bld->one;
369
370 if (type.width * type.length == 128 &&
371 !type.floating && !type.fixed) {
372 if(util_cpu_caps.has_sse2) {
373 if(type.width == 8)
374 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
375 if(type.width == 16)
376 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
377 } else if (util_cpu_caps.has_altivec) {
378 if(type.width == 8)
379 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
380 if(type.width == 16)
381 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsws" : "llvm.ppc.altivec.vadduws";
382 }
383 }
384
385 if(intrinsic)
386 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
387 }
388
389 if(LLVMIsConstant(a) && LLVMIsConstant(b))
390 if (type.floating)
391 res = LLVMConstFAdd(a, b);
392 else
393 res = LLVMConstAdd(a, b);
394 else
395 if (type.floating)
396 res = LLVMBuildFAdd(builder, a, b, "");
397 else
398 res = LLVMBuildAdd(builder, a, b, "");
399
400 /* clamp to ceiling of 1.0 */
401 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
402 res = lp_build_min_simple(bld, res, bld->one);
403
404 /* XXX clamp to floor of -1 or 0??? */
405
406 return res;
407 }
408
409
410 /** Return the scalar sum of the elements of a.
411 * This operation should be avoided whenever possible.
412 */
413 LLVMValueRef
414 lp_build_horizontal_add(struct lp_build_context *bld,
415 LLVMValueRef a)
416 {
417 LLVMBuilderRef builder = bld->gallivm->builder;
418 const struct lp_type type = bld->type;
419 LLVMValueRef index, res;
420 unsigned i, length;
421 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
422 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
423 LLVMValueRef vecres, elem2;
424
425 assert(lp_check_value(type, a));
426
427 if (type.length == 1) {
428 return a;
429 }
430
431 assert(!bld->type.norm);
432
433 /*
434 * For byte vectors we could do much better with psadbw; we use repeated
435 * shuffles/adds here instead. Note that with multiple vectors
436 * this can be done more efficiently as outlined in the intel
437 * optimization manual.
438 * Note: could cause data rearrangement if used with smaller element
439 * sizes.
440 */
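   /*
    * Illustration of the reduction below for a length-8 vector:
    *   [a0..a7] -> [a0+a4, a1+a5, a2+a6, a3+a7] -> [s0+s2, s1+s3]
    * after which the two remaining lanes are extracted and added as scalars.
    */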
441
442 vecres = a;
443 length = type.length / 2;
444 while (length > 1) {
445 LLVMValueRef vec1, vec2;
446 for (i = 0; i < length; i++) {
447 shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
448 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
449 }
450 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
451 LLVMConstVector(shuffles1, length), "");
452 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
453 LLVMConstVector(shuffles2, length), "");
454 if (type.floating) {
455 vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
456 }
457 else {
458 vecres = LLVMBuildAdd(builder, vec1, vec2, "");
459 }
460 length = length >> 1;
461 }
462
463 /* always have vector of size 2 here */
464 assert(length == 1);
465
466 index = lp_build_const_int32(bld->gallivm, 0);
467 res = LLVMBuildExtractElement(builder, vecres, index, "");
468 index = lp_build_const_int32(bld->gallivm, 1);
469 elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
470
471 if (type.floating)
472 res = LLVMBuildFAdd(builder, res, elem2, "");
473 else
474 res = LLVMBuildAdd(builder, res, elem2, "");
475
476 return res;
477 }
478
479 /**
480 * Return the horizontal sums of 4 float vectors as a float4 vector.
481 * This uses the technique as outlined in Intel Optimization Manual.
482 */
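/*
 * Data-flow sketch for four float4 inputs x, y, z, w:
 *   tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
 *   tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
 *   sumtmp[0] = {x0+x2, x1+x3, y0+y2, y1+y3}
 *   sumtmp[1] = {z0+z2, z1+z3, w0+w2, w1+w3}
 * and the final even/odd shuffles plus add yield {sum(x), sum(y), sum(z), sum(w)}.
 */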
483 static LLVMValueRef
484 lp_build_horizontal_add4x4f(struct lp_build_context *bld,
485 LLVMValueRef src[4])
486 {
487 struct gallivm_state *gallivm = bld->gallivm;
488 LLVMBuilderRef builder = gallivm->builder;
489 LLVMValueRef shuffles[4];
490 LLVMValueRef tmp[4];
491 LLVMValueRef sumtmp[2], shuftmp[2];
492
493 /* lower half of regs */
494 shuffles[0] = lp_build_const_int32(gallivm, 0);
495 shuffles[1] = lp_build_const_int32(gallivm, 1);
496 shuffles[2] = lp_build_const_int32(gallivm, 4);
497 shuffles[3] = lp_build_const_int32(gallivm, 5);
498 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
499 LLVMConstVector(shuffles, 4), "");
500 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
501 LLVMConstVector(shuffles, 4), "");
502
503 /* upper half of regs */
504 shuffles[0] = lp_build_const_int32(gallivm, 2);
505 shuffles[1] = lp_build_const_int32(gallivm, 3);
506 shuffles[2] = lp_build_const_int32(gallivm, 6);
507 shuffles[3] = lp_build_const_int32(gallivm, 7);
508 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
509 LLVMConstVector(shuffles, 4), "");
510 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
511 LLVMConstVector(shuffles, 4), "");
512
513 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
514 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
515
516 shuffles[0] = lp_build_const_int32(gallivm, 0);
517 shuffles[1] = lp_build_const_int32(gallivm, 2);
518 shuffles[2] = lp_build_const_int32(gallivm, 4);
519 shuffles[3] = lp_build_const_int32(gallivm, 6);
520 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
521 LLVMConstVector(shuffles, 4), "");
522
523 shuffles[0] = lp_build_const_int32(gallivm, 1);
524 shuffles[1] = lp_build_const_int32(gallivm, 3);
525 shuffles[2] = lp_build_const_int32(gallivm, 5);
526 shuffles[3] = lp_build_const_int32(gallivm, 7);
527 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
528 LLVMConstVector(shuffles, 4), "");
529
530 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
531 }
532
533
534 /*
535 * partially horizontally add 2-4 float vectors with length nx4,
536 * i.e. only four adjacent values in each vector will be added,
537 * assuming values are really grouped in 4 which also determines
538 * output order.
539 *
540 * Return a vector of the same length as the initial vectors,
541 * with the excess elements (if any) being undefined.
542 * The element order is independent of number of input vectors.
543 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
544 * the output order thus will be
545 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
546 */
547 LLVMValueRef
548 lp_build_hadd_partial4(struct lp_build_context *bld,
549 LLVMValueRef vectors[],
550 unsigned num_vecs)
551 {
552 struct gallivm_state *gallivm = bld->gallivm;
553 LLVMBuilderRef builder = gallivm->builder;
554 LLVMValueRef ret_vec;
555 LLVMValueRef tmp[4];
556 const char *intrinsic = NULL;
557
558 assert(num_vecs >= 2 && num_vecs <= 4);
559 assert(bld->type.floating);
560
561 /* only use this with at least 2 vectors, as it is sort of expensive
562 * (depending on cpu) and we always need two horizontal adds anyway,
563 * so a shuffle/add approach might be better.
564 */
565
566 tmp[0] = vectors[0];
567 tmp[1] = vectors[1];
568
569 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
570 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
571
572 if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
573 bld->type.length == 4) {
574 intrinsic = "llvm.x86.sse3.hadd.ps";
575 }
576 else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
577 bld->type.length == 8) {
578 intrinsic = "llvm.x86.avx.hadd.ps.256";
579 }
580 if (intrinsic) {
581 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
582 lp_build_vec_type(gallivm, bld->type),
583 tmp[0], tmp[1]);
584 if (num_vecs > 2) {
585 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
586 lp_build_vec_type(gallivm, bld->type),
587 tmp[2], tmp[3]);
588 }
589 else {
590 tmp[1] = tmp[0];
591 }
592 return lp_build_intrinsic_binary(builder, intrinsic,
593 lp_build_vec_type(gallivm, bld->type),
594 tmp[0], tmp[1]);
595 }
596
597 if (bld->type.length == 4) {
598 ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
599 }
600 else {
601 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
602 unsigned j;
603 unsigned num_iter = bld->type.length / 4;
604 struct lp_type parttype = bld->type;
605 parttype.length = 4;
606 for (j = 0; j < num_iter; j++) {
607 LLVMValueRef partsrc[4];
608 unsigned i;
609 for (i = 0; i < 4; i++) {
610 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
611 }
612 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
613 }
614 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
615 }
616 return ret_vec;
617 }
618
619 /**
620 * Generate a - b
621 */
622 LLVMValueRef
623 lp_build_sub(struct lp_build_context *bld,
624 LLVMValueRef a,
625 LLVMValueRef b)
626 {
627 LLVMBuilderRef builder = bld->gallivm->builder;
628 const struct lp_type type = bld->type;
629 LLVMValueRef res;
630
631 assert(lp_check_value(type, a));
632 assert(lp_check_value(type, b));
633
634 if(b == bld->zero)
635 return a;
636 if(a == bld->undef || b == bld->undef)
637 return bld->undef;
638 if(a == b)
639 return bld->zero;
640
641 if(bld->type.norm) {
642 const char *intrinsic = NULL;
643
644 if(b == bld->one)
645 return bld->zero;
646
647 if (type.width * type.length == 128 &&
648 !type.floating && !type.fixed) {
649 if (util_cpu_caps.has_sse2) {
650 if(type.width == 8)
651 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
652 if(type.width == 16)
653 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
654 } else if (util_cpu_caps.has_altivec) {
655 if(type.width == 8)
656 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
657 if(type.width == 16)
658 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsws" : "llvm.ppc.altivec.vsubuws";
659 }
660 }
661
662 if(intrinsic)
663 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
664 }
665
666 if(LLVMIsConstant(a) && LLVMIsConstant(b))
667 if (type.floating)
668 res = LLVMConstFSub(a, b);
669 else
670 res = LLVMConstSub(a, b);
671 else
672 if (type.floating)
673 res = LLVMBuildFSub(builder, a, b, "");
674 else
675 res = LLVMBuildSub(builder, a, b, "");
676
677 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
678 res = lp_build_max_simple(bld, res, bld->zero);
679
680 return res;
681 }
682
683
684
685 /**
686 * Normalized multiplication.
687 *
688 * There are several approaches here (using 8-bit normalized multiplication as
689 * an example):
690 *
691 * - alpha plus one
692 *
693 * makes the following approximation to the division (Sree)
694 *
695 * a*b/255 ~= (a*(b + 1)) >> 8
696 *
697 * which is the fastest method that satisfies the following OpenGL criteria
698 *
699 * 0*0 = 0 and 255*255 = 255
700 *
701 * - geometric series
702 *
703 * takes the geometric series approximation to the division
704 *
705 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
706 *
707 * in this case we keep just the first two terms, to fit in 16-bit arithmetic
708 *
709 * t/255 ~= (t + (t >> 8)) >> 8
710 *
711 * note that by itself this doesn't satisfy the OpenGL criteria, as it gives
712 * 255*255 = 254, so the special case b = 255 must be accounted for, or
713 * rounding must be used
714 *
715 * - geometric series plus rounding
716 *
717 * when using the geometric series division, instead of truncating the result
718 * use rounding in the approximation (Jim Blinn)
719 *
720 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
721 *
722 * achieving exact results
723 *
724 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
725 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
726 * @sa Michael Herf, The "double blend trick", May 2000,
727 * http://www.stereopsis.com/doubleblend.html
728 */
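/*
 * As a quick sanity check of the rounded geometric series above, take the
 * 8-bit endpoints: for a = b = 255, t = 65025, t >> 8 = 254, and
 * (65025 + 254 + 0x80) >> 8 = 65407 >> 8 = 255, while a = 0 or b = 0 trivially
 * gives 0, so both OpenGL criteria are met exactly.  Without the rounding term
 * the same input yields (65025 + 254) >> 8 = 254, the shortfall noted above.
 */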
729 static LLVMValueRef
730 lp_build_mul_norm(struct gallivm_state *gallivm,
731 struct lp_type wide_type,
732 LLVMValueRef a, LLVMValueRef b)
733 {
734 LLVMBuilderRef builder = gallivm->builder;
735 struct lp_build_context bld;
736 unsigned bits;
737 LLVMValueRef shift;
738 LLVMValueRef half;
739 LLVMValueRef ab;
740
741 assert(!wide_type.floating);
742 assert(lp_check_value(wide_type, a));
743 assert(lp_check_value(wide_type, b));
744
745 lp_build_context_init(&bld, gallivm, wide_type);
746
747 bits = wide_type.width / 2;
748 if (wide_type.sign) {
749 --bits;
750 }
751
752 shift = lp_build_const_int_vec(gallivm, wide_type, bits);
753
754 #if 0
755
756 /* a*b/255 ~= (a*(b + 1)) >> 8 */
757 /* XXX: This would not work for signed types */
758 assert(!wide_type.sign);
759 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, wide_type, 1), "");
760 ab = LLVMBuildMul(builder, a, b, "");
761
762 #else
763
764 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
765 ab = LLVMBuildMul(builder, a, b, "");
766 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, shift, ""), "");
767
768 /* Add rounding term */
769 half = lp_build_const_int_vec(gallivm, wide_type, 1 << (bits - 1));
770 if (wide_type.sign) {
771 LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
772 LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
773 half = lp_build_select(&bld, sign, minus_half, half);
774 }
775 ab = LLVMBuildAdd(builder, ab, half, "");
776
777 #endif
778
779 ab = LLVMBuildLShr(builder, ab, shift, "");
780
781 return ab;
782 }
783
784 /**
785 * Generate a * b
786 */
787 LLVMValueRef
788 lp_build_mul(struct lp_build_context *bld,
789 LLVMValueRef a,
790 LLVMValueRef b)
791 {
792 LLVMBuilderRef builder = bld->gallivm->builder;
793 const struct lp_type type = bld->type;
794 LLVMValueRef shift;
795 LLVMValueRef res;
796
797 assert(lp_check_value(type, a));
798 assert(lp_check_value(type, b));
799
800 if(a == bld->zero)
801 return bld->zero;
802 if(a == bld->one)
803 return b;
804 if(b == bld->zero)
805 return bld->zero;
806 if(b == bld->one)
807 return a;
808 if(a == bld->undef || b == bld->undef)
809 return bld->undef;
810
811 if (!type.floating && !type.fixed && type.norm) {
812 struct lp_type wide_type = lp_wider_type(type);
813 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
814
815 lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
816 lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
817
818 /* PMULLW, PSRLW, PADDW */
819 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
820 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
821
822 ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
823
824 return ab;
825 }
826
827 if(type.fixed)
828 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
829 else
830 shift = NULL;
831
832 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
833 if (type.floating)
834 res = LLVMConstFMul(a, b);
835 else
836 res = LLVMConstMul(a, b);
837 if(shift) {
838 if(type.sign)
839 res = LLVMConstAShr(res, shift);
840 else
841 res = LLVMConstLShr(res, shift);
842 }
843 }
844 else {
845 if (type.floating)
846 res = LLVMBuildFMul(builder, a, b, "");
847 else
848 res = LLVMBuildMul(builder, a, b, "");
849 if(shift) {
850 if(type.sign)
851 res = LLVMBuildAShr(builder, res, shift, "");
852 else
853 res = LLVMBuildLShr(builder, res, shift, "");
854 }
855 }
856
857 return res;
858 }
859
860
861 /**
862 * Optimized multiplication of a vector by a small integer constant.
863 */
864 LLVMValueRef
865 lp_build_mul_imm(struct lp_build_context *bld,
866 LLVMValueRef a,
867 int b)
868 {
869 LLVMBuilderRef builder = bld->gallivm->builder;
870 LLVMValueRef factor;
871
872 assert(lp_check_value(bld->type, a));
873
874 if(b == 0)
875 return bld->zero;
876
877 if(b == 1)
878 return a;
879
880 if(b == -1)
881 return lp_build_negate(bld, a);
882
883 if(b == 2 && bld->type.floating)
884 return lp_build_add(bld, a, a);
885
886 if(util_is_power_of_two(b)) {
887 unsigned shift = ffs(b) - 1;
888
889 if(bld->type.floating) {
890 #if 0
891 /*
892 * Power of two multiplication by directly manipulating the exponent.
893 *
894 * XXX: This might not be always faster, it will introduce a small error
895 * for multiplication by zero, and it will produce wrong results
896 * for Inf and NaN.
897 */
898 unsigned mantissa = lp_mantissa(bld->type);
899 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
900 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
901 a = LLVMBuildAdd(builder, a, factor, "");
902 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
903 return a;
904 #endif
905 }
906 else {
907 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
908 return LLVMBuildShl(builder, a, factor, "");
909 }
910 }
911
912 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
913 return lp_build_mul(bld, a, factor);
914 }
915
916
917 /**
918 * Generate a / b
919 */
920 LLVMValueRef
921 lp_build_div(struct lp_build_context *bld,
922 LLVMValueRef a,
923 LLVMValueRef b)
924 {
925 LLVMBuilderRef builder = bld->gallivm->builder;
926 const struct lp_type type = bld->type;
927
928 assert(lp_check_value(type, a));
929 assert(lp_check_value(type, b));
930
931 if(a == bld->zero)
932 return bld->zero;
933 if(a == bld->one)
934 return lp_build_rcp(bld, b);
935 if(b == bld->zero)
936 return bld->undef;
937 if(b == bld->one)
938 return a;
939 if(a == bld->undef || b == bld->undef)
940 return bld->undef;
941
942 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
943 if (type.floating)
944 return LLVMConstFDiv(a, b);
945 else if (type.sign)
946 return LLVMConstSDiv(a, b);
947 else
948 return LLVMConstUDiv(a, b);
949 }
950
951 if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
952 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
953 type.floating)
954 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
955
956 if (type.floating)
957 return LLVMBuildFDiv(builder, a, b, "");
958 else if (type.sign)
959 return LLVMBuildSDiv(builder, a, b, "");
960 else
961 return LLVMBuildUDiv(builder, a, b, "");
962 }
963
964
965 /**
966 * Linear interpolation helper.
967 *
968 * @param normalized whether we are interpolating normalized values,
969 * encoded in integers twice as wide as their nominal width.
970 *
971 * @sa http://www.stereopsis.com/doubleblend.html
972 */
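/*
 * For the normalized case the computation below amounts to, e.g. for 8-bit
 * unorm values widened to 16 bits with x already scaled to [0, 256]:
 *   res = v0 + ((x * (v1 - v0)) >> 8)
 * so x = 128 (~0.5), v0 = 0, v1 = 255 gives (128 * 255) >> 8 = 127.
 */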
973 static INLINE LLVMValueRef
974 lp_build_lerp_simple(struct lp_build_context *bld,
975 LLVMValueRef x,
976 LLVMValueRef v0,
977 LLVMValueRef v1,
978 bool normalized)
979 {
980 unsigned half_width = bld->type.width/2;
981 LLVMBuilderRef builder = bld->gallivm->builder;
982 LLVMValueRef delta;
983 LLVMValueRef res;
984
985 assert(lp_check_value(bld->type, x));
986 assert(lp_check_value(bld->type, v0));
987 assert(lp_check_value(bld->type, v1));
988
989 delta = lp_build_sub(bld, v1, v0);
990
991 res = lp_build_mul(bld, x, delta);
992
993 if (normalized) {
994 if (bld->type.sign) {
995 res = lp_build_shr_imm(bld, res, half_width - 1);
996 } else {
997 res = lp_build_shr_imm(bld, res, half_width);
998 }
999 }
1000
1001 res = lp_build_add(bld, v0, res);
1002
1003 if ((normalized && !bld->type.sign) || bld->type.fixed) {
1004 /* We need to mask out the high order bits when lerping 8-bit normalized colors stored in 16 bits */
1005 /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
1006 * but it will be wrong for true fixed point use cases. Basically we need
1007 * a more powerful lp_type, capable of further distinguishing the values
1008 * interpretation from the value storage. */
1009 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
1010 }
1011
1012 return res;
1013 }
1014
1015
1016 /**
1017 * Linear interpolation.
1018 */
1019 LLVMValueRef
1020 lp_build_lerp(struct lp_build_context *bld,
1021 LLVMValueRef x,
1022 LLVMValueRef v0,
1023 LLVMValueRef v1)
1024 {
1025 LLVMBuilderRef builder = bld->gallivm->builder;
1026 const struct lp_type type = bld->type;
1027 LLVMValueRef res;
1028
1029 assert(lp_check_value(type, x));
1030 assert(lp_check_value(type, v0));
1031 assert(lp_check_value(type, v1));
1032
1033 if (type.norm) {
1034 struct lp_type wide_type;
1035 struct lp_build_context wide_bld;
1036 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1037 unsigned bits;
1038 LLVMValueRef shift;
1039
1040 assert(type.length >= 2);
1041
1042 /*
1043 * Create a wider integer type, enough to hold the
1044 * intermediate result of the multiplication.
1045 */
1046 memset(&wide_type, 0, sizeof wide_type);
1047 wide_type.sign = type.sign;
1048 wide_type.width = type.width*2;
1049 wide_type.length = type.length/2;
1050
1051 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1052
1053 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
1054 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1055 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1056
1057 /*
1058 * Scale x from [0, 255] to [0, 256]
1059 */
1060
1061 bits = type.width - 1;
1062 if (type.sign) {
1063 --bits;
1064 }
1065
1066 shift = lp_build_const_int_vec(bld->gallivm, wide_type, bits);
1067
1068 xl = lp_build_add(&wide_bld, xl,
1069 LLVMBuildAShr(builder, xl, shift, ""));
1070 xh = lp_build_add(&wide_bld, xh,
1071 LLVMBuildAShr(builder, xh, shift, ""));
1072
1073 /*
1074 * Lerp both halves.
1075 */
1076
1077 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, TRUE);
1078 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, TRUE);
1079
1080 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
1081 } else {
1082 res = lp_build_lerp_simple(bld, x, v0, v1, FALSE);
1083 }
1084
1085 return res;
1086 }
1087
1088
1089 LLVMValueRef
1090 lp_build_lerp_2d(struct lp_build_context *bld,
1091 LLVMValueRef x,
1092 LLVMValueRef y,
1093 LLVMValueRef v00,
1094 LLVMValueRef v01,
1095 LLVMValueRef v10,
1096 LLVMValueRef v11)
1097 {
1098 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
1099 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
1100 return lp_build_lerp(bld, y, v0, v1);
1101 }
1102
1103
1104 /**
1105 * Generate min(a, b)
1106 * Do checks for special cases.
1107 */
1108 LLVMValueRef
1109 lp_build_min(struct lp_build_context *bld,
1110 LLVMValueRef a,
1111 LLVMValueRef b)
1112 {
1113 assert(lp_check_value(bld->type, a));
1114 assert(lp_check_value(bld->type, b));
1115
1116 if(a == bld->undef || b == bld->undef)
1117 return bld->undef;
1118
1119 if(a == b)
1120 return a;
1121
1122 if (bld->type.norm) {
1123 if (!bld->type.sign) {
1124 if (a == bld->zero || b == bld->zero) {
1125 return bld->zero;
1126 }
1127 }
1128 if(a == bld->one)
1129 return b;
1130 if(b == bld->one)
1131 return a;
1132 }
1133
1134 return lp_build_min_simple(bld, a, b);
1135 }
1136
1137
1138 /**
1139 * Generate max(a, b)
1140 * Do checks for special cases.
1141 */
1142 LLVMValueRef
1143 lp_build_max(struct lp_build_context *bld,
1144 LLVMValueRef a,
1145 LLVMValueRef b)
1146 {
1147 assert(lp_check_value(bld->type, a));
1148 assert(lp_check_value(bld->type, b));
1149
1150 if(a == bld->undef || b == bld->undef)
1151 return bld->undef;
1152
1153 if(a == b)
1154 return a;
1155
1156 if(bld->type.norm) {
1157 if(a == bld->one || b == bld->one)
1158 return bld->one;
1159 if (!bld->type.sign) {
1160 if (a == bld->zero) {
1161 return b;
1162 }
1163 if (b == bld->zero) {
1164 return a;
1165 }
1166 }
1167 }
1168
1169 return lp_build_max_simple(bld, a, b);
1170 }
1171
1172
1173 /**
1174 * Generate clamp(a, min, max)
1175 * Do checks for special cases.
1176 */
1177 LLVMValueRef
1178 lp_build_clamp(struct lp_build_context *bld,
1179 LLVMValueRef a,
1180 LLVMValueRef min,
1181 LLVMValueRef max)
1182 {
1183 assert(lp_check_value(bld->type, a));
1184 assert(lp_check_value(bld->type, min));
1185 assert(lp_check_value(bld->type, max));
1186
1187 a = lp_build_min(bld, a, max);
1188 a = lp_build_max(bld, a, min);
1189 return a;
1190 }
1191
1192
1193 /**
1194 * Generate abs(a)
1195 */
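/*
 * For floats this just clears the sign bit: e.g. for 32-bit floats the mask
 * built below is 0x7fffffff.  Integer types use PABS* when available, or
 * fall back to max(a, -a).
 */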
1196 LLVMValueRef
1197 lp_build_abs(struct lp_build_context *bld,
1198 LLVMValueRef a)
1199 {
1200 LLVMBuilderRef builder = bld->gallivm->builder;
1201 const struct lp_type type = bld->type;
1202 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1203
1204 assert(lp_check_value(type, a));
1205
1206 if(!type.sign)
1207 return a;
1208
1209 if(type.floating) {
1210 /* Mask out the sign bit */
1211 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1212 unsigned long long absMask = ~(1ULL << (type.width - 1));
1213 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1214 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1215 a = LLVMBuildAnd(builder, a, mask, "");
1216 a = LLVMBuildBitCast(builder, a, vec_type, "");
1217 return a;
1218 }
1219
1220 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
1221 switch(type.width) {
1222 case 8:
1223 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1224 case 16:
1225 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1226 case 32:
1227 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1228 }
1229 }
1230 else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
1231 (gallivm_debug & GALLIVM_DEBUG_PERF) &&
1232 (type.width == 8 || type.width == 16 || type.width == 32)) {
1233 debug_printf("%s: inefficient code, should split vectors manually\n",
1234 __FUNCTION__);
1235 }
1236
1237 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
1238 }
1239
1240
1241 LLVMValueRef
1242 lp_build_negate(struct lp_build_context *bld,
1243 LLVMValueRef a)
1244 {
1245 LLVMBuilderRef builder = bld->gallivm->builder;
1246
1247 assert(lp_check_value(bld->type, a));
1248
1249 #if HAVE_LLVM >= 0x0207
1250 if (bld->type.floating)
1251 a = LLVMBuildFNeg(builder, a, "");
1252 else
1253 #endif
1254 a = LLVMBuildNeg(builder, a, "");
1255
1256 return a;
1257 }
1258
1259
1260 /** Return -1, 0 or +1 depending on the sign of a */
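/*
 * The floating point path below relies on the IEEE bit layout: e.g. for
 * 32-bit floats 1.0 is 0x3f800000, so OR'ing in the sign bit of 'a'
 * (0x80000000) yields 0xbf800000 == -1.0, producing +/-1.0 without a select.
 */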
1261 LLVMValueRef
1262 lp_build_sgn(struct lp_build_context *bld,
1263 LLVMValueRef a)
1264 {
1265 LLVMBuilderRef builder = bld->gallivm->builder;
1266 const struct lp_type type = bld->type;
1267 LLVMValueRef cond;
1268 LLVMValueRef res;
1269
1270 assert(lp_check_value(type, a));
1271
1272 /* Handle non-zero case */
1273 if(!type.sign) {
1274 /* if not zero then sign must be positive */
1275 res = bld->one;
1276 }
1277 else if(type.floating) {
1278 LLVMTypeRef vec_type;
1279 LLVMTypeRef int_type;
1280 LLVMValueRef mask;
1281 LLVMValueRef sign;
1282 LLVMValueRef one;
1283 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1284
1285 int_type = lp_build_int_vec_type(bld->gallivm, type);
1286 vec_type = lp_build_vec_type(bld->gallivm, type);
1287 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1288
1289 /* Take the sign bit and OR it into the constant 1.0 */
1290 sign = LLVMBuildBitCast(builder, a, int_type, "");
1291 sign = LLVMBuildAnd(builder, sign, mask, "");
1292 one = LLVMConstBitCast(bld->one, int_type);
1293 res = LLVMBuildOr(builder, sign, one, "");
1294 res = LLVMBuildBitCast(builder, res, vec_type, "");
1295 }
1296 else
1297 {
1298 /* signed int/norm/fixed point */
1299 /* could use psign with sse3 and appropriate vectors here */
1300 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1301 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1302 res = lp_build_select(bld, cond, bld->one, minus_one);
1303 }
1304
1305 /* Handle zero */
1306 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1307 res = lp_build_select(bld, cond, bld->zero, res);
1308
1309 return res;
1310 }
1311
1312
1313 /**
1314 * Set the sign of float vector 'a' according to 'sign'.
1315 * If sign==0, return abs(a).
1316 * If sign==1, return -abs(a);
1317 * Other values for sign produce undefined results.
1318 */
1319 LLVMValueRef
1320 lp_build_set_sign(struct lp_build_context *bld,
1321 LLVMValueRef a, LLVMValueRef sign)
1322 {
1323 LLVMBuilderRef builder = bld->gallivm->builder;
1324 const struct lp_type type = bld->type;
1325 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1326 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1327 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1328 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1329 ~((unsigned long long) 1 << (type.width - 1)));
1330 LLVMValueRef val, res;
1331
1332 assert(type.floating);
1333 assert(lp_check_value(type, a));
1334
1335 /* val = reinterpret_cast<int>(a) */
1336 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1337 /* val = val & mask */
1338 val = LLVMBuildAnd(builder, val, mask, "");
1339 /* sign = sign << shift */
1340 sign = LLVMBuildShl(builder, sign, shift, "");
1341 /* res = val | sign */
1342 res = LLVMBuildOr(builder, val, sign, "");
1343 /* res = reinterpret_cast<float>(res) */
1344 res = LLVMBuildBitCast(builder, res, vec_type, "");
1345
1346 return res;
1347 }
1348
1349
1350 /**
1351 * Convert vector of (or scalar) int to vector of (or scalar) float.
1352 */
1353 LLVMValueRef
1354 lp_build_int_to_float(struct lp_build_context *bld,
1355 LLVMValueRef a)
1356 {
1357 LLVMBuilderRef builder = bld->gallivm->builder;
1358 const struct lp_type type = bld->type;
1359 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1360
1361 assert(type.floating);
1362
1363 return LLVMBuildSIToFP(builder, a, vec_type, "");
1364 }
1365
1366 static boolean
1367 arch_rounding_available(const struct lp_type type)
1368 {
1369 if ((util_cpu_caps.has_sse4_1 &&
1370 (type.length == 1 || type.width*type.length == 128)) ||
1371 (util_cpu_caps.has_avx && type.width*type.length == 256))
1372 return TRUE;
1373 else if ((util_cpu_caps.has_altivec &&
1374 (type.width == 32 && type.length == 4)))
1375 return TRUE;
1376
1377 return FALSE;
1378 }
1379
1380 enum lp_build_round_mode
1381 {
1382 LP_BUILD_ROUND_NEAREST = 0,
1383 LP_BUILD_ROUND_FLOOR = 1,
1384 LP_BUILD_ROUND_CEIL = 2,
1385 LP_BUILD_ROUND_TRUNCATE = 3
1386 };
1387
1388 /**
1389 * Helper for SSE4.1's ROUNDxx instructions.
1390 *
1391 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1392 * result is the even value. That is, rounding 2.5 yields 2.0, not 3.0.
1393 */
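/*
 * The lp_build_round_mode values match the ROUNDxx immediate encoding
 * (0 = nearest even, 1 = toward -inf, 2 = toward +inf, 3 = truncate),
 * which is why 'mode' can be passed straight through as the immediate below.
 */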
1394 static INLINE LLVMValueRef
1395 lp_build_round_sse41(struct lp_build_context *bld,
1396 LLVMValueRef a,
1397 enum lp_build_round_mode mode)
1398 {
1399 LLVMBuilderRef builder = bld->gallivm->builder;
1400 const struct lp_type type = bld->type;
1401 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1402 const char *intrinsic;
1403 LLVMValueRef res;
1404
1405 assert(type.floating);
1406
1407 assert(lp_check_value(type, a));
1408 assert(util_cpu_caps.has_sse4_1);
1409
1410 if (type.length == 1) {
1411 LLVMTypeRef vec_type;
1412 LLVMValueRef undef;
1413 LLVMValueRef args[3];
1414 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1415
1416 switch(type.width) {
1417 case 32:
1418 intrinsic = "llvm.x86.sse41.round.ss";
1419 break;
1420 case 64:
1421 intrinsic = "llvm.x86.sse41.round.sd";
1422 break;
1423 default:
1424 assert(0);
1425 return bld->undef;
1426 }
1427
1428 vec_type = LLVMVectorType(bld->elem_type, 4);
1429
1430 undef = LLVMGetUndef(vec_type);
1431
1432 args[0] = undef;
1433 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1434 args[2] = LLVMConstInt(i32t, mode, 0);
1435
1436 res = lp_build_intrinsic(builder, intrinsic,
1437 vec_type, args, Elements(args));
1438
1439 res = LLVMBuildExtractElement(builder, res, index0, "");
1440 }
1441 else {
1442 if (type.width * type.length == 128) {
1443 switch(type.width) {
1444 case 32:
1445 intrinsic = "llvm.x86.sse41.round.ps";
1446 break;
1447 case 64:
1448 intrinsic = "llvm.x86.sse41.round.pd";
1449 break;
1450 default:
1451 assert(0);
1452 return bld->undef;
1453 }
1454 }
1455 else {
1456 assert(type.width * type.length == 256);
1457 assert(util_cpu_caps.has_avx);
1458
1459 switch(type.width) {
1460 case 32:
1461 intrinsic = "llvm.x86.avx.round.ps.256";
1462 break;
1463 case 64:
1464 intrinsic = "llvm.x86.avx.round.pd.256";
1465 break;
1466 default:
1467 assert(0);
1468 return bld->undef;
1469 }
1470 }
1471
1472 res = lp_build_intrinsic_binary(builder, intrinsic,
1473 bld->vec_type, a,
1474 LLVMConstInt(i32t, mode, 0));
1475 }
1476
1477 return res;
1478 }
1479
1480
1481 static INLINE LLVMValueRef
1482 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1483 LLVMValueRef a)
1484 {
1485 LLVMBuilderRef builder = bld->gallivm->builder;
1486 const struct lp_type type = bld->type;
1487 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1488 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1489 const char *intrinsic;
1490 LLVMValueRef res;
1491
1492 assert(type.floating);
1493 /* using the double precision conversions is a bit more complicated */
1494 assert(type.width == 32);
1495
1496 assert(lp_check_value(type, a));
1497 assert(util_cpu_caps.has_sse2);
1498
1499 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1500 if (type.length == 1) {
1501 LLVMTypeRef vec_type;
1502 LLVMValueRef undef;
1503 LLVMValueRef arg;
1504 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1505
1506 vec_type = LLVMVectorType(bld->elem_type, 4);
1507
1508 intrinsic = "llvm.x86.sse.cvtss2si";
1509
1510 undef = LLVMGetUndef(vec_type);
1511
1512 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1513
1514 res = lp_build_intrinsic_unary(builder, intrinsic,
1515 ret_type, arg);
1516 }
1517 else {
1518 if (type.width* type.length == 128) {
1519 intrinsic = "llvm.x86.sse2.cvtps2dq";
1520 }
1521 else {
1522 assert(type.width*type.length == 256);
1523 assert(util_cpu_caps.has_avx);
1524
1525 intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1526 }
1527 res = lp_build_intrinsic_unary(builder, intrinsic,
1528 ret_type, a);
1529 }
1530
1531 return res;
1532 }
1533
1534
1535 /* Round to integer using AltiVec's vrfi{n,m,p,z} instructions, selecting
1536 * the instruction according to the requested rounding mode. */
1537 static INLINE LLVMValueRef
1538 lp_build_round_altivec(struct lp_build_context *bld,
1539 LLVMValueRef a,
1540 enum lp_build_round_mode mode)
1541 {
1542 LLVMBuilderRef builder = bld->gallivm->builder;
1543 const struct lp_type type = bld->type;
1544 const char *intrinsic = NULL;
1545
1546 assert(type.floating);
1547
1548 assert(lp_check_value(type, a));
1549 assert(util_cpu_caps.has_altivec);
1550
1551 switch (mode) {
1552 case LP_BUILD_ROUND_NEAREST:
1553 intrinsic = "llvm.ppc.altivec.vrfin";
1554 break;
1555 case LP_BUILD_ROUND_FLOOR:
1556 intrinsic = "llvm.ppc.altivec.vrfim";
1557 break;
1558 case LP_BUILD_ROUND_CEIL:
1559 intrinsic = "llvm.ppc.altivec.vrfip";
1560 break;
1561 case LP_BUILD_ROUND_TRUNCATE:
1562 intrinsic = "llvm.ppc.altivec.vrfiz";
1563 break;
1564 }
1565
1566 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1567 }
1568
1569 static INLINE LLVMValueRef
1570 lp_build_round_arch(struct lp_build_context *bld,
1571 LLVMValueRef a,
1572 enum lp_build_round_mode mode)
1573 {
1574 if (util_cpu_caps.has_sse4_1)
1575 return lp_build_round_sse41(bld, a, mode);
1576 else /* (util_cpu_caps.has_altivec) */
1577 return lp_build_round_altivec(bld, a, mode);
1578 }
1579
1580 /**
1581 * Return the integer part of a float (vector) value (== round toward zero).
1582 * The returned value is a float (vector).
1583 * Ex: trunc(-1.5) = -1.0
1584 */
1585 LLVMValueRef
1586 lp_build_trunc(struct lp_build_context *bld,
1587 LLVMValueRef a)
1588 {
1589 LLVMBuilderRef builder = bld->gallivm->builder;
1590 const struct lp_type type = bld->type;
1591
1592 assert(type.floating);
1593 assert(lp_check_value(type, a));
1594
1595 if (arch_rounding_available(type)) {
1596 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
1597 }
1598 else {
1599 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1600 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1601 LLVMValueRef res;
1602 res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1603 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1604 return res;
1605 }
1606 }
1607
1608
1609 /**
1610 * Return float (vector) rounded to nearest integer (vector). The returned
1611 * value is a float (vector).
1612 * Ex: round(0.9) = 1.0
1613 * Ex: round(-1.5) = -2.0
1614 */
1615 LLVMValueRef
1616 lp_build_round(struct lp_build_context *bld,
1617 LLVMValueRef a)
1618 {
1619 LLVMBuilderRef builder = bld->gallivm->builder;
1620 const struct lp_type type = bld->type;
1621
1622 assert(type.floating);
1623 assert(lp_check_value(type, a));
1624
1625 if (arch_rounding_available(type)) {
1626 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1627 }
1628 else {
1629 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1630 LLVMValueRef res;
1631 res = lp_build_iround(bld, a);
1632 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1633 return res;
1634 }
1635 }
1636
1637
1638 /**
1639 * Return floor of float (vector), result is a float (vector)
1640 * Ex: floor(1.1) = 1.0
1641 * Ex: floor(-1.1) = -2.0
1642 */
1643 LLVMValueRef
1644 lp_build_floor(struct lp_build_context *bld,
1645 LLVMValueRef a)
1646 {
1647 LLVMBuilderRef builder = bld->gallivm->builder;
1648 const struct lp_type type = bld->type;
1649
1650 assert(type.floating);
1651 assert(lp_check_value(type, a));
1652
1653 if (arch_rounding_available(type)) {
1654 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1655 }
1656 else {
1657 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1658 LLVMValueRef res;
1659 res = lp_build_ifloor(bld, a);
1660 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1661 return res;
1662 }
1663 }
1664
1665
1666 /**
1667 * Return ceiling of float (vector), returning float (vector).
1668 * Ex: ceil( 1.1) = 2.0
1669 * Ex: ceil(-1.1) = -1.0
1670 */
1671 LLVMValueRef
1672 lp_build_ceil(struct lp_build_context *bld,
1673 LLVMValueRef a)
1674 {
1675 LLVMBuilderRef builder = bld->gallivm->builder;
1676 const struct lp_type type = bld->type;
1677
1678 assert(type.floating);
1679 assert(lp_check_value(type, a));
1680
1681 if (arch_rounding_available(type)) {
1682 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1683 }
1684 else {
1685 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1686 LLVMValueRef res;
1687 res = lp_build_iceil(bld, a);
1688 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1689 return res;
1690 }
1691 }
1692
1693
1694 /**
1695 * Return fractional part of 'a' computed as a - floor(a)
1696 * Typically used in texture coord arithmetic.
1697 */
1698 LLVMValueRef
1699 lp_build_fract(struct lp_build_context *bld,
1700 LLVMValueRef a)
1701 {
1702 assert(bld->type.floating);
1703 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1704 }
1705
1706
1707 /**
1708 * Prevent returning a fractional part of 1.0 for very small negative values of
1709 * 'a' by clamping against 0.99999(9).
1710 */
1711 static inline LLVMValueRef
1712 clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
1713 {
1714 LLVMValueRef max;
1715
1716 /* this is the largest number smaller than 1.0 representable as float */
1717 max = lp_build_const_vec(bld->gallivm, bld->type,
1718 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
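   /* for 32-bit floats this evaluates to 1.0 - 2^-24 = 0.99999994f, the
    * largest float strictly below 1.0 */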
1719 return lp_build_min(bld, fract, max);
1720 }
1721
1722
1723 /**
1724 * Same as lp_build_fract, but guarantees that the result is always smaller
1725 * than one.
1726 */
1727 LLVMValueRef
1728 lp_build_fract_safe(struct lp_build_context *bld,
1729 LLVMValueRef a)
1730 {
1731 return clamp_fract(bld, lp_build_fract(bld, a));
1732 }
1733
1734
1735 /**
1736 * Return the integer part of a float (vector) value (== round toward zero).
1737 * The returned value is an integer (vector).
1738 * Ex: itrunc(-1.5) = -1
1739 */
1740 LLVMValueRef
1741 lp_build_itrunc(struct lp_build_context *bld,
1742 LLVMValueRef a)
1743 {
1744 LLVMBuilderRef builder = bld->gallivm->builder;
1745 const struct lp_type type = bld->type;
1746 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1747
1748 assert(type.floating);
1749 assert(lp_check_value(type, a));
1750
1751 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1752 }
1753
1754
1755 /**
1756 * Return float (vector) rounded to nearest integer (vector). The returned
1757 * value is an integer (vector).
1758 * Ex: iround(0.9) = 1
1759 * Ex: iround(-1.5) = -2
1760 */
1761 LLVMValueRef
1762 lp_build_iround(struct lp_build_context *bld,
1763 LLVMValueRef a)
1764 {
1765 LLVMBuilderRef builder = bld->gallivm->builder;
1766 const struct lp_type type = bld->type;
1767 LLVMTypeRef int_vec_type = bld->int_vec_type;
1768 LLVMValueRef res;
1769
1770 assert(type.floating);
1771
1772 assert(lp_check_value(type, a));
1773
1774 if ((util_cpu_caps.has_sse2 &&
1775 ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
1776 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
1777 return lp_build_iround_nearest_sse2(bld, a);
1778 }
1779 if (arch_rounding_available(type)) {
1780 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
1781 }
1782 else {
1783 LLVMValueRef half;
1784
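      /*
       * Fallback: add +/-0.5 with the sign taken from 'a', then truncate
       * toward zero.  Note this rounds halfway cases away from zero
       * (e.g. iround(2.5) = 3), unlike the SSE paths above, which round
       * them to even.
       */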
1785 half = lp_build_const_vec(bld->gallivm, type, 0.5);
1786
1787 if (type.sign) {
1788 LLVMTypeRef vec_type = bld->vec_type;
1789 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1790 (unsigned long long)1 << (type.width - 1));
1791 LLVMValueRef sign;
1792
1793 /* get sign bit */
1794 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1795 sign = LLVMBuildAnd(builder, sign, mask, "");
1796
1797 /* sign * 0.5 */
1798 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1799 half = LLVMBuildOr(builder, sign, half, "");
1800 half = LLVMBuildBitCast(builder, half, vec_type, "");
1801 }
1802
1803 res = LLVMBuildFAdd(builder, a, half, "");
1804 }
1805
1806 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1807
1808 return res;
1809 }
1810
1811
1812 /**
1813 * Return floor of float (vector), result is an int (vector)
1814 * Ex: ifloor(1.1) = 1
1815 * Ex: ifloor(-1.1) = -2
1816 */
1817 LLVMValueRef
1818 lp_build_ifloor(struct lp_build_context *bld,
1819 LLVMValueRef a)
1820 {
1821 LLVMBuilderRef builder = bld->gallivm->builder;
1822 const struct lp_type type = bld->type;
1823 LLVMTypeRef int_vec_type = bld->int_vec_type;
1824 LLVMValueRef res;
1825
1826 assert(type.floating);
1827 assert(lp_check_value(type, a));
1828
1829 res = a;
1830 if (type.sign) {
1831 if (arch_rounding_available(type)) {
1832 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
1833 }
1834 else {
1835 /* Add an offset of almost -1.0, but only for negative values */
1836 LLVMTypeRef vec_type = bld->vec_type;
1837 unsigned mantissa = lp_mantissa(type);
1838 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1839 (unsigned long long)1 << (type.width - 1));
1840 LLVMValueRef sign;
1841 LLVMValueRef offset;
1842
1843 /* sign = a < 0 ? ~0 : 0 */
1844 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1845 sign = LLVMBuildAnd(builder, sign, mask, "");
1846 sign = LLVMBuildAShr(builder, sign,
1847 lp_build_const_int_vec(bld->gallivm, type,
1848 type.width - 1),
1849 "ifloor.sign");
1850
1851 /* offset = -0.99999(9)f */
1852 offset = lp_build_const_vec(bld->gallivm, type,
1853 -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
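         /*
          * e.g. for 32-bit floats the offset is about -0.9999988f, so
          * -1.1 + offset = -2.0999... truncates to -2 (== floor(-1.1)),
          * while -1.0 + offset = -1.9999... still truncates to -1.
          */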
1854 offset = LLVMConstBitCast(offset, int_vec_type);
1855
1856 /* offset = a < 0 ? offset : 0.0f */
1857 offset = LLVMBuildAnd(builder, offset, sign, "");
1858 offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1859
1860 res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1861 }
1862 }
1863
1864 /* truncate (round toward zero) */
1865 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1866
1867 return res;
1868 }
1869
1870
1871 /**
1872 * Return ceiling of float (vector), returning int (vector).
1873 * Ex: iceil( 1.1) = 2
1874 * Ex: iceil(-1.1) = -1
1875 */
1876 LLVMValueRef
1877 lp_build_iceil(struct lp_build_context *bld,
1878 LLVMValueRef a)
1879 {
1880 LLVMBuilderRef builder = bld->gallivm->builder;
1881 const struct lp_type type = bld->type;
1882 LLVMTypeRef int_vec_type = bld->int_vec_type;
1883 LLVMValueRef res;
1884
1885 assert(type.floating);
1886 assert(lp_check_value(type, a));
1887
1888 if (arch_rounding_available(type)) {
1889 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
1890 }
1891 else {
1892 LLVMTypeRef vec_type = bld->vec_type;
1893 unsigned mantissa = lp_mantissa(type);
1894 LLVMValueRef offset;
1895
1896 /* offset = 0.99999(9)f */
1897 offset = lp_build_const_vec(bld->gallivm, type,
1898 (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1899
1900 if (type.sign) {
1901 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1902 (unsigned long long)1 << (type.width - 1));
1903 LLVMValueRef sign;
1904
1905 /* sign = a < 0 ? 0 : ~0 */
1906 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1907 sign = LLVMBuildAnd(builder, sign, mask, "");
1908 sign = LLVMBuildAShr(builder, sign,
1909 lp_build_const_int_vec(bld->gallivm, type,
1910 type.width - 1),
1911 "iceil.sign");
1912 sign = LLVMBuildNot(builder, sign, "iceil.not");
1913
1914 /* offset = a < 0 ? 0.0 : offset */
1915 offset = LLVMConstBitCast(offset, int_vec_type);
1916 offset = LLVMBuildAnd(builder, offset, sign, "");
1917 offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1918 }
1919
1920 res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1921 }
1922
1923 /* truncate (round toward zero) */
1924 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1925
1926 return res;
1927 }
1928
1929
1930 /**
1931 * Combined ifloor() & fract().
1932 *
1933 * Preferred to calling the functions separately, as it will ensure that the
1934 * strategy (floor() vs ifloor()) that results in less redundant work is used.
1935 */
1936 void
1937 lp_build_ifloor_fract(struct lp_build_context *bld,
1938 LLVMValueRef a,
1939 LLVMValueRef *out_ipart,
1940 LLVMValueRef *out_fpart)
1941 {
1942 LLVMBuilderRef builder = bld->gallivm->builder;
1943 const struct lp_type type = bld->type;
1944 LLVMValueRef ipart;
1945
1946 assert(type.floating);
1947 assert(lp_check_value(type, a));
1948
1949 if (arch_rounding_available(type)) {
1950 /*
1951 * floor() is easier.
1952 */
1953
1954 ipart = lp_build_floor(bld, a);
1955 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1956 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1957 }
1958 else {
1959 /*
1960 * ifloor() is easier.
1961 */
1962
1963 *out_ipart = lp_build_ifloor(bld, a);
1964 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1965 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1966 }
1967 }
1968
1969
1970 /**
1971 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
1972 * always smaller than one.
1973 */
1974 void
1975 lp_build_ifloor_fract_safe(struct lp_build_context *bld,
1976 LLVMValueRef a,
1977 LLVMValueRef *out_ipart,
1978 LLVMValueRef *out_fpart)
1979 {
1980 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
1981 *out_fpart = clamp_fract(bld, *out_fpart);
1982 }
1983
1984
1985 LLVMValueRef
1986 lp_build_sqrt(struct lp_build_context *bld,
1987 LLVMValueRef a)
1988 {
1989 LLVMBuilderRef builder = bld->gallivm->builder;
1990 const struct lp_type type = bld->type;
1991 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1992 char intrinsic[32];
1993
1994 assert(lp_check_value(type, a));
1995
1996 /* TODO: optimize the constant case */
1997
1998 assert(type.floating);
1999 if (type.length == 1) {
2000 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
2001 }
2002 else {
2003 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
2004 }
2005
2006 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2007 }
2008
2009
2010 /**
2011 * Do one Newton-Raphson step to improve reciprocate precision:
2012  * Do one Newton-Raphson step to improve reciprocal precision:
2013 * x_{i+1} = x_i * (2 - a * x_i)
2014 *
2015 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2016  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2017  * +/-Inf, giving NaN instead.  Certain applications, such as Google Earth,
2018  * rely on the IEEE-754 result here: it does RCP(RSQRT(0.0)) when drawing the
2019  * Earth's halo.  It would be necessary to clamp the argument to prevent this.
2020 * See also:
2021 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2022 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2023 */
2024 static INLINE LLVMValueRef
2025 lp_build_rcp_refine(struct lp_build_context *bld,
2026 LLVMValueRef a,
2027 LLVMValueRef rcp_a)
2028 {
2029 LLVMBuilderRef builder = bld->gallivm->builder;
2030 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2031 LLVMValueRef res;
2032
2033 res = LLVMBuildFMul(builder, a, rcp_a, "");
2034 res = LLVMBuildFSub(builder, two, res, "");
2035 res = LLVMBuildFMul(builder, rcp_a, res, "");
2036
2037 return res;
2038 }
2039
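/*
 * Worked example of one refinement step: for a = 3.0 and a rough estimate
 * x0 = 0.3, x1 = 0.3 * (2 - 3 * 0.3) = 0.33 and x2 = 0.33 * (2 - 3 * 0.33)
 * = 0.3333, converging quadratically towards 1/3.  In general, if
 * x_i = (1 - e) / a then x_{i+1} = (1 - e^2) / a, i.e. each step squares
 * the relative error.
 */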
2040
2041 LLVMValueRef
2042 lp_build_rcp(struct lp_build_context *bld,
2043 LLVMValueRef a)
2044 {
2045 LLVMBuilderRef builder = bld->gallivm->builder;
2046 const struct lp_type type = bld->type;
2047
2048 assert(lp_check_value(type, a));
2049
2050 if(a == bld->zero)
2051 return bld->undef;
2052 if(a == bld->one)
2053 return bld->one;
2054 if(a == bld->undef)
2055 return bld->undef;
2056
2057 assert(type.floating);
2058
2059 if(LLVMIsConstant(a))
2060 return LLVMConstFDiv(bld->one, a);
2061
2062 /*
2063 * We don't use RCPPS because:
2064    * - it only has 10 bits of precision
2065    * - it doesn't even get the reciprocal of 1.0 exactly
2066    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
2067    * - for recent processors the benefit over DIVPS is marginal and case
2068    *   dependent
2069    *
2070    * We could still use it on certain processors if benchmarks show that the
2071    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
2072    * particular uses that require fewer workarounds.
2073 */
2074
2075 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2076 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2077 const unsigned num_iterations = 0;
2078 LLVMValueRef res;
2079 unsigned i;
2080 const char *intrinsic = NULL;
2081
2082 if (type.length == 4) {
2083 intrinsic = "llvm.x86.sse.rcp.ps";
2084 }
2085 else {
2086 intrinsic = "llvm.x86.avx.rcp.ps.256";
2087 }
2088
2089 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2090
2091 for (i = 0; i < num_iterations; ++i) {
2092 res = lp_build_rcp_refine(bld, a, res);
2093 }
2094
2095 return res;
2096 }
2097
2098 return LLVMBuildFDiv(builder, bld->one, a, "");
2099 }
2100
2101
2102 /**
2103 * Do one Newton-Raphson step to improve rsqrt precision:
2104 *
2105 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2106 *
2107 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2108 */
2109 static INLINE LLVMValueRef
2110 lp_build_rsqrt_refine(struct lp_build_context *bld,
2111 LLVMValueRef a,
2112 LLVMValueRef rsqrt_a)
2113 {
2114 LLVMBuilderRef builder = bld->gallivm->builder;
2115 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2116 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2117 LLVMValueRef res;
2118
2119 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2120 res = LLVMBuildFMul(builder, a, res, "");
2121 res = LLVMBuildFSub(builder, three, res, "");
2122 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2123 res = LLVMBuildFMul(builder, half, res, "");
2124
2125 return res;
2126 }
2127
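/*
 * Worked example of one refinement step: for a = 4.0 and a rough estimate
 * x0 = 0.6, x1 = 0.5 * 0.6 * (3 - 4 * 0.36) = 0.468 and
 * x2 = 0.5 * 0.468 * (3 - 4 * 0.219) ~= 0.497, converging quadratically
 * towards 1/sqrt(4.0) = 0.5.
 */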
2128
2129 /**
2130 * Generate 1/sqrt(a).
2131 * Result is undefined for values < 0, infinity for +0.
2132 */
2133 LLVMValueRef
2134 lp_build_rsqrt(struct lp_build_context *bld,
2135 LLVMValueRef a)
2136 {
2137 LLVMBuilderRef builder = bld->gallivm->builder;
2138 const struct lp_type type = bld->type;
2139
2140 assert(lp_check_value(type, a));
2141
2142 assert(type.floating);
2143
2144 /*
2145 * This should be faster but all denormals will end up as infinity.
2146 */
2147 if (0 && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2148 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))) {
2149 const unsigned num_iterations = 1;
2150 LLVMValueRef res;
2151 unsigned i;
2152 const char *intrinsic = NULL;
2153
2154 if (type.length == 4) {
2155 intrinsic = "llvm.x86.sse.rsqrt.ps";
2156 }
2157 else {
2158 intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2159 }
2160 if (num_iterations) {
2161 /*
2162 * Newton-Raphson will result in NaN instead of infinity for zero,
2163 * and NaN instead of zero for infinity.
2164 * Also, need to ensure rsqrt(1.0) == 1.0.
2165 * All numbers smaller than FLT_MIN will result in +infinity
2166 * (rsqrtps treats all denormals as zero).
2167 */
2168 /*
2169           * Certain non-C99 compilers don't know INFINITY and might not support
2170           * hacks to evaluate it at compile time either.
2171 */
2172 const unsigned posinf_int = 0x7F800000;
2173 LLVMValueRef cmp;
2174 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2175 LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
2176
2177 inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
2178
2179 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2180
2181 for (i = 0; i < num_iterations; ++i) {
2182 res = lp_build_rsqrt_refine(bld, a, res);
2183 }
2184 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2185 res = lp_build_select(bld, cmp, inf, res);
2186 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2187 res = lp_build_select(bld, cmp, bld->zero, res);
2188 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2189 res = lp_build_select(bld, cmp, bld->one, res);
2190 }
2191 else {
2192 /* rsqrt(1.0) != 1.0 here */
2193 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2194
2195 }
2196
2197 return res;
2198 }
2199
2200 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2201 }
2202
2203
2204 /**
2205 * Generate sin(a) using SSE2
2206 */
2207 LLVMValueRef
2208 lp_build_sin(struct lp_build_context *bld,
2209 LLVMValueRef a)
2210 {
2211 struct gallivm_state *gallivm = bld->gallivm;
2212 LLVMBuilderRef builder = gallivm->builder;
2213 struct lp_type int_type = lp_int_type(bld->type);
2214 LLVMBuilderRef b = builder;
2215
2216 /*
2217 * take the absolute value,
2218 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2219 */
2220
2221 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2222 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2223
2224 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2225 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2226
2227 /*
2228 * extract the sign bit (upper one)
2229 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
2230 */
2231 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2232 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
2233
2234 /*
2235 * scale by 4/Pi
2236 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2237 */
2238
2239 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2240 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2241
2242 /*
2243 * store the integer part of y in mm0
2244 * emm2 = _mm_cvttps_epi32(y);
2245 */
2246
2247 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2248
2249 /*
2250 * j=(j+1) & (~1) (see the cephes sources)
2251 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2252 */
2253
2254 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2255 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2256 /*
2257 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2258 */
2259 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2260 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2261
2262 /*
2263 * y = _mm_cvtepi32_ps(emm2);
2264 */
2265 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2266
2267 /* get the swap sign flag
2268 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
2269 */
2270 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2271 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
2272
2273 /*
2274 * emm2 = _mm_slli_epi32(emm0, 29);
2275 */
2276 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2277 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
2278
2279 /*
2280    * get the polynomial selection mask
2281    * there is one polynomial for 0 <= x <= Pi/4
2282    * and another one for Pi/4 < x <= Pi/2
2283 * Both branches will be computed.
2284 *
2285 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2286 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2287 */
2288
2289 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2290 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
2291 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2292 int_type, PIPE_FUNC_EQUAL,
2293 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2294 /*
2295 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
2296 */
2297 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
2298
2299 /*
2300 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2301 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2302 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2303 */
2304 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2305 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2306 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2307
2308 /*
2309 * The magic pass: "Extended precision modular arithmetic"
2310 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2311 * xmm1 = _mm_mul_ps(y, xmm1);
2312 * xmm2 = _mm_mul_ps(y, xmm2);
2313 * xmm3 = _mm_mul_ps(y, xmm3);
2314 */
2315 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2316 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2317 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2318
2319 /*
2320 * x = _mm_add_ps(x, xmm1);
2321 * x = _mm_add_ps(x, xmm2);
2322 * x = _mm_add_ps(x, xmm3);
2323 */
2324
2325 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2326 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2327 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2328
2329 /*
2330    * Evaluate the first polynomial (0 <= x <= Pi/4)
2331 *
2332 * z = _mm_mul_ps(x,x);
2333 */
2334 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2335
2336 /*
2337 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2338 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2339 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2340 */
2341 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2342 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2343 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2344
2345 /*
2346 * y = *(v4sf*)_ps_coscof_p0;
2347 * y = _mm_mul_ps(y, z);
2348 */
2349 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2350 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2351 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2352 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2353 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2354 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2355
2356
2357 /*
2358 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2359 * y = _mm_sub_ps(y, tmp);
2360 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2361 */
2362 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2363 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2364 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2365 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2366 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2367
2368 /*
2369 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2370 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2371 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2372 */
2373 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2374 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2375 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2376
2377 /*
2378    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2379 *
2380 * y2 = *(v4sf*)_ps_sincof_p0;
2381 * y2 = _mm_mul_ps(y2, z);
2382 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2383 * y2 = _mm_mul_ps(y2, z);
2384 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2385 * y2 = _mm_mul_ps(y2, z);
2386 * y2 = _mm_mul_ps(y2, x);
2387 * y2 = _mm_add_ps(y2, x);
2388 */
2389
2390 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2391 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2392 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2393 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2394 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2395 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2396 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2397
2398 /*
2399    * select the correct result from the two polynomials
2400 * xmm3 = poly_mask;
2401 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2402 * y = _mm_andnot_ps(xmm3, y);
2403 * y = _mm_add_ps(y,y2);
2404 */
2405 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2406 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2407 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2408 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2409 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2410 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2411 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2412
2413 /*
2414 * update the sign
2415 * y = _mm_xor_ps(y, sign_bit);
2416 */
2417 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
2418 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2419 return y_result;
2420 }
2421
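/*
 * Worked example of the range reduction in lp_build_sin() above, for
 * a = 3.0:
 *
 *    y = 3.0 * 4/Pi ~= 3.82, truncated to 3, then (3 + 1) & ~1 = 4
 *    swap sign flag: (4 & 4) << 29 sets the sign bit, so the result is negated
 *    poly mask:      (4 & 2) == 0, so the sine polynomial (y2) is selected
 *    reduced x:      3.0 - 4 * Pi/4 ~= -0.1416
 *
 * The sine polynomial of -0.1416 is about -0.1411; flipping the sign gives
 * 0.1411, which matches sin(3.0).
 */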
2422
2423 /**
2424 * Generate cos(a) using SSE2
2425 */
2426 LLVMValueRef
2427 lp_build_cos(struct lp_build_context *bld,
2428 LLVMValueRef a)
2429 {
2430 struct gallivm_state *gallivm = bld->gallivm;
2431 LLVMBuilderRef builder = gallivm->builder;
2432 struct lp_type int_type = lp_int_type(bld->type);
2433 LLVMBuilderRef b = builder;
2434
2435 /*
2436 * take the absolute value,
2437 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2438 */
2439
2440 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2441 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2442
2443 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2444 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2445
2446 /*
2447 * scale by 4/Pi
2448 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2449 */
2450
2451 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2452 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2453
2454 /*
2455 * store the integer part of y in mm0
2456 * emm2 = _mm_cvttps_epi32(y);
2457 */
2458
2459 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2460
2461 /*
2462 * j=(j+1) & (~1) (see the cephes sources)
2463 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2464 */
2465
2466 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2467 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2468 /*
2469 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2470 */
2471 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2472 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2473
2474 /*
2475 * y = _mm_cvtepi32_ps(emm2);
2476 */
2477 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2478
2479
2480 /*
2481 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
2482 */
2483 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2484 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
2485
2486
2487 /* get the swap sign flag
2488 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
2489 */
2490 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
2491 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
2492 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2493 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
2494
2495 /*
2496 * emm2 = _mm_slli_epi32(emm0, 29);
2497 */
2498 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2499 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
2500
2501 /*
2502    * get the polynomial selection mask
2503    * there is one polynomial for 0 <= x <= Pi/4
2504    * and another one for Pi/4 < x <= Pi/2
2505 * Both branches will be computed.
2506 *
2507 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2508 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2509 */
2510
2511 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2512 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
2513 LLVMValueRef poly_mask = lp_build_compare(gallivm,
2514 int_type, PIPE_FUNC_EQUAL,
2515 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2516
2517 /*
2518 * _PS_CONST(minus_cephes_DP1, -0.78515625);
2519 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2520 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2521 */
2522 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2523 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2524 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2525
2526 /*
2527 * The magic pass: "Extended precision modular arithmetic"
2528 * x = ((x - y * DP1) - y * DP2) - y * DP3;
2529 * xmm1 = _mm_mul_ps(y, xmm1);
2530 * xmm2 = _mm_mul_ps(y, xmm2);
2531 * xmm3 = _mm_mul_ps(y, xmm3);
2532 */
2533 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
2534 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
2535 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
2536
2537 /*
2538 * x = _mm_add_ps(x, xmm1);
2539 * x = _mm_add_ps(x, xmm2);
2540 * x = _mm_add_ps(x, xmm3);
2541 */
2542
2543 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2544 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2545 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2546
2547 /*
2548    * Evaluate the first polynomial (0 <= x <= Pi/4)
2549 *
2550 * z = _mm_mul_ps(x,x);
2551 */
2552 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2553
2554 /*
2555 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2556 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2557 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2558 */
2559 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2560 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2561 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2562
2563 /*
2564 * y = *(v4sf*)_ps_coscof_p0;
2565 * y = _mm_mul_ps(y, z);
2566 */
2567 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2568 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2569 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2570 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2571 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2572 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2573
2574
2575 /*
2576 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2577 * y = _mm_sub_ps(y, tmp);
2578 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2579 */
2580 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2581 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2582 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2583 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2584 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2585
2586 /*
2587 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2588 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2589 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2590 */
2591 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2592 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2593 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2594
2595 /*
2596    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
2597 *
2598 * y2 = *(v4sf*)_ps_sincof_p0;
2599 * y2 = _mm_mul_ps(y2, z);
2600 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2601 * y2 = _mm_mul_ps(y2, z);
2602 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2603 * y2 = _mm_mul_ps(y2, z);
2604 * y2 = _mm_mul_ps(y2, x);
2605 * y2 = _mm_add_ps(y2, x);
2606 */
2607
2608 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2609 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2610 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2611 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2612 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2613 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2614 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2615
2616 /*
2617    * select the correct result from the two polynomials
2618 * xmm3 = poly_mask;
2619 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2620 * y = _mm_andnot_ps(xmm3, y);
2621 * y = _mm_add_ps(y,y2);
2622 */
2623 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2624 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2625 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2626 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2627 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2628 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2629
2630 /*
2631 * update the sign
2632 * y = _mm_xor_ps(y, sign_bit);
2633 */
2634 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
2635 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2636 return y_result;
2637 }
2638
2639
2640 /**
2641 * Generate pow(x, y)
2642 */
2643 LLVMValueRef
2644 lp_build_pow(struct lp_build_context *bld,
2645 LLVMValueRef x,
2646 LLVMValueRef y)
2647 {
2648 /* TODO: optimize the constant case */
2649 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2650 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2651 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2652 __FUNCTION__);
2653 }
2654
2655 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2656 }
2657
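/*
 * For example, pow(2.0, 10.0) becomes exp2(10.0 * log2(2.0)) = exp2(10.0)
 * = 1024.0, up to the accuracy of the exp2()/log2() approximations below.
 */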
2658
2659 /**
2660 * Generate exp(x)
2661 */
2662 LLVMValueRef
2663 lp_build_exp(struct lp_build_context *bld,
2664 LLVMValueRef x)
2665 {
2666 /* log2(e) = 1/log(2) */
2667 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2668 1.4426950408889634);
2669
2670 assert(lp_check_value(bld->type, x));
2671
2672 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2673 }
2674
2675
2676 /**
2677 * Generate log(x)
2678 */
2679 LLVMValueRef
2680 lp_build_log(struct lp_build_context *bld,
2681 LLVMValueRef x)
2682 {
2683 /* log(2) */
2684 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2685 0.69314718055994529);
2686
2687 assert(lp_check_value(bld->type, x));
2688
2689 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2690 }
2691
2692
2693 /**
2694 * Generate polynomial.
2695 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2696 */
2697 static LLVMValueRef
2698 lp_build_polynomial(struct lp_build_context *bld,
2699 LLVMValueRef x,
2700 const double *coeffs,
2701 unsigned num_coeffs)
2702 {
2703 const struct lp_type type = bld->type;
2704 LLVMValueRef even = NULL, odd = NULL;
2705 LLVMValueRef x2;
2706 unsigned i;
2707
2708 assert(lp_check_value(bld->type, x));
2709
2710 /* TODO: optimize the constant case */
2711 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2712 LLVMIsConstant(x)) {
2713 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2714 __FUNCTION__);
2715 }
2716
2717 /*
2718    * Calculate odd and even terms separately to decrease data dependency
2719 * Ex:
2720 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2721 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2722 */
2723 x2 = lp_build_mul(bld, x, x);
2724
2725 for (i = num_coeffs; i--; ) {
2726 LLVMValueRef coeff;
2727
2728 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2729
2730 if (i % 2 == 0) {
2731 if (even)
2732 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2733 else
2734 even = coeff;
2735 } else {
2736 if (odd)
2737 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2738 else
2739 odd = coeff;
2740 }
2741 }
2742
2743 if (odd)
2744 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2745 else if (even)
2746 return even;
2747 else
2748 return bld->undef;
2749 }
2750
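/*
 * Scalar sketch of the even/odd split used above (hypothetical helper, for
 * illustration only):
 *
 *    double poly(const double *c, unsigned n, double x)
 *    {
 *       double x2 = x * x, even = 0.0, odd = 0.0;
 *       unsigned i;
 *       for (i = n; i--; ) {
 *          if (i % 2 == 0)
 *             even = c[i] + x2 * even;
 *          else
 *             odd = c[i] + x2 * odd;
 *       }
 *       return even + x * odd;
 *    }
 *
 * The two Horner chains depend only on x2, so they can be evaluated in
 * parallel, roughly halving the dependency chain compared to a single
 * Horner evaluation.
 */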
2751
2752 /**
2753 * Minimax polynomial fit of 2**x, in range [0, 1[
2754 */
2755 const double lp_build_exp2_polynomial[] = {
2756 #if EXP_POLY_DEGREE == 5
2757 0.999999925063526176901,
2758 0.693153073200168932794,
2759 0.240153617044375388211,
2760 0.0558263180532956664775,
2761 0.00898934009049466391101,
2762 0.00187757667519147912699
2763 #elif EXP_POLY_DEGREE == 4
2764 1.00000259337069434683,
2765 0.693003834469974940458,
2766 0.24144275689150793076,
2767 0.0520114606103070150235,
2768 0.0135341679161270268764
2769 #elif EXP_POLY_DEGREE == 3
2770 0.999925218562710312959,
2771 0.695833540494823811697,
2772 0.226067155427249155588,
2773 0.0780245226406372992967
2774 #elif EXP_POLY_DEGREE == 2
2775 1.00172476321474503578,
2776 0.657636275736077639316,
2777 0.33718943461968720704
2778 #else
2779 #error
2780 #endif
2781 };
2782
2783
2784 void
2785 lp_build_exp2_approx(struct lp_build_context *bld,
2786 LLVMValueRef x,
2787 LLVMValueRef *p_exp2_int_part,
2788 LLVMValueRef *p_frac_part,
2789 LLVMValueRef *p_exp2)
2790 {
2791 LLVMBuilderRef builder = bld->gallivm->builder;
2792 const struct lp_type type = bld->type;
2793 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2794 LLVMValueRef ipart = NULL;
2795 LLVMValueRef fpart = NULL;
2796 LLVMValueRef expipart = NULL;
2797 LLVMValueRef expfpart = NULL;
2798 LLVMValueRef res = NULL;
2799
2800 assert(lp_check_value(bld->type, x));
2801
2802 if(p_exp2_int_part || p_frac_part || p_exp2) {
2803 /* TODO: optimize the constant case */
2804 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2805 LLVMIsConstant(x)) {
2806 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2807 __FUNCTION__);
2808 }
2809
2810 assert(type.floating && type.width == 32);
2811
2812 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
2813 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2814
2815 /* ipart = floor(x) */
2816 /* fpart = x - ipart */
2817 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2818 }
2819
2820 if(p_exp2_int_part || p_exp2) {
2821 /* expipart = (float) (1 << ipart) */
2822 expipart = LLVMBuildAdd(builder, ipart,
2823 lp_build_const_int_vec(bld->gallivm, type, 127), "");
2824 expipart = LLVMBuildShl(builder, expipart,
2825 lp_build_const_int_vec(bld->gallivm, type, 23), "");
2826 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2827 }
2828
2829 if(p_exp2) {
2830 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2831 Elements(lp_build_exp2_polynomial));
2832
2833 res = LLVMBuildFMul(builder, expipart, expfpart, "");
2834 }
2835
2836 if(p_exp2_int_part)
2837 *p_exp2_int_part = expipart;
2838
2839 if(p_frac_part)
2840 *p_frac_part = fpart;
2841
2842 if(p_exp2)
2843 *p_exp2 = res;
2844 }
2845
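/*
 * Scalar sketch of the approximation above (hypothetical helper, assuming
 * 32-bit IEEE-754 floats): split x into ipart = floor(x) and
 * fpart = x - ipart in [0, 1), build 2^ipart by writing the biased
 * exponent directly, and approximate 2^fpart with the polynomial:
 *
 *    float exp2_ref(float x)
 *    {
 *       union { float f; int32_t i; } pow2;
 *       int ipart = (int)floorf(x);
 *       float fpart = x - ipart;
 *       pow2.i = (ipart + 127) << 23;
 *       return pow2.f * poly(fpart);
 *    }
 *
 * where poly() stands for the minimax fit in lp_build_exp2_polynomial[]
 * and x is assumed to be clamped as above so the biased exponent stays in
 * range.
 */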
2846
2847 LLVMValueRef
2848 lp_build_exp2(struct lp_build_context *bld,
2849 LLVMValueRef x)
2850 {
2851 LLVMValueRef res;
2852 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2853 return res;
2854 }
2855
2856
2857 /**
2858  * Extract the exponent of an IEEE-754 floating point value.
2859 *
2860 * Optionally apply an integer bias.
2861 *
2862 * Result is an integer value with
2863 *
2864 * ifloor(log2(x)) + bias
2865 */
2866 LLVMValueRef
2867 lp_build_extract_exponent(struct lp_build_context *bld,
2868 LLVMValueRef x,
2869 int bias)
2870 {
2871 LLVMBuilderRef builder = bld->gallivm->builder;
2872 const struct lp_type type = bld->type;
2873 unsigned mantissa = lp_mantissa(type);
2874 LLVMValueRef res;
2875
2876 assert(type.floating);
2877
2878 assert(lp_check_value(bld->type, x));
2879
2880 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2881
2882 res = LLVMBuildLShr(builder, x,
2883 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2884 res = LLVMBuildAnd(builder, res,
2885 lp_build_const_int_vec(bld->gallivm, type, 255), "");
2886 res = LLVMBuildSub(builder, res,
2887 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2888
2889 return res;
2890 }
2891
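/*
 * For example, x = 8.0f is 0x41000000: shifting the mantissa bits away
 * leaves the biased exponent 130, and 130 - 127 = 3 = ifloor(log2(8.0)).
 */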
2892
2893 /**
2894  * Extract the mantissa of a floating point value.
2895  *
2896  * Result is a floating point value with
2897  *
2898  *   x / 2**floor(log2(x))
2899 */
2900 LLVMValueRef
2901 lp_build_extract_mantissa(struct lp_build_context *bld,
2902 LLVMValueRef x)
2903 {
2904 LLVMBuilderRef builder = bld->gallivm->builder;
2905 const struct lp_type type = bld->type;
2906 unsigned mantissa = lp_mantissa(type);
2907 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2908 (1ULL << mantissa) - 1);
2909 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2910 LLVMValueRef res;
2911
2912 assert(lp_check_value(bld->type, x));
2913
2914 assert(type.floating);
2915
2916 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2917
2918 /* res = x / 2**ipart */
2919 res = LLVMBuildAnd(builder, x, mantmask, "");
2920 res = LLVMBuildOr(builder, res, one, "");
2921 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2922
2923 return res;
2924 }
2925
2926
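/*
 * For example, x = 12.0f = 1.5 * 2**3: masking off the exponent bits and
 * ORing in the exponent of 1.0 yields 1.5, i.e. x / 2**floor(log2(x)).
 */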
2927
2928 /**
2929  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
2930  * These coefficients can be generated with
2931 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2932 */
2933 const double lp_build_log2_polynomial[] = {
2934 #if LOG_POLY_DEGREE == 5
2935 2.88539008148777786488L,
2936 0.961796878841293367824L,
2937 0.577058946784739859012L,
2938 0.412914355135828735411L,
2939 0.308591899232910175289L,
2940 0.352376952300281371868L,
2941 #elif LOG_POLY_DEGREE == 4
2942 2.88539009343309178325L,
2943 0.961791550404184197881L,
2944 0.577440339438736392009L,
2945 0.403343858251329912514L,
2946 0.406718052498846252698L,
2947 #elif LOG_POLY_DEGREE == 3
2948 2.88538959748872753838L,
2949 0.961932915889597772928L,
2950 0.571118517972136195241L,
2951 0.493997535084709500285L,
2952 #else
2953 #error
2954 #endif
2955 };
2956
2957 /**
2958 * See http://www.devmaster.net/forums/showthread.php?p=43580
2959 * http://en.wikipedia.org/wiki/Logarithm#Calculation
2960 * http://www.nezumi.demon.co.uk/consult/logx.htm
2961 */
2962 void
2963 lp_build_log2_approx(struct lp_build_context *bld,
2964 LLVMValueRef x,
2965 LLVMValueRef *p_exp,
2966 LLVMValueRef *p_floor_log2,
2967 LLVMValueRef *p_log2)
2968 {
2969 LLVMBuilderRef builder = bld->gallivm->builder;
2970 const struct lp_type type = bld->type;
2971 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2972 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2973
2974 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2975 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2976 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2977
2978 LLVMValueRef i = NULL;
2979 LLVMValueRef y = NULL;
2980 LLVMValueRef z = NULL;
2981 LLVMValueRef exp = NULL;
2982 LLVMValueRef mant = NULL;
2983 LLVMValueRef logexp = NULL;
2984 LLVMValueRef logmant = NULL;
2985 LLVMValueRef res = NULL;
2986
2987 assert(lp_check_value(bld->type, x));
2988
2989 if(p_exp || p_floor_log2 || p_log2) {
2990 /* TODO: optimize the constant case */
2991 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2992 LLVMIsConstant(x)) {
2993 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2994 __FUNCTION__);
2995 }
2996
2997 assert(type.floating && type.width == 32);
2998
2999 /*
3000 * We don't explicitly handle denormalized numbers. They will yield a
3001       * result in the neighbourhood of -127, which appears to be adequate.
3003 */
3004
3005 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3006
3007 /* exp = (float) exponent(x) */
3008 exp = LLVMBuildAnd(builder, i, expmask, "");
3009 }
3010
3011 if(p_floor_log2 || p_log2) {
3012 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3013 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3014 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3015 }
3016
3017 if(p_log2) {
3018 /* mant = 1 + (float) mantissa(x) */
3019 mant = LLVMBuildAnd(builder, i, mantmask, "");
3020 mant = LLVMBuildOr(builder, mant, one, "");
3021 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3022
3023 /* y = (mant - 1) / (mant + 1) */
3024 y = lp_build_div(bld,
3025 lp_build_sub(bld, mant, bld->one),
3026 lp_build_add(bld, mant, bld->one)
3027 );
3028
3029 /* z = y^2 */
3030 z = lp_build_mul(bld, y, y);
3031
3032 /* compute P(z) */
3033 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3034 Elements(lp_build_log2_polynomial));
3035
3036 /* logmant = y * P(z) */
3037 logmant = lp_build_mul(bld, y, logmant);
3038
3039 res = lp_build_add(bld, logmant, logexp);
3040 }
3041
3042 if(p_exp) {
3043 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3044 *p_exp = exp;
3045 }
3046
3047 if(p_floor_log2)
3048 *p_floor_log2 = logexp;
3049
3050 if(p_log2)
3051 *p_log2 = res;
3052 }
3053
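/*
 * The identity behind the mantissa polynomial above: for mant in [1, 2)
 * and y = (mant - 1) / (mant + 1),
 *
 *    log2(mant) = (2/ln(2)) * atanh(y)
 *               = (2/ln(2)) * (y + y^3/3 + y^5/5 + ...)
 *              ~= y * P(y^2)
 *
 * where P is the minimax fit in lp_build_log2_polynomial[] (its leading
 * coefficient is close to 2/ln(2) ~= 2.885).  The final result is then
 * log2(x) = exponent + log2(mant).
 */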
3054
3055 LLVMValueRef
3056 lp_build_log2(struct lp_build_context *bld,
3057 LLVMValueRef x)
3058 {
3059 LLVMValueRef res;
3060 lp_build_log2_approx(bld, x, NULL, NULL, &res);
3061 return res;
3062 }
3063
3064
3065 /**
3066 * Faster (and less accurate) log2.
3067 *
3068 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3069 *
3070 * Piece-wise linear approximation, with exact results when x is a
3071 * power of two.
3072 *
3073 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3074 */
3075 LLVMValueRef
3076 lp_build_fast_log2(struct lp_build_context *bld,
3077 LLVMValueRef x)
3078 {
3079 LLVMBuilderRef builder = bld->gallivm->builder;
3080 LLVMValueRef ipart;
3081 LLVMValueRef fpart;
3082
3083 assert(lp_check_value(bld->type, x));
3084
3085 assert(bld->type.floating);
3086
3087 /* ipart = floor(log2(x)) - 1 */
3088 ipart = lp_build_extract_exponent(bld, x, -1);
3089 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3090
3091    /* fpart = x / 2**floor(log2(x)) */
3092 fpart = lp_build_extract_mantissa(bld, x);
3093
3094 /* ipart + fpart */
3095 return LLVMBuildFAdd(builder, ipart, fpart, "");
3096 }
3097
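/*
 * For example, for x = 12.0: fast_log2 gives (3 - 1) + 1.5 = 3.5, versus
 * the exact log2(12.0) ~= 3.585.  For powers of two the fractional part is
 * exactly 1.0 and the result is exact.
 */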
3098
3099 /**
3100 * Fast implementation of iround(log2(x)).
3101 *
3102 * Not an approximation -- it should give accurate results all the time.
3103 */
3104 LLVMValueRef
3105 lp_build_ilog2(struct lp_build_context *bld,
3106 LLVMValueRef x)
3107 {
3108 LLVMBuilderRef builder = bld->gallivm->builder;
3109 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3110 LLVMValueRef ipart;
3111
3112 assert(bld->type.floating);
3113
3114 assert(lp_check_value(bld->type, x));
3115
3116    /* x * 2^0.5, i.e. add 0.5 to log2(x) */
3117 x = LLVMBuildFMul(builder, x, sqrt2, "");
3118
3119 /* ipart = floor(log2(x) + 0.5) */
3120 ipart = lp_build_extract_exponent(bld, x, 0);
3121
3122 return ipart;
3123 }
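
/*
 * For example, for x = 12.0: 12.0 * sqrt(2) ~= 16.97, whose exponent is 4,
 * matching round(log2(12.0)) = round(3.585) = 4.  Likewise for x = 11.0:
 * 11.0 * sqrt(2) ~= 15.56, exponent 3 = round(3.459).
 */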
3124
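/**
 * Generate x % y, using FRem for floating point types and SRem/URem for
 * signed/unsigned integers respectively.
 */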
3125 LLVMValueRef
3126 lp_build_mod(struct lp_build_context *bld,
3127 LLVMValueRef x,
3128 LLVMValueRef y)
3129 {
3130 LLVMBuilderRef builder = bld->gallivm->builder;
3131 LLVMValueRef res;
3132 const struct lp_type type = bld->type;
3133
3134 assert(lp_check_value(type, x));
3135 assert(lp_check_value(type, y));
3136
3137 if (type.floating)
3138 res = LLVMBuildFRem(builder, x, y, "");
3139 else if (type.sign)
3140 res = LLVMBuildSRem(builder, x, y, "");
3141 else
3142 res = LLVMBuildURem(builder, x, y, "");
3143 return res;
3144 }