gallivm: Always use floating-point operators for floating-point types
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 /**
63 * Generate min(a, b)
64 * No checks for special case values of a or b = 1 or 0 are done.
65 */
66 static LLVMValueRef
67 lp_build_min_simple(struct lp_build_context *bld,
68 LLVMValueRef a,
69 LLVMValueRef b)
70 {
71 const struct lp_type type = bld->type;
72 const char *intrinsic = NULL;
73 LLVMValueRef cond;
74
75 /* TODO: optimize the constant case */
76
77 if(type.width * type.length == 128) {
78 if(type.floating) {
79 if(type.width == 32 && util_cpu_caps.has_sse)
80 intrinsic = "llvm.x86.sse.min.ps";
81 if(type.width == 64 && util_cpu_caps.has_sse2)
82 intrinsic = "llvm.x86.sse2.min.pd";
83 }
84 else {
85 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
86 intrinsic = "llvm.x86.sse2.pminu.b";
87 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
88 intrinsic = "llvm.x86.sse41.pminsb";
89 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
90 intrinsic = "llvm.x86.sse41.pminuw";
91 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
92 intrinsic = "llvm.x86.sse2.pmins.w";
93 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
94 intrinsic = "llvm.x86.sse41.pminud";
95 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminsd";
97 }
98 }
99
100 if(intrinsic)
101 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
102
103 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
104 return lp_build_select(bld, cond, a, b);
105 }
106
107
108 /**
109 * Generate max(a, b)
110 * No checks for special case values of a or b = 1 or 0 are done.
111 */
112 static LLVMValueRef
113 lp_build_max_simple(struct lp_build_context *bld,
114 LLVMValueRef a,
115 LLVMValueRef b)
116 {
117 const struct lp_type type = bld->type;
118 const char *intrinsic = NULL;
119 LLVMValueRef cond;
120
121 /* TODO: optimize the constant case */
122
123 if(type.width * type.length == 128) {
124 if(type.floating) {
125 if(type.width == 32 && util_cpu_caps.has_sse)
126 intrinsic = "llvm.x86.sse.max.ps";
127 if(type.width == 64 && util_cpu_caps.has_sse2)
128 intrinsic = "llvm.x86.sse2.max.pd";
129 }
130 else {
131 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
132 intrinsic = "llvm.x86.sse2.pmaxu.b";
133 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
134 intrinsic = "llvm.x86.sse41.pmaxsb";
135 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
136 intrinsic = "llvm.x86.sse41.pmaxuw";
137 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
138 intrinsic = "llvm.x86.sse2.pmaxs.w";
139 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
140 intrinsic = "llvm.x86.sse41.pmaxud";
141 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
142 intrinsic = "llvm.x86.sse41.pmaxsd";
143 }
144 }
145
146 if(intrinsic)
147 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
148
149 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
150 return lp_build_select(bld, cond, a, b);
151 }
152
153
154 /**
155 * Generate 1 - a, or ~a depending on bld->type.
156 */
157 LLVMValueRef
158 lp_build_comp(struct lp_build_context *bld,
159 LLVMValueRef a)
160 {
161 const struct lp_type type = bld->type;
162
163 if(a == bld->one)
164 return bld->zero;
165 if(a == bld->zero)
166 return bld->one;
167
168 if(type.norm && !type.floating && !type.fixed && !type.sign) {
169 if(LLVMIsConstant(a))
170 return LLVMConstNot(a);
171 else
172 return LLVMBuildNot(bld->builder, a, "");
173 }
174
175 if(LLVMIsConstant(a))
176 if (type.floating)
177 return LLVMConstFSub(bld->one, a);
178 else
179 return LLVMConstSub(bld->one, a);
180 else
181 if (type.floating)
182 return LLVMBuildFSub(bld->builder, bld->one, a, "");
183 else
184 return LLVMBuildSub(bld->builder, bld->one, a, "");
185 }
186
187
188 /**
189 * Generate a + b
190 */
191 LLVMValueRef
192 lp_build_add(struct lp_build_context *bld,
193 LLVMValueRef a,
194 LLVMValueRef b)
195 {
196 const struct lp_type type = bld->type;
197 LLVMValueRef res;
198
199 assert(lp_check_value(type, a));
200 assert(lp_check_value(type, b));
201
202 if(a == bld->zero)
203 return b;
204 if(b == bld->zero)
205 return a;
206 if(a == bld->undef || b == bld->undef)
207 return bld->undef;
208
209 if(bld->type.norm) {
210 const char *intrinsic = NULL;
211
212 if(a == bld->one || b == bld->one)
213 return bld->one;
214
215 if(util_cpu_caps.has_sse2 &&
216 type.width * type.length == 128 &&
217 !type.floating && !type.fixed) {
218 if(type.width == 8)
219 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
220 if(type.width == 16)
221 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
222 }
223
224 if(intrinsic)
225 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
226 }
227
228 if(LLVMIsConstant(a) && LLVMIsConstant(b))
229 if (type.floating)
230 res = LLVMConstFAdd(a, b);
231 else
232 res = LLVMConstAdd(a, b);
233 else
234 if (type.floating)
235 res = LLVMBuildFAdd(bld->builder, a, b, "");
236 else
237 res = LLVMBuildAdd(bld->builder, a, b, "");
238
239 /* clamp to ceiling of 1.0 */
240 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
241 res = lp_build_min_simple(bld, res, bld->one);
242
243 /* XXX clamp to floor of -1 or 0??? */
244
245 return res;
246 }
247
248
249 /** Return the sum of the elements of a */
250 LLVMValueRef
251 lp_build_sum_vector(struct lp_build_context *bld,
252 LLVMValueRef a)
253 {
254 const struct lp_type type = bld->type;
255 LLVMValueRef index, res;
256 unsigned i;
257
258 if (a == bld->zero)
259 return bld->zero;
260 if (a == bld->undef)
261 return bld->undef;
262 assert(type.length > 1);
263
264 assert(!bld->type.norm);
265
266 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
267 res = LLVMBuildExtractElement(bld->builder, a, index, "");
268
269 for (i = 1; i < type.length; i++) {
270 index = LLVMConstInt(LLVMInt32Type(), i, 0);
271 if (type.floating)
272 res = LLVMBuildFAdd(bld->builder, res,
273 LLVMBuildExtractElement(bld->builder,
274 a, index, ""),
275 "");
276 else
277 res = LLVMBuildAdd(bld->builder, res,
278 LLVMBuildExtractElement(bld->builder,
279 a, index, ""),
280 "");
281 }
282
283 return res;
284 }
285
286
287 /**
288 * Generate a - b
289 */
290 LLVMValueRef
291 lp_build_sub(struct lp_build_context *bld,
292 LLVMValueRef a,
293 LLVMValueRef b)
294 {
295 const struct lp_type type = bld->type;
296 LLVMValueRef res;
297
298 assert(lp_check_value(type, a));
299 assert(lp_check_value(type, b));
300
301 if(b == bld->zero)
302 return a;
303 if(a == bld->undef || b == bld->undef)
304 return bld->undef;
305 if(a == b)
306 return bld->zero;
307
308 if(bld->type.norm) {
309 const char *intrinsic = NULL;
310
311 if(b == bld->one)
312 return bld->zero;
313
314 if(util_cpu_caps.has_sse2 &&
315 type.width * type.length == 128 &&
316 !type.floating && !type.fixed) {
317 if(type.width == 8)
318 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
319 if(type.width == 16)
320 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
321 }
322
323 if(intrinsic)
324 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
325 }
326
327 if(LLVMIsConstant(a) && LLVMIsConstant(b))
328 if (type.floating)
329 res = LLVMConstFSub(a, b);
330 else
331 res = LLVMConstSub(a, b);
332 else
333 if (type.floating)
334 res = LLVMBuildFSub(bld->builder, a, b, "");
335 else
336 res = LLVMBuildSub(bld->builder, a, b, "");
337
338 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
339 res = lp_build_max_simple(bld, res, bld->zero);
340
341 return res;
342 }
343
344
345 /**
346 * Normalized 8bit multiplication.
347 *
348 * - alpha plus one
349 *
350 * makes the following approximation to the division (Sree)
351 *
352 * a*b/255 ~= (a*(b + 1)) >> 8
353 *
354 * which is the fastest method that satisfies the following OpenGL criteria
355 *
356 * 0*0 = 0 and 255*255 = 255
357 *
358 * - geometric series
359 *
360 * takes the geometric series approximation to the division
361 *
362 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
363 *
364 * in this case just the first two terms to fit in 16bit arithmetic
365 *
366 * t/255 ~= (t + (t >> 8)) >> 8
367 *
368 * note that just by itself it doesn't satisfy the OpenGL criteria, as
369 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
370 * must be used
371 *
372 * - geometric series plus rounding
373 *
374 * when using a geometric series division instead of truncating the result
375 * use roundoff in the approximation (Jim Blinn)
376 *
377 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
378 *
379 * achieving the exact results
380 *
381 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
382 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
383 * @sa Michael Herf, The "double blend trick", May 2000,
384 * http://www.stereopsis.com/doubleblend.html
385 */
386 static LLVMValueRef
387 lp_build_mul_u8n(LLVMBuilderRef builder,
388 struct lp_type i16_type,
389 LLVMValueRef a, LLVMValueRef b)
390 {
391 LLVMValueRef c8;
392 LLVMValueRef ab;
393
394 c8 = lp_build_const_int_vec(i16_type, 8);
395
396 #if 0
397
398 /* a*b/255 ~= (a*(b + 1)) >> 256 */
399 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
400 ab = LLVMBuildMul(builder, a, b, "");
401
402 #else
403
404 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
405 ab = LLVMBuildMul(builder, a, b, "");
406 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
407 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
408
409 #endif
410
411 ab = LLVMBuildLShr(builder, ab, c8, "");
412
413 return ab;
414 }
415
416
417 /**
418 * Generate a * b
419 */
420 LLVMValueRef
421 lp_build_mul(struct lp_build_context *bld,
422 LLVMValueRef a,
423 LLVMValueRef b)
424 {
425 const struct lp_type type = bld->type;
426 LLVMValueRef shift;
427 LLVMValueRef res;
428
429 assert(lp_check_value(type, a));
430 assert(lp_check_value(type, b));
431
432 if(a == bld->zero)
433 return bld->zero;
434 if(a == bld->one)
435 return b;
436 if(b == bld->zero)
437 return bld->zero;
438 if(b == bld->one)
439 return a;
440 if(a == bld->undef || b == bld->undef)
441 return bld->undef;
442
443 if(!type.floating && !type.fixed && type.norm) {
444 if(type.width == 8) {
445 struct lp_type i16_type = lp_wider_type(type);
446 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
447
448 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
449 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
450
451 /* PMULLW, PSRLW, PADDW */
452 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
453 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
454
455 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
456
457 return ab;
458 }
459
460 /* FIXME */
461 assert(0);
462 }
463
464 if(type.fixed)
465 shift = lp_build_const_int_vec(type, type.width/2);
466 else
467 shift = NULL;
468
469 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
470 if (type.floating)
471 res = LLVMConstFMul(a, b);
472 else
473 res = LLVMConstMul(a, b);
474 if(shift) {
475 if(type.sign)
476 res = LLVMConstAShr(res, shift);
477 else
478 res = LLVMConstLShr(res, shift);
479 }
480 }
481 else {
482 if (type.floating)
483 res = LLVMBuildFMul(bld->builder, a, b, "");
484 else
485 res = LLVMBuildMul(bld->builder, a, b, "");
486 if(shift) {
487 if(type.sign)
488 res = LLVMBuildAShr(bld->builder, res, shift, "");
489 else
490 res = LLVMBuildLShr(bld->builder, res, shift, "");
491 }
492 }
493
494 return res;
495 }
496
497
498 /**
499 * Small vector x scale multiplication optimization.
500 */
501 LLVMValueRef
502 lp_build_mul_imm(struct lp_build_context *bld,
503 LLVMValueRef a,
504 int b)
505 {
506 LLVMValueRef factor;
507
508 if(b == 0)
509 return bld->zero;
510
511 if(b == 1)
512 return a;
513
514 if(b == -1)
515 if (bld->type.floating)
516 return LLVMBuildFNeg(bld->builder, a, "");
517 else
518 return LLVMBuildNeg(bld->builder, a, "");
519
520 if(b == 2 && bld->type.floating)
521 return lp_build_add(bld, a, a);
522
523 if(util_is_pot(b)) {
524 unsigned shift = ffs(b) - 1;
525
526 if(bld->type.floating) {
527 #if 0
528 /*
529 * Power of two multiplication by directly manipulating the mantissa.
530 *
531 * XXX: This might not be always faster, it will introduce a small error
532 * for multiplication by zero, and it will produce wrong results
533 * for Inf and NaN.
534 */
535 unsigned mantissa = lp_mantissa(bld->type);
536 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
537 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
538 a = LLVMBuildAdd(bld->builder, a, factor, "");
539 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
540 return a;
541 #endif
542 }
543 else {
544 factor = lp_build_const_vec(bld->type, shift);
545 return LLVMBuildShl(bld->builder, a, factor, "");
546 }
547 }
548
549 factor = lp_build_const_vec(bld->type, (double)b);
550 return lp_build_mul(bld, a, factor);
551 }
552
553
554 /**
555 * Generate a / b
556 */
557 LLVMValueRef
558 lp_build_div(struct lp_build_context *bld,
559 LLVMValueRef a,
560 LLVMValueRef b)
561 {
562 const struct lp_type type = bld->type;
563
564 assert(lp_check_value(type, a));
565 assert(lp_check_value(type, b));
566
567 if(a == bld->zero)
568 return bld->zero;
569 if(a == bld->one)
570 return lp_build_rcp(bld, b);
571 if(b == bld->zero)
572 return bld->undef;
573 if(b == bld->one)
574 return a;
575 if(a == bld->undef || b == bld->undef)
576 return bld->undef;
577
578 if(LLVMIsConstant(a) && LLVMIsConstant(b))
579 return LLVMConstFDiv(a, b);
580
581 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
582 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
583
584 return LLVMBuildFDiv(bld->builder, a, b, "");
585 }
586
587
588 /**
589 * Linear interpolation.
590 *
591 * This also works for integer values with a few caveats.
592 *
593 * @sa http://www.stereopsis.com/doubleblend.html
594 */
595 LLVMValueRef
596 lp_build_lerp(struct lp_build_context *bld,
597 LLVMValueRef x,
598 LLVMValueRef v0,
599 LLVMValueRef v1)
600 {
601 LLVMValueRef delta;
602 LLVMValueRef res;
603
604 delta = lp_build_sub(bld, v1, v0);
605
606 res = lp_build_mul(bld, x, delta);
607
608 res = lp_build_add(bld, v0, res);
609
610 if(bld->type.fixed)
611 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
612 * but it will be wrong for other uses. Basically we need a more
613 * powerful lp_type, capable of further distinguishing the values
614 * interpretation from the value storage. */
615 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
616
617 return res;
618 }
619
620
621 LLVMValueRef
622 lp_build_lerp_2d(struct lp_build_context *bld,
623 LLVMValueRef x,
624 LLVMValueRef y,
625 LLVMValueRef v00,
626 LLVMValueRef v01,
627 LLVMValueRef v10,
628 LLVMValueRef v11)
629 {
630 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
631 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
632 return lp_build_lerp(bld, y, v0, v1);
633 }
634
635
636 /**
637 * Generate min(a, b)
638 * Do checks for special cases.
639 */
640 LLVMValueRef
641 lp_build_min(struct lp_build_context *bld,
642 LLVMValueRef a,
643 LLVMValueRef b)
644 {
645 if(a == bld->undef || b == bld->undef)
646 return bld->undef;
647
648 if(a == b)
649 return a;
650
651 if(bld->type.norm) {
652 if(a == bld->zero || b == bld->zero)
653 return bld->zero;
654 if(a == bld->one)
655 return b;
656 if(b == bld->one)
657 return a;
658 }
659
660 return lp_build_min_simple(bld, a, b);
661 }
662
663
664 /**
665 * Generate max(a, b)
666 * Do checks for special cases.
667 */
668 LLVMValueRef
669 lp_build_max(struct lp_build_context *bld,
670 LLVMValueRef a,
671 LLVMValueRef b)
672 {
673 if(a == bld->undef || b == bld->undef)
674 return bld->undef;
675
676 if(a == b)
677 return a;
678
679 if(bld->type.norm) {
680 if(a == bld->one || b == bld->one)
681 return bld->one;
682 if(a == bld->zero)
683 return b;
684 if(b == bld->zero)
685 return a;
686 }
687
688 return lp_build_max_simple(bld, a, b);
689 }
690
691
692 /**
693 * Generate clamp(a, min, max)
694 * Do checks for special cases.
695 */
696 LLVMValueRef
697 lp_build_clamp(struct lp_build_context *bld,
698 LLVMValueRef a,
699 LLVMValueRef min,
700 LLVMValueRef max)
701 {
702 a = lp_build_min(bld, a, max);
703 a = lp_build_max(bld, a, min);
704 return a;
705 }
706
707
708 /**
709 * Generate abs(a)
710 */
711 LLVMValueRef
712 lp_build_abs(struct lp_build_context *bld,
713 LLVMValueRef a)
714 {
715 const struct lp_type type = bld->type;
716 LLVMTypeRef vec_type = lp_build_vec_type(type);
717
718 if(!type.sign)
719 return a;
720
721 if(type.floating) {
722 /* Mask out the sign bit */
723 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
724 unsigned long long absMask = ~(1ULL << (type.width - 1));
725 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
726 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
727 a = LLVMBuildAnd(bld->builder, a, mask, "");
728 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
729 return a;
730 }
731
732 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
733 switch(type.width) {
734 case 8:
735 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
736 case 16:
737 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
738 case 32:
739 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
740 }
741 }
742
743 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
744 }
745
746
747 LLVMValueRef
748 lp_build_negate(struct lp_build_context *bld,
749 LLVMValueRef a)
750 {
751 if (bld->type.floating)
752 a = LLVMBuildFNeg(bld->builder, a, "");
753 else
754 a = LLVMBuildNeg(bld->builder, a, "");
755
756 return a;
757 }
758
759
760 /** Return -1, 0 or +1 depending on the sign of a */
761 LLVMValueRef
762 lp_build_sgn(struct lp_build_context *bld,
763 LLVMValueRef a)
764 {
765 const struct lp_type type = bld->type;
766 LLVMValueRef cond;
767 LLVMValueRef res;
768
769 /* Handle non-zero case */
770 if(!type.sign) {
771 /* if not zero then sign must be positive */
772 res = bld->one;
773 }
774 else if(type.floating) {
775 LLVMTypeRef vec_type;
776 LLVMTypeRef int_type;
777 LLVMValueRef mask;
778 LLVMValueRef sign;
779 LLVMValueRef one;
780 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
781
782 int_type = lp_build_int_vec_type(type);
783 vec_type = lp_build_vec_type(type);
784 mask = lp_build_const_int_vec(type, maskBit);
785
786 /* Take the sign bit and add it to 1 constant */
787 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
788 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
789 one = LLVMConstBitCast(bld->one, int_type);
790 res = LLVMBuildOr(bld->builder, sign, one, "");
791 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
792 }
793 else
794 {
795 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
796 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
797 res = lp_build_select(bld, cond, bld->one, minus_one);
798 }
799
800 /* Handle zero */
801 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
802 res = lp_build_select(bld, cond, bld->zero, res);
803
804 return res;
805 }
806
807
808 /**
809 * Set the sign of float vector 'a' according to 'sign'.
810 * If sign==0, return abs(a).
811 * If sign==1, return -abs(a);
812 * Other values for sign produce undefined results.
813 */
814 LLVMValueRef
815 lp_build_set_sign(struct lp_build_context *bld,
816 LLVMValueRef a, LLVMValueRef sign)
817 {
818 const struct lp_type type = bld->type;
819 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
820 LLVMTypeRef vec_type = lp_build_vec_type(type);
821 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
822 LLVMValueRef mask = lp_build_const_int_vec(type,
823 ~((unsigned long long) 1 << (type.width - 1)));
824 LLVMValueRef val, res;
825
826 assert(type.floating);
827
828 /* val = reinterpret_cast<int>(a) */
829 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
830 /* val = val & mask */
831 val = LLVMBuildAnd(bld->builder, val, mask, "");
832 /* sign = sign << shift */
833 sign = LLVMBuildShl(bld->builder, sign, shift, "");
834 /* res = val | sign */
835 res = LLVMBuildOr(bld->builder, val, sign, "");
836 /* res = reinterpret_cast<float>(res) */
837 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
838
839 return res;
840 }
841
842
843 /**
844 * Convert vector of (or scalar) int to vector of (or scalar) float.
845 */
846 LLVMValueRef
847 lp_build_int_to_float(struct lp_build_context *bld,
848 LLVMValueRef a)
849 {
850 const struct lp_type type = bld->type;
851 LLVMTypeRef vec_type = lp_build_vec_type(type);
852
853 assert(type.floating);
854
855 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
856 }
857
858
859
/**
 * Rounding modes understood by lp_build_round_sse41().  The numeric value is
 * passed unchanged as the second operand of the llvm.x86.sse41.round.*
 * intrinsics, so the values must not be renumbered.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST  = 0,  /* round to nearest */
   LP_BUILD_ROUND_SSE41_FLOOR    = 1,  /* round down */
   LP_BUILD_ROUND_SSE41_CEIL     = 2,  /* round up */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3   /* round toward zero */
};
867
868
869 static INLINE LLVMValueRef
870 lp_build_round_sse41(struct lp_build_context *bld,
871 LLVMValueRef a,
872 enum lp_build_round_sse41_mode mode)
873 {
874 const struct lp_type type = bld->type;
875 LLVMTypeRef vec_type = lp_build_vec_type(type);
876 const char *intrinsic;
877
878 assert(type.floating);
879 assert(type.width*type.length == 128);
880 assert(lp_check_value(type, a));
881 assert(util_cpu_caps.has_sse4_1);
882
883 switch(type.width) {
884 case 32:
885 intrinsic = "llvm.x86.sse41.round.ps";
886 break;
887 case 64:
888 intrinsic = "llvm.x86.sse41.round.pd";
889 break;
890 default:
891 assert(0);
892 return bld->undef;
893 }
894
895 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
896 LLVMConstInt(LLVMInt32Type(), mode, 0));
897 }
898
899
900 /**
901 * Return the integer part of a float (vector) value. The returned value is
902 * a float (vector).
903 * Ex: trunc(-1.5) = 1.0
904 */
905 LLVMValueRef
906 lp_build_trunc(struct lp_build_context *bld,
907 LLVMValueRef a)
908 {
909 const struct lp_type type = bld->type;
910
911 assert(type.floating);
912 assert(lp_check_value(type, a));
913
914 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
915 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
916 else {
917 LLVMTypeRef vec_type = lp_build_vec_type(type);
918 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
919 LLVMValueRef res;
920 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
921 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
922 return res;
923 }
924 }
925
926
927 /**
928 * Return float (vector) rounded to nearest integer (vector). The returned
929 * value is a float (vector).
930 * Ex: round(0.9) = 1.0
931 * Ex: round(-1.5) = -2.0
932 */
933 LLVMValueRef
934 lp_build_round(struct lp_build_context *bld,
935 LLVMValueRef a)
936 {
937 const struct lp_type type = bld->type;
938
939 assert(type.floating);
940 assert(lp_check_value(type, a));
941
942 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
943 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
944 else {
945 LLVMTypeRef vec_type = lp_build_vec_type(type);
946 LLVMValueRef res;
947 res = lp_build_iround(bld, a);
948 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
949 return res;
950 }
951 }
952
953
954 /**
955 * Return floor of float (vector), result is a float (vector)
956 * Ex: floor(1.1) = 1.0
957 * Ex: floor(-1.1) = -2.0
958 */
959 LLVMValueRef
960 lp_build_floor(struct lp_build_context *bld,
961 LLVMValueRef a)
962 {
963 const struct lp_type type = bld->type;
964
965 assert(type.floating);
966 assert(lp_check_value(type, a));
967
968 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
969 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
970 else {
971 LLVMTypeRef vec_type = lp_build_vec_type(type);
972 LLVMValueRef res;
973 res = lp_build_ifloor(bld, a);
974 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
975 return res;
976 }
977 }
978
979
980 /**
981 * Return ceiling of float (vector), returning float (vector).
982 * Ex: ceil( 1.1) = 2.0
983 * Ex: ceil(-1.1) = -1.0
984 */
985 LLVMValueRef
986 lp_build_ceil(struct lp_build_context *bld,
987 LLVMValueRef a)
988 {
989 const struct lp_type type = bld->type;
990
991 assert(type.floating);
992 assert(lp_check_value(type, a));
993
994 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
995 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
996 else {
997 LLVMTypeRef vec_type = lp_build_vec_type(type);
998 LLVMValueRef res;
999 res = lp_build_iceil(bld, a);
1000 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1001 return res;
1002 }
1003 }
1004
1005
1006 /**
1007 * Return fractional part of 'a' computed as a - floor(a)
1008 * Typically used in texture coord arithmetic.
1009 */
1010 LLVMValueRef
1011 lp_build_fract(struct lp_build_context *bld,
1012 LLVMValueRef a)
1013 {
1014 assert(bld->type.floating);
1015 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1016 }
1017
1018
1019 /**
1020 * Return the integer part of a float (vector) value. The returned value is
1021 * an integer (vector).
1022 * Ex: itrunc(-1.5) = 1
1023 */
1024 LLVMValueRef
1025 lp_build_itrunc(struct lp_build_context *bld,
1026 LLVMValueRef a)
1027 {
1028 const struct lp_type type = bld->type;
1029 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1030
1031 assert(type.floating);
1032 assert(lp_check_value(type, a));
1033
1034 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1035 }
1036
1037
1038 /**
1039 * Return float (vector) rounded to nearest integer (vector). The returned
1040 * value is an integer (vector).
1041 * Ex: iround(0.9) = 1
1042 * Ex: iround(-1.5) = -2
1043 */
1044 LLVMValueRef
1045 lp_build_iround(struct lp_build_context *bld,
1046 LLVMValueRef a)
1047 {
1048 const struct lp_type type = bld->type;
1049 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1050 LLVMValueRef res;
1051
1052 assert(type.floating);
1053
1054 assert(lp_check_value(type, a));
1055
1056 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1057 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1058 }
1059 else {
1060 LLVMTypeRef vec_type = lp_build_vec_type(type);
1061 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1062 LLVMValueRef sign;
1063 LLVMValueRef half;
1064
1065 /* get sign bit */
1066 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1067 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1068
1069 /* sign * 0.5 */
1070 half = lp_build_const_vec(type, 0.5);
1071 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1072 half = LLVMBuildOr(bld->builder, sign, half, "");
1073 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1074
1075 res = LLVMBuildFAdd(bld->builder, a, half, "");
1076 }
1077
1078 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1079
1080 return res;
1081 }
1082
1083
1084 /**
1085 * Return floor of float (vector), result is an int (vector)
1086 * Ex: ifloor(1.1) = 1.0
1087 * Ex: ifloor(-1.1) = -2.0
1088 */
1089 LLVMValueRef
1090 lp_build_ifloor(struct lp_build_context *bld,
1091 LLVMValueRef a)
1092 {
1093 const struct lp_type type = bld->type;
1094 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1095 LLVMValueRef res;
1096
1097 assert(type.floating);
1098 assert(lp_check_value(type, a));
1099
1100 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1101 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1102 }
1103 else {
1104 /* Take the sign bit and add it to 1 constant */
1105 LLVMTypeRef vec_type = lp_build_vec_type(type);
1106 unsigned mantissa = lp_mantissa(type);
1107 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1108 LLVMValueRef sign;
1109 LLVMValueRef offset;
1110
1111 /* sign = a < 0 ? ~0 : 0 */
1112 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1113 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1114 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1115
1116 /* offset = -0.99999(9)f */
1117 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1118 offset = LLVMConstBitCast(offset, int_vec_type);
1119
1120 /* offset = a < 0 ? offset : 0.0f */
1121 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1122 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1123
1124 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1125 }
1126
1127 /* round to nearest (toward zero) */
1128 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1129
1130 return res;
1131 }
1132
1133
1134 /**
1135 * Return ceiling of float (vector), returning int (vector).
1136 * Ex: iceil( 1.1) = 2
1137 * Ex: iceil(-1.1) = -1
1138 */
1139 LLVMValueRef
1140 lp_build_iceil(struct lp_build_context *bld,
1141 LLVMValueRef a)
1142 {
1143 const struct lp_type type = bld->type;
1144 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1145 LLVMValueRef res;
1146
1147 assert(type.floating);
1148 assert(lp_check_value(type, a));
1149
1150 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1151 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1152 }
1153 else {
1154 LLVMTypeRef vec_type = lp_build_vec_type(type);
1155 unsigned mantissa = lp_mantissa(type);
1156 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1157 LLVMValueRef sign;
1158 LLVMValueRef offset;
1159
1160 /* sign = a < 0 ? 0 : ~0 */
1161 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1162 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1163 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1164 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1165
1166 /* offset = 0.99999(9)f */
1167 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1168 offset = LLVMConstBitCast(offset, int_vec_type);
1169
1170 /* offset = a < 0 ? 0.0 : offset */
1171 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1172 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1173
1174 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1175 }
1176
1177 /* round to nearest (toward zero) */
1178 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1179
1180 return res;
1181 }
1182
1183
1184 LLVMValueRef
1185 lp_build_sqrt(struct lp_build_context *bld,
1186 LLVMValueRef a)
1187 {
1188 const struct lp_type type = bld->type;
1189 LLVMTypeRef vec_type = lp_build_vec_type(type);
1190 char intrinsic[32];
1191
1192 /* TODO: optimize the constant case */
1193 /* TODO: optimize the constant case */
1194
1195 assert(type.floating);
1196 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1197
1198 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1199 }
1200
1201
1202 LLVMValueRef
1203 lp_build_rcp(struct lp_build_context *bld,
1204 LLVMValueRef a)
1205 {
1206 const struct lp_type type = bld->type;
1207
1208 if(a == bld->zero)
1209 return bld->undef;
1210 if(a == bld->one)
1211 return bld->one;
1212 if(a == bld->undef)
1213 return bld->undef;
1214
1215 assert(type.floating);
1216
1217 if(LLVMIsConstant(a))
1218 return LLVMConstFDiv(bld->one, a);
1219
1220 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1221 /*
1222 * XXX: Added precision is not always necessary, so only enable this
1223 * when we have a better system in place to track minimum precision.
1224 */
1225
1226 #if 0
1227 /*
1228 * Do one Newton-Raphson step to improve precision:
1229 *
1230 * x1 = (2 - a * rcp(a)) * rcp(a)
1231 */
1232
1233 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1234 LLVMValueRef rcp_a;
1235 LLVMValueRef res;
1236
1237 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1238
1239 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1240 res = LLVMBuildFSub(bld->builder, two, res, "");
1241 res = LLVMBuildFMul(bld->builder, res, rcp_a, "");
1242
1243 return rcp_a;
1244 #else
1245 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1246 #endif
1247 }
1248
1249 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1250 }
1251
1252
1253 /**
1254 * Generate 1/sqrt(a)
1255 */
1256 LLVMValueRef
1257 lp_build_rsqrt(struct lp_build_context *bld,
1258 LLVMValueRef a)
1259 {
1260 const struct lp_type type = bld->type;
1261
1262 assert(type.floating);
1263
1264 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1265 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1266
1267 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1268 }
1269
1270
1271 static inline LLVMValueRef
1272 lp_build_const_v4si(unsigned long value)
1273 {
1274 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1275 LLVMValueRef elements[4] = { element, element, element, element };
1276 return LLVMConstVector(elements, 4);
1277 }
1278
1279 static inline LLVMValueRef
1280 lp_build_const_v4sf(float value)
1281 {
1282 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1283 LLVMValueRef elements[4] = { element, element, element, element };
1284 return LLVMConstVector(elements, 4);
1285 }
1286
1287
1288 /**
1289 * Generate sin(a) using SSE2
1290 */
1291 LLVMValueRef
1292 lp_build_sin(struct lp_build_context *bld,
1293 LLVMValueRef a)
1294 {
1295 struct lp_type int_type = lp_int_type(bld->type);
1296 LLVMBuilderRef b = bld->builder;
1297 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1298 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1299
1300 /*
1301 * take the absolute value,
1302 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1303 */
1304
1305 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1306 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1307
1308 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1309 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1310
1311 /*
1312 * extract the sign bit (upper one)
1313 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1314 */
1315 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1316 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1317
1318 /*
1319 * scale by 4/Pi
1320 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1321 */
1322
1323 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1324 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1325
1326 /*
1327 * store the integer part of y in mm0
1328 * emm2 = _mm_cvttps_epi32(y);
1329 */
1330
1331 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1332
1333 /*
1334 * j=(j+1) & (~1) (see the cephes sources)
1335 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1336 */
1337
1338 LLVMValueRef all_one = lp_build_const_v4si(1);
1339 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1340 /*
1341 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1342 */
1343 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1344 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1345
1346 /*
1347 * y = _mm_cvtepi32_ps(emm2);
1348 */
1349 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1350
1351 /* get the swap sign flag
1352 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1353 */
1354 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1355 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1356
1357 /*
1358 * emm2 = _mm_slli_epi32(emm0, 29);
1359 */
1360 LLVMValueRef const_29 = lp_build_const_v4si(29);
1361 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1362
1363 /*
1364 * get the polynom selection mask
1365 * there is one polynom for 0 <= x <= Pi/4
1366 * and another one for Pi/4<x<=Pi/2
1367 * Both branches will be computed.
1368 *
1369 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1370 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1371 */
1372
1373 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1374 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1375 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1376 emm2_3, lp_build_const_v4si(0));
1377 /*
1378 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1379 */
1380 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1381
1382 /*
1383 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1384 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1385 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1386 */
1387 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1388 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1389 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1390
1391 /*
1392 * The magic pass: "Extended precision modular arithmetic"
1393 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1394 * xmm1 = _mm_mul_ps(y, xmm1);
1395 * xmm2 = _mm_mul_ps(y, xmm2);
1396 * xmm3 = _mm_mul_ps(y, xmm3);
1397 */
1398 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1399 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1400 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1401
1402 /*
1403 * x = _mm_add_ps(x, xmm1);
1404 * x = _mm_add_ps(x, xmm2);
1405 * x = _mm_add_ps(x, xmm3);
1406 */
1407
1408 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1409 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1410 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1411
1412 /*
1413 * Evaluate the first polynom (0 <= x <= Pi/4)
1414 *
1415 * z = _mm_mul_ps(x,x);
1416 */
1417 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1418
1419 /*
1420 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1421 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1422 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1423 */
1424 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1425 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1426 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1427
1428 /*
1429 * y = *(v4sf*)_ps_coscof_p0;
1430 * y = _mm_mul_ps(y, z);
1431 */
1432 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1433 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1434 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1435 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1436 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1437 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1438
1439
1440 /*
1441 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1442 * y = _mm_sub_ps(y, tmp);
1443 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1444 */
1445 LLVMValueRef half = lp_build_const_v4sf(0.5);
1446 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1447 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1448 LLVMValueRef one = lp_build_const_v4sf(1.0);
1449 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1450
1451 /*
1452 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1453 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1454 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1455 */
1456 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1457 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1458 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1459
1460 /*
1461 * Evaluate the second polynom (Pi/4 <= x <= 0)
1462 *
1463 * y2 = *(v4sf*)_ps_sincof_p0;
1464 * y2 = _mm_mul_ps(y2, z);
1465 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1466 * y2 = _mm_mul_ps(y2, z);
1467 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1468 * y2 = _mm_mul_ps(y2, z);
1469 * y2 = _mm_mul_ps(y2, x);
1470 * y2 = _mm_add_ps(y2, x);
1471 */
1472
1473 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1474 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1475 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1476 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1477 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1478 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1479 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1480
1481 /*
1482 * select the correct result from the two polynoms
1483 * xmm3 = poly_mask;
1484 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1485 * y = _mm_andnot_ps(xmm3, y);
1486 * y = _mm_add_ps(y,y2);
1487 */
1488 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1489 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1490 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1491 LLVMValueRef inv = lp_build_const_v4si(~0);
1492 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1493 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1494 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1495
1496 /*
1497 * update the sign
1498 * y = _mm_xor_ps(y, sign_bit);
1499 */
1500 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1501 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1502 return y_result;
1503 }
1504
1505
1506 /**
1507 * Generate cos(a) using SSE2
1508 */
1509 LLVMValueRef
1510 lp_build_cos(struct lp_build_context *bld,
1511 LLVMValueRef a)
1512 {
1513 struct lp_type int_type = lp_int_type(bld->type);
1514 LLVMBuilderRef b = bld->builder;
1515 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1516 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1517
1518 /*
1519 * take the absolute value,
1520 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1521 */
1522
1523 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1524 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1525
1526 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1527 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1528
1529 /*
1530 * scale by 4/Pi
1531 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1532 */
1533
1534 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1535 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1536
1537 /*
1538 * store the integer part of y in mm0
1539 * emm2 = _mm_cvttps_epi32(y);
1540 */
1541
1542 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1543
1544 /*
1545 * j=(j+1) & (~1) (see the cephes sources)
1546 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1547 */
1548
1549 LLVMValueRef all_one = lp_build_const_v4si(1);
1550 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1551 /*
1552 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1553 */
1554 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1555 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1556
1557 /*
1558 * y = _mm_cvtepi32_ps(emm2);
1559 */
1560 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1561
1562
1563 /*
1564 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1565 */
1566 LLVMValueRef const_2 = lp_build_const_v4si(2);
1567 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1568
1569
1570 /* get the swap sign flag
1571 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1572 */
1573 LLVMValueRef inv = lp_build_const_v4si(~0);
1574 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1575 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1576 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1577
1578 /*
1579 * emm2 = _mm_slli_epi32(emm0, 29);
1580 */
1581 LLVMValueRef const_29 = lp_build_const_v4si(29);
1582 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1583
1584 /*
1585 * get the polynom selection mask
1586 * there is one polynom for 0 <= x <= Pi/4
1587 * and another one for Pi/4<x<=Pi/2
1588 * Both branches will be computed.
1589 *
1590 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1591 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1592 */
1593
1594 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1595 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1596 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1597 emm2_3, lp_build_const_v4si(0));
1598
1599 /*
1600 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1601 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1602 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1603 */
1604 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1605 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1606 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1607
1608 /*
1609 * The magic pass: "Extended precision modular arithmetic"
1610 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1611 * xmm1 = _mm_mul_ps(y, xmm1);
1612 * xmm2 = _mm_mul_ps(y, xmm2);
1613 * xmm3 = _mm_mul_ps(y, xmm3);
1614 */
1615 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1616 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1617 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1618
1619 /*
1620 * x = _mm_add_ps(x, xmm1);
1621 * x = _mm_add_ps(x, xmm2);
1622 * x = _mm_add_ps(x, xmm3);
1623 */
1624
1625 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1626 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1627 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1628
1629 /*
1630 * Evaluate the first polynom (0 <= x <= Pi/4)
1631 *
1632 * z = _mm_mul_ps(x,x);
1633 */
1634 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1635
1636 /*
1637 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1638 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1639 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1640 */
1641 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1642 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1643 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1644
1645 /*
1646 * y = *(v4sf*)_ps_coscof_p0;
1647 * y = _mm_mul_ps(y, z);
1648 */
1649 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1650 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1651 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1652 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1653 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1654 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1655
1656
1657 /*
1658 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1659 * y = _mm_sub_ps(y, tmp);
1660 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1661 */
1662 LLVMValueRef half = lp_build_const_v4sf(0.5);
1663 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1664 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1665 LLVMValueRef one = lp_build_const_v4sf(1.0);
1666 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1667
1668 /*
1669 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1670 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1671 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1672 */
1673 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1674 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1675 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1676
1677 /*
1678 * Evaluate the second polynom (Pi/4 <= x <= 0)
1679 *
1680 * y2 = *(v4sf*)_ps_sincof_p0;
1681 * y2 = _mm_mul_ps(y2, z);
1682 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1683 * y2 = _mm_mul_ps(y2, z);
1684 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1685 * y2 = _mm_mul_ps(y2, z);
1686 * y2 = _mm_mul_ps(y2, x);
1687 * y2 = _mm_add_ps(y2, x);
1688 */
1689
1690 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1691 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1692 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1693 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1694 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1695 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1696 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1697
1698 /*
1699 * select the correct result from the two polynoms
1700 * xmm3 = poly_mask;
1701 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1702 * y = _mm_andnot_ps(xmm3, y);
1703 * y = _mm_add_ps(y,y2);
1704 */
1705 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1706 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1707 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1708 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1709 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1710 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1711
1712 /*
1713 * update the sign
1714 * y = _mm_xor_ps(y, sign_bit);
1715 */
1716 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1717 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1718 return y_result;
1719 }
1720
1721
1722 /**
1723 * Generate pow(x, y)
1724 */
1725 LLVMValueRef
1726 lp_build_pow(struct lp_build_context *bld,
1727 LLVMValueRef x,
1728 LLVMValueRef y)
1729 {
1730 /* TODO: optimize the constant case */
1731 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1732 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1733 __FUNCTION__);
1734
1735 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1736 }
1737
1738
1739 /**
1740 * Generate exp(x)
1741 */
1742 LLVMValueRef
1743 lp_build_exp(struct lp_build_context *bld,
1744 LLVMValueRef x)
1745 {
1746 /* log2(e) = 1/log(2) */
1747 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1748
1749 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1750 }
1751
1752
1753 /**
1754 * Generate log(x)
1755 */
1756 LLVMValueRef
1757 lp_build_log(struct lp_build_context *bld,
1758 LLVMValueRef x)
1759 {
1760 /* log(2) */
1761 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1762
1763 return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1764 }
1765
1766
/* Selects which coefficient set lp_build_exp2_polynomial[] is compiled with. */
#define EXP_POLY_DEGREE 3
/* Selects which coefficient set lp_build_log2_polynomial[] is compiled with. */
#define LOG_POLY_DEGREE 5
1769
1770
1771 /**
1772 * Generate polynomial.
1773 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1774 */
1775 static LLVMValueRef
1776 lp_build_polynomial(struct lp_build_context *bld,
1777 LLVMValueRef x,
1778 const double *coeffs,
1779 unsigned num_coeffs)
1780 {
1781 const struct lp_type type = bld->type;
1782 LLVMValueRef res = NULL;
1783 unsigned i;
1784
1785 /* TODO: optimize the constant case */
1786 if(LLVMIsConstant(x))
1787 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1788 __FUNCTION__);
1789
1790 for (i = num_coeffs; i--; ) {
1791 LLVMValueRef coeff;
1792
1793 coeff = lp_build_const_vec(type, coeffs[i]);
1794
1795 if(res)
1796 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1797 else
1798 res = coeff;
1799 }
1800
1801 if(res)
1802 return res;
1803 else
1804 return bld->undef;
1805 }
1806
1807
1808 /**
1809 * Minimax polynomial fit of 2**x, in range [0, 1[
1810 */
1811 const double lp_build_exp2_polynomial[] = {
1812 #if EXP_POLY_DEGREE == 5
1813 0.999999999690134838155,
1814 0.583974334321735217258,
1815 0.164553105719676828492,
1816 0.0292811063701710962255,
1817 0.00354944426657875141846,
1818 0.000296253726543423377365
1819 #elif EXP_POLY_DEGREE == 4
1820 1.00000001502262084505,
1821 0.563586057338685991394,
1822 0.150436017652442413623,
1823 0.0243220604213317927308,
1824 0.0025359088446580436489
1825 #elif EXP_POLY_DEGREE == 3
1826 0.999925218562710312959,
1827 0.695833540494823811697,
1828 0.226067155427249155588,
1829 0.0780245226406372992967
1830 #elif EXP_POLY_DEGREE == 2
1831 1.00172476321474503578,
1832 0.657636275736077639316,
1833 0.33718943461968720704
1834 #else
1835 #error
1836 #endif
1837 };
1838
1839
1840 void
1841 lp_build_exp2_approx(struct lp_build_context *bld,
1842 LLVMValueRef x,
1843 LLVMValueRef *p_exp2_int_part,
1844 LLVMValueRef *p_frac_part,
1845 LLVMValueRef *p_exp2)
1846 {
1847 const struct lp_type type = bld->type;
1848 LLVMTypeRef vec_type = lp_build_vec_type(type);
1849 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1850 LLVMValueRef ipart = NULL;
1851 LLVMValueRef fpart = NULL;
1852 LLVMValueRef expipart = NULL;
1853 LLVMValueRef expfpart = NULL;
1854 LLVMValueRef res = NULL;
1855
1856 if(p_exp2_int_part || p_frac_part || p_exp2) {
1857 /* TODO: optimize the constant case */
1858 if(LLVMIsConstant(x))
1859 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1860 __FUNCTION__);
1861
1862 assert(type.floating && type.width == 32);
1863
1864 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1865 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1866
1867 /* ipart = floor(x) */
1868 ipart = lp_build_floor(bld, x);
1869
1870 /* fpart = x - ipart */
1871 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
1872 }
1873
1874 if(p_exp2_int_part || p_exp2) {
1875 /* expipart = (float) (1 << ipart) */
1876 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1877 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1878 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1879 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1880 }
1881
1882 if(p_exp2) {
1883 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1884 Elements(lp_build_exp2_polynomial));
1885
1886 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
1887 }
1888
1889 if(p_exp2_int_part)
1890 *p_exp2_int_part = expipart;
1891
1892 if(p_frac_part)
1893 *p_frac_part = fpart;
1894
1895 if(p_exp2)
1896 *p_exp2 = res;
1897 }
1898
1899
1900 LLVMValueRef
1901 lp_build_exp2(struct lp_build_context *bld,
1902 LLVMValueRef x)
1903 {
1904 LLVMValueRef res;
1905 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1906 return res;
1907 }
1908
1909
1910 /**
1911 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1912 * These coefficients can be generate with
1913 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1914 */
1915 const double lp_build_log2_polynomial[] = {
1916 #if LOG_POLY_DEGREE == 6
1917 3.11578814719469302614,
1918 -3.32419399085241980044,
1919 2.59883907202499966007,
1920 -1.23152682416275988241,
1921 0.318212422185251071475,
1922 -0.0344359067839062357313
1923 #elif LOG_POLY_DEGREE == 5
1924 2.8882704548164776201,
1925 -2.52074962577807006663,
1926 1.48116647521213171641,
1927 -0.465725644288844778798,
1928 0.0596515482674574969533
1929 #elif LOG_POLY_DEGREE == 4
1930 2.61761038894603480148,
1931 -1.75647175389045657003,
1932 0.688243882994381274313,
1933 -0.107254423828329604454
1934 #elif LOG_POLY_DEGREE == 3
1935 2.28330284476918490682,
1936 -1.04913055217340124191,
1937 0.204446009836232697516
1938 #else
1939 #error
1940 #endif
1941 };
1942
1943
1944 /**
1945 * See http://www.devmaster.net/forums/showthread.php?p=43580
1946 */
1947 void
1948 lp_build_log2_approx(struct lp_build_context *bld,
1949 LLVMValueRef x,
1950 LLVMValueRef *p_exp,
1951 LLVMValueRef *p_floor_log2,
1952 LLVMValueRef *p_log2)
1953 {
1954 const struct lp_type type = bld->type;
1955 LLVMTypeRef vec_type = lp_build_vec_type(type);
1956 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1957
1958 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
1959 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
1960 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1961
1962 LLVMValueRef i = NULL;
1963 LLVMValueRef exp = NULL;
1964 LLVMValueRef mant = NULL;
1965 LLVMValueRef logexp = NULL;
1966 LLVMValueRef logmant = NULL;
1967 LLVMValueRef res = NULL;
1968
1969 if(p_exp || p_floor_log2 || p_log2) {
1970 /* TODO: optimize the constant case */
1971 if(LLVMIsConstant(x))
1972 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1973 __FUNCTION__);
1974
1975 assert(type.floating && type.width == 32);
1976
1977 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1978
1979 /* exp = (float) exponent(x) */
1980 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1981 }
1982
1983 if(p_floor_log2 || p_log2) {
1984 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
1985 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
1986 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1987 }
1988
1989 if(p_log2) {
1990 /* mant = (float) mantissa(x) */
1991 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1992 mant = LLVMBuildOr(bld->builder, mant, one, "");
1993 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1994
1995 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1996 Elements(lp_build_log2_polynomial));
1997
1998 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1999 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2000
2001 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2002 }
2003
2004 if(p_exp) {
2005 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2006 *p_exp = exp;
2007 }
2008
2009 if(p_floor_log2)
2010 *p_floor_log2 = logexp;
2011
2012 if(p_log2)
2013 *p_log2 = res;
2014 }
2015
2016
2017 LLVMValueRef
2018 lp_build_log2(struct lp_build_context *bld,
2019 LLVMValueRef x)
2020 {
2021 LLVMValueRef res;
2022 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2023 return res;
2024 }