gallivm: Remove unnecessary header.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 /**
63 * Generate min(a, b)
64 * No checks for special-case values of a or b (0 or 1) are done.
65 */
66 static LLVMValueRef
67 lp_build_min_simple(struct lp_build_context *bld,
68 LLVMValueRef a,
69 LLVMValueRef b)
70 {
71 const struct lp_type type = bld->type;
72 const char *intrinsic = NULL;
73 LLVMValueRef cond;
74
75 /* TODO: optimize the constant case */
76
77 if(type.width * type.length == 128) {
78 if(type.floating) {
79 if(type.width == 32 && util_cpu_caps.has_sse)
80 intrinsic = "llvm.x86.sse.min.ps";
81 if(type.width == 64 && util_cpu_caps.has_sse2)
82 intrinsic = "llvm.x86.sse2.min.pd";
83 }
84 else {
85 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
86 intrinsic = "llvm.x86.sse2.pminu.b";
87 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
88 intrinsic = "llvm.x86.sse41.pminsb";
89 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
90 intrinsic = "llvm.x86.sse41.pminuw";
91 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
92 intrinsic = "llvm.x86.sse2.pmins.w";
93 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
94 intrinsic = "llvm.x86.sse41.pminud";
95 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminsd";
97 }
98 }
99
100 if(intrinsic)
101 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
102
103 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
104 return lp_build_select(bld, cond, a, b);
105 }
106
107
108 /**
109 * Generate max(a, b)
110 * No checks for special-case values of a or b (0 or 1) are done.
111 */
112 static LLVMValueRef
113 lp_build_max_simple(struct lp_build_context *bld,
114 LLVMValueRef a,
115 LLVMValueRef b)
116 {
117 const struct lp_type type = bld->type;
118 const char *intrinsic = NULL;
119 LLVMValueRef cond;
120
121 /* TODO: optimize the constant case */
122
123 if(type.width * type.length == 128) {
124 if(type.floating) {
125 if(type.width == 32 && util_cpu_caps.has_sse)
126 intrinsic = "llvm.x86.sse.max.ps";
127 if(type.width == 64 && util_cpu_caps.has_sse2)
128 intrinsic = "llvm.x86.sse2.max.pd";
129 }
130 else {
131 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
132 intrinsic = "llvm.x86.sse2.pmaxu.b";
133 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
134 intrinsic = "llvm.x86.sse41.pmaxsb";
135 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
136 intrinsic = "llvm.x86.sse41.pmaxuw";
137 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
138 intrinsic = "llvm.x86.sse2.pmaxs.w";
139 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
140 intrinsic = "llvm.x86.sse41.pmaxud";
141 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
142 intrinsic = "llvm.x86.sse41.pmaxsd";
143 }
144 }
145
146 if(intrinsic)
147 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
148
149 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
150 return lp_build_select(bld, cond, a, b);
151 }
152
153
154 /**
155 * Generate 1 - a, or ~a depending on bld->type.
156 */
157 LLVMValueRef
158 lp_build_comp(struct lp_build_context *bld,
159 LLVMValueRef a)
160 {
161 const struct lp_type type = bld->type;
162
163 if(a == bld->one)
164 return bld->zero;
165 if(a == bld->zero)
166 return bld->one;
167
168 if(type.norm && !type.floating && !type.fixed && !type.sign) {
169 if(LLVMIsConstant(a))
170 return LLVMConstNot(a);
171 else
172 return LLVMBuildNot(bld->builder, a, "");
173 }
174
175 if(LLVMIsConstant(a))
176 return LLVMConstSub(bld->one, a);
177 else
178 return LLVMBuildSub(bld->builder, bld->one, a, "");
179 }
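
/*
 * Worked example for the unsigned normalized shortcut above: with 8-bit
 * unorm values 1.0 is represented as 0xff, so 1.0 - a equals 0xff - a,
 * which is exactly ~a (e.g. a = 0x40: 0xff - 0x40 = 0xbf = ~0x40), so a
 * single NOT suffices instead of a subtraction.
 */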
180
181
182 /**
183 * Generate a + b
184 */
185 LLVMValueRef
186 lp_build_add(struct lp_build_context *bld,
187 LLVMValueRef a,
188 LLVMValueRef b)
189 {
190 const struct lp_type type = bld->type;
191 LLVMValueRef res;
192
193 if(a == bld->zero)
194 return b;
195 if(b == bld->zero)
196 return a;
197 if(a == bld->undef || b == bld->undef)
198 return bld->undef;
199
200 if(bld->type.norm) {
201 const char *intrinsic = NULL;
202
203 if(a == bld->one || b == bld->one)
204 return bld->one;
205
206 if(util_cpu_caps.has_sse2 &&
207 type.width * type.length == 128 &&
208 !type.floating && !type.fixed) {
209 if(type.width == 8)
210 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
211 if(type.width == 16)
212 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
213 }
214
215 if(intrinsic)
216 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
217 }
218
219 if(LLVMIsConstant(a) && LLVMIsConstant(b))
220 res = LLVMConstAdd(a, b);
221 else
222 res = LLVMBuildAdd(bld->builder, a, b, "");
223
224 /* clamp to ceiling of 1.0 */
225 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
226 res = lp_build_min_simple(bld, res, bld->one);
227
228 /* XXX clamp to floor of -1 or 0??? */
229
230 return res;
231 }
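
/*
 * E.g. for 8-bit unsigned normalized values the PADDUSB path above
 * saturates instead of wrapping: 0.8 + 0.8 (0xcc + 0xcc) yields 1.0
 * (0xff) rather than the wrapped-around 0x98.
 */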
232
233
234 /** Return the sum of the elements of a */
235 LLVMValueRef
236 lp_build_sum_vector(struct lp_build_context *bld,
237 LLVMValueRef a)
238 {
239 const struct lp_type type = bld->type;
240 LLVMValueRef index, res;
241 unsigned i;
242
243 if (a == bld->zero)
244 return bld->zero;
245 if (a == bld->undef)
246 return bld->undef;
247 assert(type.length > 1);
248
249 assert(!bld->type.norm);
250
251 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
252 res = LLVMBuildExtractElement(bld->builder, a, index, "");
253
254 for (i = 1; i < type.length; i++) {
255 index = LLVMConstInt(LLVMInt32Type(), i, 0);
256 res = LLVMBuildAdd(bld->builder, res,
257 LLVMBuildExtractElement(bld->builder, a, index, ""),
258 "");
259 }
260
261 return res;
262 }
263
264
265 /**
266 * Generate a - b
267 */
268 LLVMValueRef
269 lp_build_sub(struct lp_build_context *bld,
270 LLVMValueRef a,
271 LLVMValueRef b)
272 {
273 const struct lp_type type = bld->type;
274 LLVMValueRef res;
275
276 if(b == bld->zero)
277 return a;
278 if(a == bld->undef || b == bld->undef)
279 return bld->undef;
280 if(a == b)
281 return bld->zero;
282
283 if(bld->type.norm) {
284 const char *intrinsic = NULL;
285
286 if(b == bld->one)
287 return bld->zero;
288
289 if(util_cpu_caps.has_sse2 &&
290 type.width * type.length == 128 &&
291 !type.floating && !type.fixed) {
292 if(type.width == 8)
293 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
294 if(type.width == 16)
295 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
296 }
297
298 if(intrinsic)
299 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
300 }
301
302 if(LLVMIsConstant(a) && LLVMIsConstant(b))
303 res = LLVMConstSub(a, b);
304 else
305 res = LLVMBuildSub(bld->builder, a, b, "");
306
307 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
308 res = lp_build_max_simple(bld, res, bld->zero);
309
310 return res;
311 }
312
313
314 /**
315 * Normalized 8bit multiplication.
316 *
317 * - alpha plus one
318 *
319 * makes the following approximation to the division (Sree)
320 *
321 * a*b/255 ~= (a*(b + 1)) >> 8
322 *
323 * which is the fastest method that satisfies the following OpenGL criteria
324 *
325 * 0*0 = 0 and 255*255 = 255
326 *
327 * - geometric series
328 *
329 * takes the geometric series approximation to the division
330 *
331 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
332 *
333 * in this case just the first two terms to fit in 16bit arithmetic
334 *
335 * t/255 ~= (t + (t >> 8)) >> 8
336 *
337 * note that just by itself it doesn't satisfy the OpenGL criteria, as it
338 * yields 255*255 = 254, so the special case b = 255 must be accounted for,
339 * or roundoff must be used
340 *
341 * - geometric series plus rounding
342 *
343 * when using a geometric series division instead of truncating the result
344 * use roundoff in the approximation (Jim Blinn)
345 *
346 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
347 *
348 * achieving exact results
349 *
350 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
351 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
352 * @sa Michael Herf, The "double blend trick", May 2000,
353 * http://www.stereopsis.com/doubleblend.html
354 */
355 static LLVMValueRef
356 lp_build_mul_u8n(LLVMBuilderRef builder,
357 struct lp_type i16_type,
358 LLVMValueRef a, LLVMValueRef b)
359 {
360 LLVMValueRef c8;
361 LLVMValueRef ab;
362
363 c8 = lp_build_const_int_vec(i16_type, 8);
364
365 #if 0
366
367 /* a*b/255 ~= (a*(b + 1)) >> 8 */
368 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
369 ab = LLVMBuildMul(builder, a, b, "");
370
371 #else
372
373 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
374 ab = LLVMBuildMul(builder, a, b, "");
375 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
376 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
377
378 #endif
379
380 ab = LLVMBuildLShr(builder, ab, c8, "");
381
382 return ab;
383 }
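
/*
 * For reference, a minimal scalar sketch of the rounding variant emitted
 * above (the helper name is ours, not part of gallivm; needs <stdint.h>):
 *
 *    static uint8_t
 *    u8_mul_norm(uint8_t a, uint8_t b)
 *    {
 *       uint32_t t = (uint32_t)a * b;   // 0 .. 255*255
 *       t = t + (t >> 8) + 0x80;        // geometric series + roundoff
 *       return (uint8_t)(t >> 8);       // ~= a*b/255, exact at 0 and 255
 *    }
 */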
384
385
386 /**
387 * Generate a * b
388 */
389 LLVMValueRef
390 lp_build_mul(struct lp_build_context *bld,
391 LLVMValueRef a,
392 LLVMValueRef b)
393 {
394 const struct lp_type type = bld->type;
395 LLVMValueRef shift;
396 LLVMValueRef res;
397
398 if(a == bld->zero)
399 return bld->zero;
400 if(a == bld->one)
401 return b;
402 if(b == bld->zero)
403 return bld->zero;
404 if(b == bld->one)
405 return a;
406 if(a == bld->undef || b == bld->undef)
407 return bld->undef;
408
409 if(!type.floating && !type.fixed && type.norm) {
410 if(type.width == 8) {
411 struct lp_type i16_type = lp_wider_type(type);
412 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
413
414 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
415 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
416
417 /* PMULLW, PSRLW, PADDW */
418 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
419 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
420
421 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
422
423 return ab;
424 }
425
426 /* FIXME */
427 assert(0);
428 }
429
430 if(type.fixed)
431 shift = lp_build_const_int_vec(type, type.width/2);
432 else
433 shift = NULL;
434
435 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
436 res = LLVMConstMul(a, b);
437 if(shift) {
438 if(type.sign)
439 res = LLVMConstAShr(res, shift);
440 else
441 res = LLVMConstLShr(res, shift);
442 }
443 }
444 else {
445 res = LLVMBuildMul(bld->builder, a, b, "");
446 if(shift) {
447 if(type.sign)
448 res = LLVMBuildAShr(bld->builder, res, shift, "");
449 else
450 res = LLVMBuildLShr(bld->builder, res, shift, "");
451 }
452 }
453
454 return res;
455 }
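
/*
 * Worked example for the fixed point path above, assuming 8.8 fixed point
 * (width = 16, so shift = width/2 = 8): 0.5 * 0.75 is 0x0080 * 0x00c0 =
 * 0x6000, and shifting right by 8 gives 0x0060 = 0.375 in 8.8. Note the
 * raw product must fit in the type's width, since the multiply is done
 * in-width.
 */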
456
457
458 /**
459 * Optimized multiplication of a vector by an immediate scale factor.
460 */
461 LLVMValueRef
462 lp_build_mul_imm(struct lp_build_context *bld,
463 LLVMValueRef a,
464 int b)
465 {
466 LLVMValueRef factor;
467
468 if(b == 0)
469 return bld->zero;
470
471 if(b == 1)
472 return a;
473
474 if(b == -1)
475 return LLVMBuildNeg(bld->builder, a, "");
476
477 if(b == 2 && bld->type.floating)
478 return lp_build_add(bld, a, a);
479
480 if(util_is_pot(b)) {
481 unsigned shift = ffs(b) - 1;
482
483 if(bld->type.floating) {
484 #if 0
485 /*
486 * Power of two multiplication by directly manipulating the exponent.
487 *
488 * XXX: This might not be always faster, it will introduce a small error
489 * for multiplication by zero, and it will produce wrong results
490 * for Inf and NaN.
491 */
492 unsigned mantissa = lp_mantissa(bld->type);
493 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
494 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
495 a = LLVMBuildAdd(bld->builder, a, factor, "");
496 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
497 return a;
498 #endif
499 }
500 else {
501 factor = lp_build_const_vec(bld->type, shift);
502 return LLVMBuildShl(bld->builder, a, factor, "");
503 }
504 }
505
506 factor = lp_build_const_vec(bld->type, (double)b);
507 return lp_build_mul(bld, a, factor);
508 }
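
/*
 * E.g. lp_build_mul_imm(bld, a, 8) on an integer type emits a shift:
 * util_is_pot(8) holds and ffs(8) - 1 = 3, so the result is a << 3.
 */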
509
510
511 /**
512 * Generate a / b
513 */
514 LLVMValueRef
515 lp_build_div(struct lp_build_context *bld,
516 LLVMValueRef a,
517 LLVMValueRef b)
518 {
519 const struct lp_type type = bld->type;
520
521 if(a == bld->zero)
522 return bld->zero;
523 if(a == bld->one)
524 return lp_build_rcp(bld, b);
525 if(b == bld->zero)
526 return bld->undef;
527 if(b == bld->one)
528 return a;
529 if(a == bld->undef || b == bld->undef)
530 return bld->undef;
531
532 if(LLVMIsConstant(a) && LLVMIsConstant(b))
533 return LLVMConstFDiv(a, b);
534
535 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
536 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
537
538 return LLVMBuildFDiv(bld->builder, a, b, "");
539 }
540
541
542 /**
543 * Linear interpolation.
544 *
545 * This also works for integer values with a few caveats.
546 *
547 * @sa http://www.stereopsis.com/doubleblend.html
548 */
549 LLVMValueRef
550 lp_build_lerp(struct lp_build_context *bld,
551 LLVMValueRef x,
552 LLVMValueRef v0,
553 LLVMValueRef v1)
554 {
555 LLVMValueRef delta;
556 LLVMValueRef res;
557
558 delta = lp_build_sub(bld, v1, v0);
559
560 res = lp_build_mul(bld, x, delta);
561
562 res = lp_build_add(bld, v0, res);
563
564 if(bld->type.fixed)
565 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
566 * but it will be wrong for other uses. Basically we need a more
567 * powerful lp_type, capable of further distinguishing the values
568 * interpretation from the value storage. */
569 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
570
571 return res;
572 }
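
/*
 * Worked example, assuming 8-bit colors lerped in 16-bit 8.8 fixed point:
 * for x = 0x80 (0.5), v0 = 10, v1 = 20 the code above computes
 * delta = 10, x*delta >> 8 = 5, v0 + 5 = 15, and the mask keeps the low
 * 8 bits, yielding 15 as expected.
 */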
573
574
575 LLVMValueRef
576 lp_build_lerp_2d(struct lp_build_context *bld,
577 LLVMValueRef x,
578 LLVMValueRef y,
579 LLVMValueRef v00,
580 LLVMValueRef v01,
581 LLVMValueRef v10,
582 LLVMValueRef v11)
583 {
584 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
585 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
586 return lp_build_lerp(bld, y, v0, v1);
587 }
588
589
590 /**
591 * Generate min(a, b)
592 * Do checks for special cases.
593 */
594 LLVMValueRef
595 lp_build_min(struct lp_build_context *bld,
596 LLVMValueRef a,
597 LLVMValueRef b)
598 {
599 if(a == bld->undef || b == bld->undef)
600 return bld->undef;
601
602 if(a == b)
603 return a;
604
605 if(bld->type.norm) {
606 if(a == bld->zero || b == bld->zero)
607 return bld->zero;
608 if(a == bld->one)
609 return b;
610 if(b == bld->one)
611 return a;
612 }
613
614 return lp_build_min_simple(bld, a, b);
615 }
616
617
618 /**
619 * Generate max(a, b)
620 * Do checks for special cases.
621 */
622 LLVMValueRef
623 lp_build_max(struct lp_build_context *bld,
624 LLVMValueRef a,
625 LLVMValueRef b)
626 {
627 if(a == bld->undef || b == bld->undef)
628 return bld->undef;
629
630 if(a == b)
631 return a;
632
633 if(bld->type.norm) {
634 if(a == bld->one || b == bld->one)
635 return bld->one;
636 if(a == bld->zero)
637 return b;
638 if(b == bld->zero)
639 return a;
640 }
641
642 return lp_build_max_simple(bld, a, b);
643 }
644
645
646 /**
647 * Generate clamp(a, min, max)
648 * Do checks for special cases.
649 */
650 LLVMValueRef
651 lp_build_clamp(struct lp_build_context *bld,
652 LLVMValueRef a,
653 LLVMValueRef min,
654 LLVMValueRef max)
655 {
656 a = lp_build_min(bld, a, max);
657 a = lp_build_max(bld, a, min);
658 return a;
659 }
660
661
662 /**
663 * Generate abs(a)
664 */
665 LLVMValueRef
666 lp_build_abs(struct lp_build_context *bld,
667 LLVMValueRef a)
668 {
669 const struct lp_type type = bld->type;
670 LLVMTypeRef vec_type = lp_build_vec_type(type);
671
672 if(!type.sign)
673 return a;
674
675 if(type.floating) {
676 /* Mask out the sign bit */
677 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
678 unsigned long long absMask = ~(1ULL << (type.width - 1));
679 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
680 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
681 a = LLVMBuildAnd(bld->builder, a, mask, "");
682 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
683 return a;
684 }
685
686 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
687 switch(type.width) {
688 case 8:
689 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
690 case 16:
691 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
692 case 32:
693 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
694 }
695 }
696
697 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
698 }
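
/*
 * E.g. for 32-bit floats the mask above is ~(1 << 31) = 0x7fffffff:
 * -2.0f has bit pattern 0xc0000000, and 0xc0000000 & 0x7fffffff =
 * 0x40000000, the bit pattern of 2.0f.
 */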
699
700
701 LLVMValueRef
702 lp_build_negate(struct lp_build_context *bld,
703 LLVMValueRef a)
704 {
705 return LLVMBuildNeg(bld->builder, a, "");
706 }
707
708
709 /** Return -1, 0 or +1 depending on the sign of a */
710 LLVMValueRef
711 lp_build_sgn(struct lp_build_context *bld,
712 LLVMValueRef a)
713 {
714 const struct lp_type type = bld->type;
715 LLVMValueRef cond;
716 LLVMValueRef res;
717
718 /* Handle non-zero case */
719 if(!type.sign) {
720 /* if not zero then sign must be positive */
721 res = bld->one;
722 }
723 else if(type.floating) {
724 LLVMTypeRef vec_type;
725 LLVMTypeRef int_type;
726 LLVMValueRef mask;
727 LLVMValueRef sign;
728 LLVMValueRef one;
729 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
730
731 int_type = lp_build_int_vec_type(type);
732 vec_type = lp_build_vec_type(type);
733 mask = lp_build_const_int_vec(type, maskBit);
734
735 /* Take the sign bit and OR it into the constant 1.0 */
736 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
737 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
738 one = LLVMConstBitCast(bld->one, int_type);
739 res = LLVMBuildOr(bld->builder, sign, one, "");
740 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
741 }
742 else
743 {
744 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
745 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
746 res = lp_build_select(bld, cond, bld->one, minus_one);
747 }
748
749 /* Handle zero */
750 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
751 res = lp_build_select(bld, cond, bld->zero, res);
752
753 return res;
754 }
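
/*
 * Worked example for the floating point path above: 1.0f is 0x3f800000;
 * for a = -5.0f the extracted sign bit is 0x80000000, and
 * 0x80000000 | 0x3f800000 = 0xbf800000, the bit pattern of -1.0f. The
 * final select then maps a == 0 to 0.
 */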
755
756
757 /**
758 * Set the sign of float vector 'a' according to 'sign'.
759 * If sign==0, return abs(a).
760 * If sign==1, return -abs(a);
761 * Other values for sign produce undefined results.
762 */
763 LLVMValueRef
764 lp_build_set_sign(struct lp_build_context *bld,
765 LLVMValueRef a, LLVMValueRef sign)
766 {
767 const struct lp_type type = bld->type;
768 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
769 LLVMTypeRef vec_type = lp_build_vec_type(type);
770 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
771 LLVMValueRef mask = lp_build_const_int_vec(type,
772 ~((unsigned long long) 1 << (type.width - 1)));
773 LLVMValueRef val, res;
774
775 assert(type.floating);
776
777 /* val = reinterpret_cast<int>(a) */
778 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
779 /* val = val & mask */
780 val = LLVMBuildAnd(bld->builder, val, mask, "");
781 /* sign = sign << shift */
782 sign = LLVMBuildShl(bld->builder, sign, shift, "");
783 /* res = val | sign */
784 res = LLVMBuildOr(bld->builder, val, sign, "");
785 /* res = reinterpret_cast<float>(res) */
786 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
787
788 return res;
789 }
790
791
792 /**
793 * Convert vector of (or scalar) int to vector of (or scalar) float.
794 */
795 LLVMValueRef
796 lp_build_int_to_float(struct lp_build_context *bld,
797 LLVMValueRef a)
798 {
799 const struct lp_type type = bld->type;
800 LLVMTypeRef vec_type = lp_build_vec_type(type);
801
802 assert(type.floating);
803
804 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
805 }
806
807
808
809 enum lp_build_round_sse41_mode
810 {
811 LP_BUILD_ROUND_SSE41_NEAREST = 0,
812 LP_BUILD_ROUND_SSE41_FLOOR = 1,
813 LP_BUILD_ROUND_SSE41_CEIL = 2,
814 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
815 };
816
817
818 static INLINE LLVMValueRef
819 lp_build_round_sse41(struct lp_build_context *bld,
820 LLVMValueRef a,
821 enum lp_build_round_sse41_mode mode)
822 {
823 const struct lp_type type = bld->type;
824 LLVMTypeRef vec_type = lp_build_vec_type(type);
825 const char *intrinsic;
826
827 assert(type.floating);
828 assert(type.width*type.length == 128);
829 assert(lp_check_value(type, a));
830 assert(util_cpu_caps.has_sse4_1);
831
832 switch(type.width) {
833 case 32:
834 intrinsic = "llvm.x86.sse41.round.ps";
835 break;
836 case 64:
837 intrinsic = "llvm.x86.sse41.round.pd";
838 break;
839 default:
840 assert(0);
841 return bld->undef;
842 }
843
844 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
845 LLVMConstInt(LLVMInt32Type(), mode, 0));
846 }
847
848
849 /**
850 * Return the integer part of a float (vector) value. The returned value is
851 * a float (vector).
852 * Ex: trunc(-1.5) = -1.0
853 */
854 LLVMValueRef
855 lp_build_trunc(struct lp_build_context *bld,
856 LLVMValueRef a)
857 {
858 const struct lp_type type = bld->type;
859
860 assert(type.floating);
861 assert(lp_check_value(type, a));
862
863 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
864 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
865 else {
866 LLVMTypeRef vec_type = lp_build_vec_type(type);
867 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
868 LLVMValueRef res;
869 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
870 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
871 return res;
872 }
873 }
874
875
876 /**
877 * Return float (vector) rounded to nearest integer (vector). The returned
878 * value is a float (vector).
879 * Ex: round(0.9) = 1.0
880 * Ex: round(-1.5) = -2.0
881 */
882 LLVMValueRef
883 lp_build_round(struct lp_build_context *bld,
884 LLVMValueRef a)
885 {
886 const struct lp_type type = bld->type;
887
888 assert(type.floating);
889 assert(lp_check_value(type, a));
890
891 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
892 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
893 else {
894 LLVMTypeRef vec_type = lp_build_vec_type(type);
895 LLVMValueRef res;
896 res = lp_build_iround(bld, a);
897 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
898 return res;
899 }
900 }
901
902
903 /**
904 * Return floor of float (vector), result is a float (vector)
905 * Ex: floor(1.1) = 1.0
906 * Ex: floor(-1.1) = -2.0
907 */
908 LLVMValueRef
909 lp_build_floor(struct lp_build_context *bld,
910 LLVMValueRef a)
911 {
912 const struct lp_type type = bld->type;
913
914 assert(type.floating);
915 assert(lp_check_value(type, a));
916
917 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
918 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
919 else {
920 LLVMTypeRef vec_type = lp_build_vec_type(type);
921 LLVMValueRef res;
922 res = lp_build_ifloor(bld, a);
923 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
924 return res;
925 }
926 }
927
928
929 /**
930 * Return ceiling of float (vector), returning float (vector).
931 * Ex: ceil( 1.1) = 2.0
932 * Ex: ceil(-1.1) = -1.0
933 */
934 LLVMValueRef
935 lp_build_ceil(struct lp_build_context *bld,
936 LLVMValueRef a)
937 {
938 const struct lp_type type = bld->type;
939
940 assert(type.floating);
941 assert(lp_check_value(type, a));
942
943 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
944 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
945 else {
946 LLVMTypeRef vec_type = lp_build_vec_type(type);
947 LLVMValueRef res;
948 res = lp_build_iceil(bld, a);
949 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
950 return res;
951 }
952 }
953
954
955 /**
956 * Return fractional part of 'a' computed as a - floor(a)
957 * Typically used in texture coord arithmetic.
958 */
959 LLVMValueRef
960 lp_build_fract(struct lp_build_context *bld,
961 LLVMValueRef a)
962 {
963 assert(bld->type.floating);
964 return lp_build_sub(bld, a, lp_build_floor(bld, a));
965 }
966
967
968 /**
969 * Return the integer part of a float (vector) value. The returned value is
970 * an integer (vector).
971 * Ex: itrunc(-1.5) = -1
972 */
973 LLVMValueRef
974 lp_build_itrunc(struct lp_build_context *bld,
975 LLVMValueRef a)
976 {
977 const struct lp_type type = bld->type;
978 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
979
980 assert(type.floating);
981 assert(lp_check_value(type, a));
982
983 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
984 }
985
986
987 /**
988 * Return float (vector) rounded to nearest integer (vector). The returned
989 * value is an integer (vector).
990 * Ex: iround(0.9) = 1
991 * Ex: iround(-1.5) = -2
992 */
993 LLVMValueRef
994 lp_build_iround(struct lp_build_context *bld,
995 LLVMValueRef a)
996 {
997 const struct lp_type type = bld->type;
998 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
999 LLVMValueRef res;
1000
1001 assert(type.floating);
1002
1003 assert(lp_check_value(type, a));
1004
1005 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1006 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1007 }
1008 else {
1009 LLVMTypeRef vec_type = lp_build_vec_type(type);
1010 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1011 LLVMValueRef sign;
1012 LLVMValueRef half;
1013
1014 /* get sign bit */
1015 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1016 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1017
1018 /* sign * 0.5 */
1019 half = lp_build_const_vec(type, 0.5);
1020 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1021 half = LLVMBuildOr(bld->builder, sign, half, "");
1022 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1023
1024 res = LLVMBuildAdd(bld->builder, a, half, "");
1025 }
1026
1027 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1028
1029 return res;
1030 }
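
/*
 * Worked example for the non-SSE4.1 path above: for a = -1.5 the sign bit
 * is set, so half becomes -0.5; a + half = -2.0, and the truncating
 * FPToSI yields iround(-1.5) = -2. For a = 0.9, 0.9 + 0.5 = 1.4
 * truncates to 1.
 */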
1031
1032
1033 /**
1034 * Return floor of float (vector), result is an int (vector)
1035 * Ex: ifloor(1.1) = 1
1036 * Ex: ifloor(-1.1) = -2
1037 */
1038 LLVMValueRef
1039 lp_build_ifloor(struct lp_build_context *bld,
1040 LLVMValueRef a)
1041 {
1042 const struct lp_type type = bld->type;
1043 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1044 LLVMValueRef res;
1045
1046 assert(type.floating);
1047 assert(lp_check_value(type, a));
1048
1049 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1050 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1051 }
1052 else {
1053 /* Use the sign bit to add a -0.99999(9) offset to negative values */
1054 LLVMTypeRef vec_type = lp_build_vec_type(type);
1055 unsigned mantissa = lp_mantissa(type);
1056 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1057 LLVMValueRef sign;
1058 LLVMValueRef offset;
1059
1060 /* sign = a < 0 ? ~0 : 0 */
1061 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1062 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1063 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1064
1065 /* offset = -0.99999(9)f */
1066 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1067 offset = LLVMConstBitCast(offset, int_vec_type);
1068
1069 /* offset = a < 0 ? offset : 0.0f */
1070 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1071 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1072
1073 res = LLVMBuildAdd(bld->builder, a, offset, "ifloor.res");
1074 }
1075
1076 /* truncate (round toward zero) */
1077 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1078
1079 return res;
1080 }
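
/*
 * Worked example for the non-SSE4.1 path above: for a = -1.1 the sign
 * mask is all ones, so the -0.99999(9) offset is kept; a + offset =
 * -2.0999..., and truncation toward zero gives ifloor(-1.1) = -2. For
 * a = 1.1 the offset is zeroed out and truncation gives 1.
 */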
1081
1082
1083 /**
1084 * Return ceiling of float (vector), returning int (vector).
1085 * Ex: iceil( 1.1) = 2
1086 * Ex: iceil(-1.1) = -1
1087 */
1088 LLVMValueRef
1089 lp_build_iceil(struct lp_build_context *bld,
1090 LLVMValueRef a)
1091 {
1092 const struct lp_type type = bld->type;
1093 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1094 LLVMValueRef res;
1095
1096 assert(type.floating);
1097 assert(lp_check_value(type, a));
1098
1099 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1100 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1101 }
1102 else {
1103 LLVMTypeRef vec_type = lp_build_vec_type(type);
1104 unsigned mantissa = lp_mantissa(type);
1105 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1106 LLVMValueRef sign;
1107 LLVMValueRef offset;
1108
1109 /* sign = a < 0 ? 0 : ~0 */
1110 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1111 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1112 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1113 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1114
1115 /* offset = 0.99999(9)f */
1116 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1117 offset = LLVMConstBitCast(offset, int_vec_type);
1118
1119 /* offset = a < 0 ? 0.0 : offset */
1120 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1121 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1122
1123 res = LLVMBuildAdd(bld->builder, a, offset, "iceil.res");
1124 }
1125
1126 /* truncate (round toward zero) */
1127 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1128
1129 return res;
1130 }
1131
1132
1133 LLVMValueRef
1134 lp_build_sqrt(struct lp_build_context *bld,
1135 LLVMValueRef a)
1136 {
1137 const struct lp_type type = bld->type;
1138 LLVMTypeRef vec_type = lp_build_vec_type(type);
1139 char intrinsic[32];
1140
1141 /* TODO: optimize the constant case */
1143
1144 assert(type.floating);
1145 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1146
1147 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1148 }
1149
1150
1151 LLVMValueRef
1152 lp_build_rcp(struct lp_build_context *bld,
1153 LLVMValueRef a)
1154 {
1155 const struct lp_type type = bld->type;
1156
1157 if(a == bld->zero)
1158 return bld->undef;
1159 if(a == bld->one)
1160 return bld->one;
1161 if(a == bld->undef)
1162 return bld->undef;
1163
1164 assert(type.floating);
1165
1166 if(LLVMIsConstant(a))
1167 return LLVMConstFDiv(bld->one, a);
1168
1169 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1170 /*
1171 * XXX: Added precision is not always necessary, so only enable this
1172 * when we have a better system in place to track minimum precision.
1173 */
1174
1175 #if 0
1176 /*
1177 * Do one Newton-Raphson step to improve precision:
1178 *
1179 * x1 = (2 - a * rcp(a)) * rcp(a)
1180 */
1181
1182 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1183 LLVMValueRef rcp_a;
1184 LLVMValueRef res;
1185
1186 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1187
1188 res = LLVMBuildMul(bld->builder, a, rcp_a, "");
1189 res = LLVMBuildSub(bld->builder, two, res, "");
1190 res = LLVMBuildMul(bld->builder, res, rcp_a, "");
1191
1192 return res;
1193 #else
1194 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1195 #endif
1196 }
1197
1198 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1199 }
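
/*
 * For reference: SSE RCPPS is only accurate to about 12 bits, and one
 * Newton-Raphson step as in the disabled block above roughly doubles
 * that. E.g. for a = 3.0, an estimate of 0.33325 refines to
 * (2 - 3*0.33325) * 0.33325 ~= 0.3333333.
 */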
1200
1201
1202 /**
1203 * Generate 1/sqrt(a)
1204 */
1205 LLVMValueRef
1206 lp_build_rsqrt(struct lp_build_context *bld,
1207 LLVMValueRef a)
1208 {
1209 const struct lp_type type = bld->type;
1210
1211 assert(type.floating);
1212
1213 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1214 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1215
1216 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1217 }
1218
1219
1220 static inline LLVMValueRef
1221 lp_build_const_v4si(unsigned long value)
1222 {
1223 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1224 LLVMValueRef elements[4] = { element, element, element, element };
1225 return LLVMConstVector(elements, 4);
1226 }
1227
1228 static inline LLVMValueRef
1229 lp_build_const_v4sf(float value)
1230 {
1231 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1232 LLVMValueRef elements[4] = { element, element, element, element };
1233 return LLVMConstVector(elements, 4);
1234 }
1235
1236
1237 /**
1238 * Generate sin(a) using SSE2
1239 */
1240 LLVMValueRef
1241 lp_build_sin(struct lp_build_context *bld,
1242 LLVMValueRef a)
1243 {
1244 struct lp_type int_type = lp_int_type(bld->type);
1245 LLVMBuilderRef b = bld->builder;
1246 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1247 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1248
1249 /*
1250 * take the absolute value,
1251 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1252 */
1253
1254 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1255 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1256
1257 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1258 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1259
1260 /*
1261 * extract the sign bit (upper one)
1262 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1263 */
1264 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1265 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1266
1267 /*
1268 * scale by 4/Pi
1269 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1270 */
1271
1272 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1273 LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y");
1274
1275 /*
1276 * store the integer part of y in mm0
1277 * emm2 = _mm_cvttps_epi32(y);
1278 */
1279
1280 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1281
1282 /*
1283 * j=(j+1) & (~1) (see the cephes sources)
1284 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1285 */
1286
1287 LLVMValueRef all_one = lp_build_const_v4si(1);
1288 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1289 /*
1290 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1291 */
1292 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1293 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1294
1295 /*
1296 * y = _mm_cvtepi32_ps(emm2);
1297 */
1298 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1299
1300 /* get the swap sign flag
1301 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1302 */
1303 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1304 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1305
1306 /*
1307 * emm2 = _mm_slli_epi32(emm0, 29);
1308 */
1309 LLVMValueRef const_29 = lp_build_const_v4si(29);
1310 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1311
1312 /*
1313 * get the polynomial selection mask
1314 * there is one polynomial for 0 <= x <= Pi/4
1315 * and another one for Pi/4 < x <= Pi/2
1316 * Both branches will be computed.
1317 *
1318 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1319 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1320 */
1321
1322 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1323 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1324 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1325 emm2_3, lp_build_const_v4si(0));
1326 /*
1327 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1328 */
1329 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1330
1331 /*
1332 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1333 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1334 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1335 */
1336 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1337 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1338 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1339
1340 /*
1341 * The magic pass: "Extended precision modular arithmetic"
1342 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1343 * xmm1 = _mm_mul_ps(y, xmm1);
1344 * xmm2 = _mm_mul_ps(y, xmm2);
1345 * xmm3 = _mm_mul_ps(y, xmm3);
1346 */
1347 LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1");
1348 LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2");
1349 LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3");
1350
1351 /*
1352 * x = _mm_add_ps(x, xmm1);
1353 * x = _mm_add_ps(x, xmm2);
1354 * x = _mm_add_ps(x, xmm3);
1355 */
1356
1357 LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1");
1358 LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2");
1359 LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3");
1360
1361 /*
1362 * Evaluate the first polynomial (0 <= x <= Pi/4)
1363 *
1364 * z = _mm_mul_ps(x,x);
1365 */
1366 LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z");
1367
1368 /*
1369 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1370 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1371 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1372 */
1373 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1374 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1375 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1376
1377 /*
1378 * y = *(v4sf*)_ps_coscof_p0;
1379 * y = _mm_mul_ps(y, z);
1380 */
1381 LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3");
1382 LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4");
1383 LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5");
1384 LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6");
1385 LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7");
1386 LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8");
1387
1388
1389 /*
1390 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1391 * y = _mm_sub_ps(y, tmp);
1392 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1393 */
1394 LLVMValueRef half = lp_build_const_v4sf(0.5);
1395 LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp");
1396 LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8");
1397 LLVMValueRef one = lp_build_const_v4sf(1.0);
1398 LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9");
1399
1400 /*
1401 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1402 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1403 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1404 */
1405 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1406 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1407 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1408
1409 /*
1410 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1411 *
1412 * y2 = *(v4sf*)_ps_sincof_p0;
1413 * y2 = _mm_mul_ps(y2, z);
1414 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1415 * y2 = _mm_mul_ps(y2, z);
1416 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1417 * y2 = _mm_mul_ps(y2, z);
1418 * y2 = _mm_mul_ps(y2, x);
1419 * y2 = _mm_add_ps(y2, x);
1420 */
1421
1422 LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3");
1423 LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4");
1424 LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5");
1425 LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6");
1426 LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7");
1427 LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8");
1428 LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9");
1429
1430 /*
1431 * select the correct result from the two polynomials
1432 * xmm3 = poly_mask;
1433 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1434 * y = _mm_andnot_ps(xmm3, y);
1435 * y = _mm_add_ps(y,y2);
1436 */
1437 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1438 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1439 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1440 LLVMValueRef inv = lp_build_const_v4si(~0);
1441 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1442 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1443 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1444
1445 /*
1446 * update the sign
1447 * y = _mm_xor_ps(y, sign_bit);
1448 */
1449 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1450 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1451 return y_result;
1452 }
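
/*
 * Worked example of the reduction above for a = 3.0: y = 3.0*4/Pi ~= 3.82,
 * so j = 3, which (j+1) & ~1 rounds to the even j = 4; x is reduced to
 * 3.0 - 4*Pi/4 ~= -0.1416, the sine polynomial gives ~ -0.1411, and the
 * swap-sign bit (j & 4) flips it to sin(3.0) ~= 0.1411.
 */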
1453
1454
1455 /**
1456 * Generate cos(a) using SSE2
1457 */
1458 LLVMValueRef
1459 lp_build_cos(struct lp_build_context *bld,
1460 LLVMValueRef a)
1461 {
1462 struct lp_type int_type = lp_int_type(bld->type);
1463 LLVMBuilderRef b = bld->builder;
1464 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1465 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1466
1467 /*
1468 * take the absolute value,
1469 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1470 */
1471
1472 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1473 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1474
1475 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1476 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1477
1478 /*
1479 * scale by 4/Pi
1480 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1481 */
1482
1483 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1484 LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y");
1485
1486 /*
1487 * store the integer part of y in mm0
1488 * emm2 = _mm_cvttps_epi32(y);
1489 */
1490
1491 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1492
1493 /*
1494 * j=(j+1) & (~1) (see the cephes sources)
1495 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1496 */
1497
1498 LLVMValueRef all_one = lp_build_const_v4si(1);
1499 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1500 /*
1501 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1502 */
1503 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1504 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1505
1506 /*
1507 * y = _mm_cvtepi32_ps(emm2);
1508 */
1509 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1510
1511
1512 /*
1513 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1514 */
1515 LLVMValueRef const_2 = lp_build_const_v4si(2);
1516 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1517
1518
1519 /* get the swap sign flag
1520 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1521 */
1522 LLVMValueRef inv = lp_build_const_v4si(~0);
1523 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1524 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1525 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1526
1527 /*
1528 * emm2 = _mm_slli_epi32(emm0, 29);
1529 */
1530 LLVMValueRef const_29 = lp_build_const_v4si(29);
1531 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1532
1533 /*
1534 * get the polynomial selection mask
1535 * there is one polynomial for 0 <= x <= Pi/4
1536 * and another one for Pi/4 < x <= Pi/2
1537 * Both branches will be computed.
1538 *
1539 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1540 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1541 */
1542
1543 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1544 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1545 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1546 emm2_3, lp_build_const_v4si(0));
1547
1548 /*
1549 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1550 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1551 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1552 */
1553 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1554 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1555 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1556
1557 /*
1558 * The magic pass: "Extended precision modular arithmetic"
1559 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1560 * xmm1 = _mm_mul_ps(y, xmm1);
1561 * xmm2 = _mm_mul_ps(y, xmm2);
1562 * xmm3 = _mm_mul_ps(y, xmm3);
1563 */
1564 LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1");
1565 LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2");
1566 LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3");
1567
1568 /*
1569 * x = _mm_add_ps(x, xmm1);
1570 * x = _mm_add_ps(x, xmm2);
1571 * x = _mm_add_ps(x, xmm3);
1572 */
1573
1574 LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1");
1575 LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2");
1576 LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3");
1577
1578 /*
1579 * Evaluate the first polynomial (0 <= x <= Pi/4)
1580 *
1581 * z = _mm_mul_ps(x,x);
1582 */
1583 LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z");
1584
1585 /*
1586 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1587 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1588 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1589 */
1590 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1591 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1592 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1593
1594 /*
1595 * y = *(v4sf*)_ps_coscof_p0;
1596 * y = _mm_mul_ps(y, z);
1597 */
1598 LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3");
1599 LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4");
1600 LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5");
1601 LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6");
1602 LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7");
1603 LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8");
1604
1605
1606 /*
1607 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1608 * y = _mm_sub_ps(y, tmp);
1609 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1610 */
1611 LLVMValueRef half = lp_build_const_v4sf(0.5);
1612 LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp");
1613 LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8");
1614 LLVMValueRef one = lp_build_const_v4sf(1.0);
1615 LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9");
1616
1617 /*
1618 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1619 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1620 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1621 */
1622 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1623 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1624 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1625
1626 /*
1627 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1628 *
1629 * y2 = *(v4sf*)_ps_sincof_p0;
1630 * y2 = _mm_mul_ps(y2, z);
1631 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1632 * y2 = _mm_mul_ps(y2, z);
1633 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1634 * y2 = _mm_mul_ps(y2, z);
1635 * y2 = _mm_mul_ps(y2, x);
1636 * y2 = _mm_add_ps(y2, x);
1637 */
1638
1639 LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3");
1640 LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4");
1641 LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5");
1642 LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6");
1643 LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7");
1644 LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8");
1645 LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9");
1646
1647 /*
1648 * select the correct result from the two polynomials
1649 * xmm3 = poly_mask;
1650 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1651 * y = _mm_andnot_ps(xmm3, y);
1652 * y = _mm_add_ps(y,y2);
1653 */
1654 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1655 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1656 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1657 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1658 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1659 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1660
1661 /*
1662 * update the sign
1663 * y = _mm_xor_ps(y, sign_bit);
1664 */
1665 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1666 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1667 return y_result;
1668 }
1669
1670
1671 /**
1672 * Generate pow(x, y)
1673 */
1674 LLVMValueRef
1675 lp_build_pow(struct lp_build_context *bld,
1676 LLVMValueRef x,
1677 LLVMValueRef y)
1678 {
1679 /* TODO: optimize the constant case */
1680 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1681 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1682 __FUNCTION__);
1683
1684 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1685 }
1686
1687
1688 /**
1689 * Generate exp(x)
1690 */
1691 LLVMValueRef
1692 lp_build_exp(struct lp_build_context *bld,
1693 LLVMValueRef x)
1694 {
1695 /* log2(e) = 1/log(2) */
1696 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1697
1698 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1699 }
1700
1701
1702 /**
1703 * Generate log(x)
1704 */
1705 LLVMValueRef
1706 lp_build_log(struct lp_build_context *bld,
1707 LLVMValueRef x)
1708 {
1709 /* log(2) */
1710 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1711
1712 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1713 }
1714
1715
1716 #define EXP_POLY_DEGREE 3
1717 #define LOG_POLY_DEGREE 5
1718
1719
1720 /**
1721 * Generate polynomial.
1722 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1723 */
1724 static LLVMValueRef
1725 lp_build_polynomial(struct lp_build_context *bld,
1726 LLVMValueRef x,
1727 const double *coeffs,
1728 unsigned num_coeffs)
1729 {
1730 const struct lp_type type = bld->type;
1731 LLVMValueRef res = NULL;
1732 unsigned i;
1733
1734 /* TODO: optimize the constant case */
1735 if(LLVMIsConstant(x))
1736 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1737 __FUNCTION__);
1738
1739 for (i = num_coeffs; i--; ) {
1740 LLVMValueRef coeff;
1741
1742 coeff = lp_build_const_vec(type, coeffs[i]);
1743
1744 if(res)
1745 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1746 else
1747 res = coeff;
1748 }
1749
1750 if(res)
1751 return res;
1752 else
1753 return bld->undef;
1754 }
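
/*
 * The loop above is Horner's scheme; a scalar equivalent (the helper name
 * is ours):
 *
 *    static double
 *    poly_eval(const double *coeffs, unsigned num_coeffs, double x)
 *    {
 *       double res = 0.0;
 *       unsigned i;
 *       for (i = num_coeffs; i--; )
 *          res = coeffs[i] + x * res;
 *       return res;
 *    }
 *
 * For num_coeffs = 3 this yields coeffs[0] + x*(coeffs[1] + x*coeffs[2]).
 */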
1755
1756
1757 /**
1758 * Minimax polynomial fit of 2**x, in range [0, 1[
1759 */
1760 const double lp_build_exp2_polynomial[] = {
1761 #if EXP_POLY_DEGREE == 5
1762 0.999999999690134838155,
1763 0.583974334321735217258,
1764 0.164553105719676828492,
1765 0.0292811063701710962255,
1766 0.00354944426657875141846,
1767 0.000296253726543423377365
1768 #elif EXP_POLY_DEGREE == 4
1769 1.00000001502262084505,
1770 0.563586057338685991394,
1771 0.150436017652442413623,
1772 0.0243220604213317927308,
1773 0.0025359088446580436489
1774 #elif EXP_POLY_DEGREE == 3
1775 0.999925218562710312959,
1776 0.695833540494823811697,
1777 0.226067155427249155588,
1778 0.0780245226406372992967
1779 #elif EXP_POLY_DEGREE == 2
1780 1.00172476321474503578,
1781 0.657636275736077639316,
1782 0.33718943461968720704
1783 #else
1784 #error
1785 #endif
1786 };
1787
1788
1789 void
1790 lp_build_exp2_approx(struct lp_build_context *bld,
1791 LLVMValueRef x,
1792 LLVMValueRef *p_exp2_int_part,
1793 LLVMValueRef *p_frac_part,
1794 LLVMValueRef *p_exp2)
1795 {
1796 const struct lp_type type = bld->type;
1797 LLVMTypeRef vec_type = lp_build_vec_type(type);
1798 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1799 LLVMValueRef ipart = NULL;
1800 LLVMValueRef fpart = NULL;
1801 LLVMValueRef expipart = NULL;
1802 LLVMValueRef expfpart = NULL;
1803 LLVMValueRef res = NULL;
1804
1805 if(p_exp2_int_part || p_frac_part || p_exp2) {
1806 /* TODO: optimize the constant case */
1807 if(LLVMIsConstant(x))
1808 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1809 __FUNCTION__);
1810
1811 assert(type.floating && type.width == 32);
1812
1813 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1814 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1815
1816 /* ipart = floor(x) */
1817 ipart = lp_build_floor(bld, x);
1818
1819 /* fpart = x - ipart */
1820 fpart = LLVMBuildSub(bld->builder, x, ipart, "");
1821 }
1822
1823 if(p_exp2_int_part || p_exp2) {
1824 /* expipart = (float) (1 << ipart) */
1825 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1826 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1827 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1828 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1829 }
1830
1831 if(p_exp2) {
1832 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1833 Elements(lp_build_exp2_polynomial));
1834
1835 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1836 }
1837
1838 if(p_exp2_int_part)
1839 *p_exp2_int_part = expipart;
1840
1841 if(p_frac_part)
1842 *p_frac_part = fpart;
1843
1844 if(p_exp2)
1845 *p_exp2 = res;
1846 }
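
/*
 * Scalar sketch of the expipart construction above, assuming IEEE-754
 * single precision (the helper name is ours; needs <stdint.h>):
 *
 *    static float
 *    exp2_int_part(int32_t ipart)   // ipart in [-126, 128]
 *    {
 *       union { int32_t i; float f; } v;
 *       v.i = (ipart + 127) << 23;  // biased exponent, zero mantissa
 *       return v.f;                 // == 2^ipart
 *    }
 *
 * E.g. ipart = 3 gives (3 + 127) << 23 = 0x41000000, the bit pattern of 8.0f.
 */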
1847
1848
1849 LLVMValueRef
1850 lp_build_exp2(struct lp_build_context *bld,
1851 LLVMValueRef x)
1852 {
1853 LLVMValueRef res;
1854 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1855 return res;
1856 }
1857
1858
1859 /**
1860 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1861 * These coefficients can be generated with
1862 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1863 */
1864 const double lp_build_log2_polynomial[] = {
1865 #if LOG_POLY_DEGREE == 6
1866 3.11578814719469302614,
1867 -3.32419399085241980044,
1868 2.59883907202499966007,
1869 -1.23152682416275988241,
1870 0.318212422185251071475,
1871 -0.0344359067839062357313
1872 #elif LOG_POLY_DEGREE == 5
1873 2.8882704548164776201,
1874 -2.52074962577807006663,
1875 1.48116647521213171641,
1876 -0.465725644288844778798,
1877 0.0596515482674574969533
1878 #elif LOG_POLY_DEGREE == 4
1879 2.61761038894603480148,
1880 -1.75647175389045657003,
1881 0.688243882994381274313,
1882 -0.107254423828329604454
1883 #elif LOG_POLY_DEGREE == 3
1884 2.28330284476918490682,
1885 -1.04913055217340124191,
1886 0.204446009836232697516
1887 #else
1888 #error
1889 #endif
1890 };
1891
1892
1893 /**
1894 * See http://www.devmaster.net/forums/showthread.php?p=43580
1895 */
1896 void
1897 lp_build_log2_approx(struct lp_build_context *bld,
1898 LLVMValueRef x,
1899 LLVMValueRef *p_exp,
1900 LLVMValueRef *p_floor_log2,
1901 LLVMValueRef *p_log2)
1902 {
1903 const struct lp_type type = bld->type;
1904 LLVMTypeRef vec_type = lp_build_vec_type(type);
1905 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1906
1907 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
1908 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
1909 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1910
1911 LLVMValueRef i = NULL;
1912 LLVMValueRef exp = NULL;
1913 LLVMValueRef mant = NULL;
1914 LLVMValueRef logexp = NULL;
1915 LLVMValueRef logmant = NULL;
1916 LLVMValueRef res = NULL;
1917
1918 if(p_exp || p_floor_log2 || p_log2) {
1919 /* TODO: optimize the constant case */
1920 if(LLVMIsConstant(x))
1921 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1922 __FUNCTION__);
1923
1924 assert(type.floating && type.width == 32);
1925
1926 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1927
1928 /* exp = (float) exponent(x) */
1929 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1930 }
1931
1932 if(p_floor_log2 || p_log2) {
1933 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
1934 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
1935 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1936 }
1937
1938 if(p_log2) {
1939 /* mant = (float) mantissa(x) */
1940 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1941 mant = LLVMBuildOr(bld->builder, mant, one, "");
1942 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1943
1944 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1945 Elements(lp_build_log2_polynomial));
1946
1947 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
1948 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1949
1950 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1951 }
1952
1953 if(p_exp) {
1954 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
1955 *p_exp = exp;
1956 }
1957
1958 if(p_floor_log2)
1959 *p_floor_log2 = logexp;
1960
1961 if(p_log2)
1962 *p_log2 = res;
1963 }
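
/*
 * Scalar sketch of the decomposition above, assuming IEEE-754 single
 * precision: x = 2^exp * mant with mant in [1, 2), so
 * log2(x) = exp + log2(mant). E.g. x = 6.0f is 0x40c00000:
 * ((0x40c00000 & 0x7f800000) >> 23) - 127 = 2, and ORing the mantissa
 * bits with the bits of 1.0 gives mant = 1.5, so log2(6) = 2 + log2(1.5)
 * ~= 2.585, where log2(mant) is the polynomial times (mant - 1).
 */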
1964
1965
1966 LLVMValueRef
1967 lp_build_log2(struct lp_build_context *bld,
1968 LLVMValueRef x)
1969 {
1970 LLVMValueRef res;
1971 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1972 return res;
1973 }