gallivm: Silence warning.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - we often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_init.h" /* for lp_build_engine */
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
62
63
64 /**
65 * Generate min(a, b)
66 * No checks for special-case values of a or b (such as 0 or 1) are done.
67 */
68 static LLVMValueRef
69 lp_build_min_simple(struct lp_build_context *bld,
70 LLVMValueRef a,
71 LLVMValueRef b)
72 {
73 const struct lp_type type = bld->type;
74 const char *intrinsic = NULL;
75 LLVMValueRef cond;
76
77 /* TODO: optimize the constant case */
78
79 if(type.width * type.length == 128) {
80 if(type.floating) {
81 if(type.width == 32 && util_cpu_caps.has_sse)
82 intrinsic = "llvm.x86.sse.min.ps";
83 if(type.width == 64 && util_cpu_caps.has_sse2)
84 intrinsic = "llvm.x86.sse2.min.pd";
85 }
86 else {
87 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
88 intrinsic = "llvm.x86.sse2.pminu.b";
89 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
90 intrinsic = "llvm.x86.sse41.pminsb";
91 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
92 intrinsic = "llvm.x86.sse41.pminuw";
93 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
94 intrinsic = "llvm.x86.sse2.pmins.w";
95 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminud";
97 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
98 intrinsic = "llvm.x86.sse41.pminsd";
99 }
100 }
101
102 if(intrinsic)
103 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
104
105 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
106 return lp_build_select(bld, cond, a, b);
107 }
108
109
110 /**
111 * Generate max(a, b)
112 * No checks for special-case values of a or b (such as 0 or 1) are done.
113 */
114 static LLVMValueRef
115 lp_build_max_simple(struct lp_build_context *bld,
116 LLVMValueRef a,
117 LLVMValueRef b)
118 {
119 const struct lp_type type = bld->type;
120 const char *intrinsic = NULL;
121 LLVMValueRef cond;
122
123 /* TODO: optimize the constant case */
124
125 if(type.width * type.length == 128) {
126 if(type.floating) {
127 if(type.width == 32 && util_cpu_caps.has_sse)
128 intrinsic = "llvm.x86.sse.max.ps";
129 if(type.width == 64 && util_cpu_caps.has_sse2)
130 intrinsic = "llvm.x86.sse2.max.pd";
131 }
132 else {
133 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
134 intrinsic = "llvm.x86.sse2.pmaxu.b";
135 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
136 intrinsic = "llvm.x86.sse41.pmaxsb";
137 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
138 intrinsic = "llvm.x86.sse41.pmaxuw";
139 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
140 intrinsic = "llvm.x86.sse2.pmaxs.w";
141 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
142 intrinsic = "llvm.x86.sse41.pmaxud";
143 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
144 intrinsic = "llvm.x86.sse41.pmaxsd";
145 }
146 }
147
148 if(intrinsic)
149 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
150
151 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
152 return lp_build_select(bld, cond, a, b);
153 }
154
155
156 /**
157 * Generate 1 - a, or ~a depending on bld->type.
158 */
159 LLVMValueRef
160 lp_build_comp(struct lp_build_context *bld,
161 LLVMValueRef a)
162 {
163 const struct lp_type type = bld->type;
164
165 if(a == bld->one)
166 return bld->zero;
167 if(a == bld->zero)
168 return bld->one;
169
170 if(type.norm && !type.floating && !type.fixed && !type.sign) {
171 if(LLVMIsConstant(a))
172 return LLVMConstNot(a);
173 else
174 return LLVMBuildNot(bld->builder, a, "");
175 }
176
177 if(LLVMIsConstant(a))
178 return LLVMConstSub(bld->one, a);
179 else
180 return LLVMBuildSub(bld->builder, bld->one, a, "");
181 }
182
183
184 /**
185 * Generate a + b
186 */
187 LLVMValueRef
188 lp_build_add(struct lp_build_context *bld,
189 LLVMValueRef a,
190 LLVMValueRef b)
191 {
192 const struct lp_type type = bld->type;
193 LLVMValueRef res;
194
195 if(a == bld->zero)
196 return b;
197 if(b == bld->zero)
198 return a;
199 if(a == bld->undef || b == bld->undef)
200 return bld->undef;
201
202 if(bld->type.norm) {
203 const char *intrinsic = NULL;
204
205 if(a == bld->one || b == bld->one)
206 return bld->one;
207
208 if(util_cpu_caps.has_sse2 &&
209 type.width * type.length == 128 &&
210 !type.floating && !type.fixed) {
211 if(type.width == 8)
212 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
213 if(type.width == 16)
214 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
215 }
216
217 if(intrinsic)
218 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
219 }
220
221 if(LLVMIsConstant(a) && LLVMIsConstant(b))
222 res = LLVMConstAdd(a, b);
223 else
224 res = LLVMBuildAdd(bld->builder, a, b, "");
225
226 /* clamp to ceiling of 1.0 */
227 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
228 res = lp_build_min_simple(bld, res, bld->one);
229
230 /* XXX clamp to floor of -1 or 0??? */
231
232 return res;
233 }
234
235
236 /** Return the sum of the elements of a */
237 LLVMValueRef
238 lp_build_sum_vector(struct lp_build_context *bld,
239 LLVMValueRef a)
240 {
241 const struct lp_type type = bld->type;
242 LLVMValueRef index, res;
243 unsigned i;
244
245 if (a == bld->zero)
246 return bld->zero;
247 if (a == bld->undef)
248 return bld->undef;
249 assert(type.length > 1);
250
251 assert(!bld->type.norm);
252
253 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
254 res = LLVMBuildExtractElement(bld->builder, a, index, "");
255
256 for (i = 1; i < type.length; i++) {
257 index = LLVMConstInt(LLVMInt32Type(), i, 0);
258 res = LLVMBuildAdd(bld->builder, res,
259 LLVMBuildExtractElement(bld->builder, a, index, ""),
260 "");
261 }
262
263 return res;
264 }
265
266
267 /**
268 * Generate a - b
269 */
270 LLVMValueRef
271 lp_build_sub(struct lp_build_context *bld,
272 LLVMValueRef a,
273 LLVMValueRef b)
274 {
275 const struct lp_type type = bld->type;
276 LLVMValueRef res;
277
278 if(b == bld->zero)
279 return a;
280 if(a == bld->undef || b == bld->undef)
281 return bld->undef;
282 if(a == b)
283 return bld->zero;
284
285 if(bld->type.norm) {
286 const char *intrinsic = NULL;
287
288 if(b == bld->one)
289 return bld->zero;
290
291 if(util_cpu_caps.has_sse2 &&
292 type.width * type.length == 128 &&
293 !type.floating && !type.fixed) {
294 if(type.width == 8)
295 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
296 if(type.width == 16)
297 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
298 }
299
300 if(intrinsic)
301 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
302 }
303
304 if(LLVMIsConstant(a) && LLVMIsConstant(b))
305 res = LLVMConstSub(a, b);
306 else
307 res = LLVMBuildSub(bld->builder, a, b, "");
308
309 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
310 res = lp_build_max_simple(bld, res, bld->zero);
311
312 return res;
313 }
314
315
316 /**
317 * Normalized 8bit multiplication.
318 *
319 * - alpha plus one
320 *
321 * makes the following approximation to the division (Sree)
322 *
323 * a*b/255 ~= (a*(b + 1)) >> 8
324 *
325 * which is the fastest method that satisfies the following OpenGL criteria
326 *
327 * 0*0 = 0 and 255*255 = 255
328 *
329 * - geometric series
330 *
331 * takes the geometric series approximation to the division
332 *
333 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
334 *
335 * in this case just the first two terms to fit in 16bit arithmetic
336 *
337 * t/255 ~= (t + (t >> 8)) >> 8
338 *
339 * note that just by itself it doesn't satisfy the OpenGL criteria, as
340 * 255*255 yields 254, so either the special case b = 255 must be handled
341 * or rounding must be used
342 *
343 * - geometric series plus rounding
344 *
345 * when using the geometric series division, instead of truncating the
346 * result, use rounding in the approximation (Jim Blinn)
347 *
348 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
349 *
350 * achieving exact results
351 *
352 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
353 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
354 * @sa Michael Herf, The "double blend trick", May 2000,
355 * http://www.stereopsis.com/doubleblend.html
356 */
357 static LLVMValueRef
358 lp_build_mul_u8n(LLVMBuilderRef builder,
359 struct lp_type i16_type,
360 LLVMValueRef a, LLVMValueRef b)
361 {
362 LLVMValueRef c8;
363 LLVMValueRef ab;
364
365 c8 = lp_build_const_int_vec(i16_type, 8);
366
367 #if 0
368
369 /* a*b/255 ~= (a*(b + 1)) >> 8 */
370 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
371 ab = LLVMBuildMul(builder, a, b, "");
372
373 #else
374
375 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
376 ab = LLVMBuildMul(builder, a, b, "");
377 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
378 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
379
380 #endif
381
382 ab = LLVMBuildLShr(builder, ab, c8, "");
383
384 return ab;
385 }
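
/*
 * Scalar sketch of the "geometric series plus rounding" scheme used above,
 * for illustration only (hence the #if 0); mul_u8n_ref is a hypothetical
 * name, not part of this module.
 */
#if 0
static uint8_t
mul_u8n_ref(uint8_t a, uint8_t b)
{
   uint16_t t = (uint16_t)a * (uint16_t)b;
   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8, exact for all 8bit inputs:
    * e.g. 255*255 = 65025, and (65025 + 254 + 128) >> 8 = 255 */
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
}
#endif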
386
387
388 /**
389 * Generate a * b
390 */
391 LLVMValueRef
392 lp_build_mul(struct lp_build_context *bld,
393 LLVMValueRef a,
394 LLVMValueRef b)
395 {
396 const struct lp_type type = bld->type;
397 LLVMValueRef shift;
398 LLVMValueRef res;
399
400 if(a == bld->zero)
401 return bld->zero;
402 if(a == bld->one)
403 return b;
404 if(b == bld->zero)
405 return bld->zero;
406 if(b == bld->one)
407 return a;
408 if(a == bld->undef || b == bld->undef)
409 return bld->undef;
410
411 if(!type.floating && !type.fixed && type.norm) {
412 if(type.width == 8) {
413 struct lp_type i16_type = lp_wider_type(type);
414 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
415
416 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
417 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
418
419 /* PMULLW, PSRLW, PADDW */
420 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
421 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
422
423 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
424
425 return ab;
426 }
427
428 /* FIXME */
429 assert(0);
430 }
431
432 if(type.fixed)
433 shift = lp_build_const_int_vec(type, type.width/2);
434 else
435 shift = NULL;
436
437 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
438 res = LLVMConstMul(a, b);
439 if(shift) {
440 if(type.sign)
441 res = LLVMConstAShr(res, shift);
442 else
443 res = LLVMConstLShr(res, shift);
444 }
445 }
446 else {
447 res = LLVMBuildMul(bld->builder, a, b, "");
448 if(shift) {
449 if(type.sign)
450 res = LLVMBuildAShr(bld->builder, res, shift, "");
451 else
452 res = LLVMBuildLShr(bld->builder, res, shift, "");
453 }
454 }
455
456 return res;
457 }
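
/*
 * Scalar analogue of the fixed point path above, for illustration only:
 * two w/2.w/2 fixed point numbers are multiplied and the product shifted
 * right by width/2. Note the real code multiplies within the lane width,
 * so the widening below is an assumption made for clarity.
 */
#if 0
static int16_t
mul_fixed_8_8_ref(int16_t a, int16_t b)   /* hypothetical 8.8 example */
{
   return (int16_t)(((int32_t)a * (int32_t)b) >> 8);
}
#endif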
458
459
460 /**
461 * Small vector x scale multiplication optimization.
462 */
463 LLVMValueRef
464 lp_build_mul_imm(struct lp_build_context *bld,
465 LLVMValueRef a,
466 int b)
467 {
468 LLVMValueRef factor;
469
470 if(b == 0)
471 return bld->zero;
472
473 if(b == 1)
474 return a;
475
476 if(b == -1)
477 return LLVMBuildNeg(bld->builder, a, "");
478
479 if(b == 2 && bld->type.floating)
480 return lp_build_add(bld, a, a);
481
482 if(util_is_pot(b)) {
483 unsigned shift = ffs(b) - 1;
484
485 if(bld->type.floating) {
486 #if 0
487 /*
488 * Power of two multiplication by directly manipulating the mantissa.
489 *
490 * XXX: This might not be always faster, it will introduce a small error
491 * for multiplication by zero, and it will produce wrong results
492 * for Inf and NaN.
493 */
494 unsigned mantissa = lp_mantissa(bld->type);
495 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
496 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
497 a = LLVMBuildAdd(bld->builder, a, factor, "");
498 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
499 return a;
500 #endif
501 }
502 else {
503 factor = lp_build_const_vec(bld->type, shift);
504 return LLVMBuildShl(bld->builder, a, factor, "");
505 }
506 }
507
508 factor = lp_build_const_vec(bld->type, (double)b);
509 return lp_build_mul(bld, a, factor);
510 }
511
512
513 /**
514 * Generate a / b
515 */
516 LLVMValueRef
517 lp_build_div(struct lp_build_context *bld,
518 LLVMValueRef a,
519 LLVMValueRef b)
520 {
521 const struct lp_type type = bld->type;
522
523 if(a == bld->zero)
524 return bld->zero;
525 if(a == bld->one)
526 return lp_build_rcp(bld, b);
527 if(b == bld->zero)
528 return bld->undef;
529 if(b == bld->one)
530 return a;
531 if(a == bld->undef || b == bld->undef)
532 return bld->undef;
533
534 if(LLVMIsConstant(a) && LLVMIsConstant(b))
535 return LLVMConstFDiv(a, b);
536
537 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
538 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
539
540 return LLVMBuildFDiv(bld->builder, a, b, "");
541 }
542
543
544 /**
545 * Linear interpolation.
546 *
547 * This also works for integer values with a few caveats.
548 *
549 * @sa http://www.stereopsis.com/doubleblend.html
550 */
551 LLVMValueRef
552 lp_build_lerp(struct lp_build_context *bld,
553 LLVMValueRef x,
554 LLVMValueRef v0,
555 LLVMValueRef v1)
556 {
557 LLVMValueRef delta;
558 LLVMValueRef res;
559
560 delta = lp_build_sub(bld, v1, v0);
561
562 res = lp_build_mul(bld, x, delta);
563
564 res = lp_build_add(bld, v0, res);
565
566 if(bld->type.fixed)
567 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
568 * but it will be wrong for other uses. Basically we need a more
569 * powerful lp_type, capable of further distinguishing the values
570 * interpretation from the value storage. */
571 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
572
573 return res;
574 }
575
576
577 LLVMValueRef
578 lp_build_lerp_2d(struct lp_build_context *bld,
579 LLVMValueRef x,
580 LLVMValueRef y,
581 LLVMValueRef v00,
582 LLVMValueRef v01,
583 LLVMValueRef v10,
584 LLVMValueRef v11)
585 {
586 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
587 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
588 return lp_build_lerp(bld, y, v0, v1);
589 }
590
591
592 /**
593 * Generate min(a, b)
594 * Do checks for special cases.
595 */
596 LLVMValueRef
597 lp_build_min(struct lp_build_context *bld,
598 LLVMValueRef a,
599 LLVMValueRef b)
600 {
601 if(a == bld->undef || b == bld->undef)
602 return bld->undef;
603
604 if(a == b)
605 return a;
606
607 if(bld->type.norm) {
608 if(a == bld->zero || b == bld->zero)
609 return bld->zero;
610 if(a == bld->one)
611 return b;
612 if(b == bld->one)
613 return a;
614 }
615
616 return lp_build_min_simple(bld, a, b);
617 }
618
619
620 /**
621 * Generate max(a, b)
622 * Do checks for special cases.
623 */
624 LLVMValueRef
625 lp_build_max(struct lp_build_context *bld,
626 LLVMValueRef a,
627 LLVMValueRef b)
628 {
629 if(a == bld->undef || b == bld->undef)
630 return bld->undef;
631
632 if(a == b)
633 return a;
634
635 if(bld->type.norm) {
636 if(a == bld->one || b == bld->one)
637 return bld->one;
638 if(a == bld->zero)
639 return b;
640 if(b == bld->zero)
641 return a;
642 }
643
644 return lp_build_max_simple(bld, a, b);
645 }
646
647
648 /**
649 * Generate clamp(a, min, max)
650 * Do checks for special cases.
651 */
652 LLVMValueRef
653 lp_build_clamp(struct lp_build_context *bld,
654 LLVMValueRef a,
655 LLVMValueRef min,
656 LLVMValueRef max)
657 {
658 a = lp_build_min(bld, a, max);
659 a = lp_build_max(bld, a, min);
660 return a;
661 }
662
663
664 /**
665 * Generate abs(a)
666 */
667 LLVMValueRef
668 lp_build_abs(struct lp_build_context *bld,
669 LLVMValueRef a)
670 {
671 const struct lp_type type = bld->type;
672 LLVMTypeRef vec_type = lp_build_vec_type(type);
673
674 if(!type.sign)
675 return a;
676
677 if(type.floating) {
678 /* Mask out the sign bit */
679 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
680 unsigned long long absMask = ~(1ULL << (type.width - 1));
681 LLVMValueRef mask = lp_build_const_int_vec(type, absMask);
682 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
683 a = LLVMBuildAnd(bld->builder, a, mask, "");
684 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
685 return a;
686 }
687
688 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
689 switch(type.width) {
690 case 8:
691 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
692 case 16:
693 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
694 case 32:
695 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
696 }
697 }
698
699 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
700 }
701
702
703 LLVMValueRef
704 lp_build_negate(struct lp_build_context *bld,
705 LLVMValueRef a)
706 {
707 return LLVMBuildNeg(bld->builder, a, "");
708 }
709
710
711 /** Return -1, 0 or +1 depending on the sign of a */
712 LLVMValueRef
713 lp_build_sgn(struct lp_build_context *bld,
714 LLVMValueRef a)
715 {
716 const struct lp_type type = bld->type;
717 LLVMValueRef cond;
718 LLVMValueRef res;
719
720 /* Handle non-zero case */
721 if(!type.sign) {
722 /* if not zero then sign must be positive */
723 res = bld->one;
724 }
725 else if(type.floating) {
726 LLVMTypeRef vec_type;
727 LLVMTypeRef int_type;
728 LLVMValueRef mask;
729 LLVMValueRef sign;
730 LLVMValueRef one;
731 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
732
733 int_type = lp_build_int_vec_type(type);
734 vec_type = lp_build_vec_type(type);
735 mask = lp_build_const_int_vec(type, maskBit);
736
737 /* Take the sign bit of a and OR it onto the constant 1.0 */
738 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
739 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
740 one = LLVMConstBitCast(bld->one, int_type);
741 res = LLVMBuildOr(bld->builder, sign, one, "");
742 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
743 }
744 else
745 {
746 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
747 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
748 res = lp_build_select(bld, cond, bld->one, minus_one);
749 }
750
751 /* Handle zero */
752 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
753 res = lp_build_select(bld, cond, bld->zero, res);
754
755 return res;
756 }
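
/*
 * Scalar sketch of the floating point path above, for illustration only:
 * OR the sign bit of 'a' onto the bits of 1.0, then handle zero.
 */
#if 0
static float
sgn_ref(float a)
{
   union { float f; uint32_t i; } one, in;
   in.f = a;
   one.f = 1.0f;
   one.i |= in.i & 0x80000000;      /* +1.0 or -1.0, matching a's sign */
   return a == 0.0f ? 0.0f : one.f;
}
#endif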
757
758
759 /**
760 * Set the sign of float vector 'a' according to 'sign'.
761 * If sign==0, return abs(a).
762 * If sign==1, return -abs(a).
763 * Other values for sign produce undefined results.
764 */
765 LLVMValueRef
766 lp_build_set_sign(struct lp_build_context *bld,
767 LLVMValueRef a, LLVMValueRef sign)
768 {
769 const struct lp_type type = bld->type;
770 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
771 LLVMTypeRef vec_type = lp_build_vec_type(type);
772 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
773 LLVMValueRef mask = lp_build_const_int_vec(type,
774 ~((unsigned long long) 1 << (type.width - 1)));
775 LLVMValueRef val, res;
776
777 assert(type.floating);
778
779 /* val = reinterpret_cast<int>(a) */
780 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
781 /* val = val & mask */
782 val = LLVMBuildAnd(bld->builder, val, mask, "");
783 /* sign = sign << shift */
784 sign = LLVMBuildShl(bld->builder, sign, shift, "");
785 /* res = val | sign */
786 res = LLVMBuildOr(bld->builder, val, sign, "");
787 /* res = reinterpret_cast<float>(res) */
788 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
789
790 return res;
791 }
792
793
794 /**
795 * Convert vector of (or scalar) int to vector of (or scalar) float.
796 */
797 LLVMValueRef
798 lp_build_int_to_float(struct lp_build_context *bld,
799 LLVMValueRef a)
800 {
801 const struct lp_type type = bld->type;
802 LLVMTypeRef vec_type = lp_build_vec_type(type);
803
804 assert(type.floating);
805
806 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
807 }
808
809
810
811 enum lp_build_round_sse41_mode
812 {
813 LP_BUILD_ROUND_SSE41_NEAREST = 0,
814 LP_BUILD_ROUND_SSE41_FLOOR = 1,
815 LP_BUILD_ROUND_SSE41_CEIL = 2,
816 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
817 };
818
819
820 static INLINE LLVMValueRef
821 lp_build_round_sse41(struct lp_build_context *bld,
822 LLVMValueRef a,
823 enum lp_build_round_sse41_mode mode)
824 {
825 const struct lp_type type = bld->type;
826 LLVMTypeRef vec_type = lp_build_vec_type(type);
827 const char *intrinsic;
828
829 assert(type.floating);
830 assert(type.width*type.length == 128);
831 assert(lp_check_value(type, a));
832 assert(util_cpu_caps.has_sse4_1);
833
834 switch(type.width) {
835 case 32:
836 intrinsic = "llvm.x86.sse41.round.ps";
837 break;
838 case 64:
839 intrinsic = "llvm.x86.sse41.round.pd";
840 break;
841 default:
842 assert(0);
843 return bld->undef;
844 }
845
846 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
847 LLVMConstInt(LLVMInt32Type(), mode, 0));
848 }
849
850
851 LLVMValueRef
852 lp_build_trunc(struct lp_build_context *bld,
853 LLVMValueRef a)
854 {
855 const struct lp_type type = bld->type;
856
857 assert(type.floating);
858 assert(lp_check_value(type, a));
859
860 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
861 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
862 else {
863 LLVMTypeRef vec_type = lp_build_vec_type(type);
864 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
865 LLVMValueRef res;
866 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
867 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
868 return res;
869 }
870 }
871
872
873 LLVMValueRef
874 lp_build_round(struct lp_build_context *bld,
875 LLVMValueRef a)
876 {
877 const struct lp_type type = bld->type;
878
879 assert(type.floating);
880 assert(lp_check_value(type, a));
881
882 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
883 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
884 else {
885 LLVMTypeRef vec_type = lp_build_vec_type(type);
886 LLVMValueRef res;
887 res = lp_build_iround(bld, a);
888 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
889 return res;
890 }
891 }
892
893
894 LLVMValueRef
895 lp_build_floor(struct lp_build_context *bld,
896 LLVMValueRef a)
897 {
898 const struct lp_type type = bld->type;
899
900 assert(type.floating);
901 assert(lp_check_value(type, a));
902
903 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
904 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
905 else {
906 LLVMTypeRef vec_type = lp_build_vec_type(type);
907 LLVMValueRef res;
908 res = lp_build_ifloor(bld, a);
909 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
910 return res;
911 }
912 }
913
914
915 LLVMValueRef
916 lp_build_ceil(struct lp_build_context *bld,
917 LLVMValueRef a)
918 {
919 const struct lp_type type = bld->type;
920
921 assert(type.floating);
922 assert(lp_check_value(type, a));
923
924 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
925 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
926 else {
927 LLVMTypeRef vec_type = lp_build_vec_type(type);
928 LLVMValueRef res;
929 res = lp_build_iceil(bld, a);
930 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
931 return res;
932 }
933 }
934
935
936 /**
937 * Return fractional part of 'a' computed as a - floor(a).
938 * Typically used in texture coord arithmetic.
939 */
940 LLVMValueRef
941 lp_build_fract(struct lp_build_context *bld,
942 LLVMValueRef a)
943 {
944 assert(bld->type.floating);
945 return lp_build_sub(bld, a, lp_build_floor(bld, a));
946 }
947
948
949 /**
950 * Convert to integer through whichever rounding method is fastest,
951 * typically truncating toward zero.
952 */
953 LLVMValueRef
954 lp_build_itrunc(struct lp_build_context *bld,
955 LLVMValueRef a)
956 {
957 const struct lp_type type = bld->type;
958 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
959
960 assert(type.floating);
961 assert(lp_check_value(type, a));
962
963 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
964 }
965
966
967 /**
968 * Convert float[] to int[] with round().
969 */
970 LLVMValueRef
971 lp_build_iround(struct lp_build_context *bld,
972 LLVMValueRef a)
973 {
974 const struct lp_type type = bld->type;
975 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
976 LLVMValueRef res;
977
978 assert(type.floating);
979
980 assert(lp_check_value(type, a));
981
982 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
983 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
984 }
985 else {
986 LLVMTypeRef vec_type = lp_build_vec_type(type);
987 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
988 LLVMValueRef sign;
989 LLVMValueRef half;
990
991 /* get sign bit */
992 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
993 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
994
995 /* sign * 0.5 */
996 half = lp_build_const_vec(type, 0.5);
997 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
998 half = LLVMBuildOr(bld->builder, sign, half, "");
999 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1000
1001 res = LLVMBuildAdd(bld->builder, a, half, "");
1002 }
1003
1004 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1005
1006 return res;
1007 }
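
/*
 * Scalar sketch of the non-SSE4.1 path above, for illustration only:
 * add 0.5 carrying the sign of 'a', then let FPToSI truncate.
 */
#if 0
static int
iround_ref(float a)
{
   float half = a < 0.0f ? -0.5f : 0.5f;   /* 0.5 with a's sign bit */
   return (int)(a + half);                 /* truncation toward zero */
}
#endif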
1008
1009
1010 /**
1011 * Convert float[] to int[] with floor().
1012 */
1013 LLVMValueRef
1014 lp_build_ifloor(struct lp_build_context *bld,
1015 LLVMValueRef a)
1016 {
1017 const struct lp_type type = bld->type;
1018 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1019 LLVMValueRef res;
1020
1021 assert(type.floating);
1022 assert(lp_check_value(type, a));
1023
1024 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1025 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1026 }
1027 else {
1028 /* Replicate the sign bit of a and use it to select a -0.99999(9)f offset */
1029 LLVMTypeRef vec_type = lp_build_vec_type(type);
1030 unsigned mantissa = lp_mantissa(type);
1031 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1032 LLVMValueRef sign;
1033 LLVMValueRef offset;
1034
1035 /* sign = a < 0 ? ~0 : 0 */
1036 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1037 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1038 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
1039 lp_build_name(sign, "floor.sign");
1040
1041 /* offset = -0.99999(9)f */
1042 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
1043 offset = LLVMConstBitCast(offset, int_vec_type);
1044
1045 /* offset = a < 0 ? -0.99999(9)f : 0.0f */
1046 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1047 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
1048 lp_build_name(offset, "floor.offset");
1049
1050 res = LLVMBuildAdd(bld->builder, a, offset, "");
1051 lp_build_name(res, "floor.res");
1052 }
1053
1054 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1055 lp_build_name(res, "floor");
1056
1057 return res;
1058 }
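
/*
 * Scalar sketch of the non-SSE4.1 path above, for illustration only:
 * bias negative inputs by -0.99999(9)f so that truncation gives floor().
 */
#if 0
static int
ifloor_ref(float a)
{
   /* (2^23 - 1) / 2^23, the largest representable value below 1.0 */
   float offset = a < 0.0f ? -(float)((1 << 23) - 1)/(float)(1 << 23) : 0.0f;
   return (int)(a + offset);   /* truncation toward zero */
}
#endif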
1059
1060
1061 LLVMValueRef
1062 lp_build_iceil(struct lp_build_context *bld,
1063 LLVMValueRef a)
1064 {
1065 const struct lp_type type = bld->type;
1066 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1067 LLVMValueRef res;
1068
1069 assert(type.floating);
1070 assert(lp_check_value(type, a));
1071
1072 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1073 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1074 }
1075 else {
1076 /* TODO: mimic lp_build_ifloor() here */
1077 assert(0);
1078 res = bld->undef;
1079 }
1080
1081 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1082
1083 return res;
1084 }
1085
1086
1087 LLVMValueRef
1088 lp_build_sqrt(struct lp_build_context *bld,
1089 LLVMValueRef a)
1090 {
1091 const struct lp_type type = bld->type;
1092 LLVMTypeRef vec_type = lp_build_vec_type(type);
1093 char intrinsic[32];
1094
1095 /* TODO: optimize the constant case */
1097
1098 assert(type.floating);
1099 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1100
1101 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1102 }
1103
1104
1105 LLVMValueRef
1106 lp_build_rcp(struct lp_build_context *bld,
1107 LLVMValueRef a)
1108 {
1109 const struct lp_type type = bld->type;
1110
1111 if(a == bld->zero)
1112 return bld->undef;
1113 if(a == bld->one)
1114 return bld->one;
1115 if(a == bld->undef)
1116 return bld->undef;
1117
1118 assert(type.floating);
1119
1120 if(LLVMIsConstant(a))
1121 return LLVMConstFDiv(bld->one, a);
1122
1123 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1124 /*
1125 * XXX: Added precision is not always necessary, so only enable this
1126 * when we have a better system in place to track minimum precision.
1127 */
1128
1129 #if 0
1130 /*
1131 * Do one Newton-Raphson step to improve precision:
1132 *
1133 * x1 = (2 - a * rcp(a)) * rcp(a)
1134 */
1135
1136 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1137 LLVMValueRef rcp_a;
1138 LLVMValueRef res;
1139
1140 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1141
1142 res = LLVMBuildMul(bld->builder, a, rcp_a, "");
1143 res = LLVMBuildSub(bld->builder, two, res, "");
1144 res = LLVMBuildMul(bld->builder, res, rcp_a, "");
1145
1146 return res;
1147 #else
1148 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1149 #endif
1150 }
1151
1152 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1153 }
1154
1155
1156 /**
1157 * Generate 1/sqrt(a)
1158 */
1159 LLVMValueRef
1160 lp_build_rsqrt(struct lp_build_context *bld,
1161 LLVMValueRef a)
1162 {
1163 const struct lp_type type = bld->type;
1164
1165 assert(type.floating);
1166
1167 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1168 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1169
1170 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1171 }
1172
1173
1174 #ifdef PIPE_OS_WINDOWS
1175
1176 /*
1177 * XXX: The X86 backend translates llvm.cos.v4f32 into 4 calls to the
1178 * CRT's cosf(), which is inefficient, and the CRT linkage does not work
1179 * on Windows, causing a segmentation fault.
1180 *
1181 * XXX: With LLVM 2.7 both schemes cause an assertion failure.
1182 */
1183 static LLVMValueRef
1184 lp_build_sincos(struct lp_build_context *bld,
1185 const char *name,
1186 float (*func)(float),
1187 LLVMValueRef a)
1188 {
1189 LLVMModuleRef module =
1190 LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld->builder)));
1191 LLVMValueRef function;
1192 LLVMValueRef res;
1193 unsigned i;
1194
1195 assert(bld->type.floating);
1196 assert(bld->type.width == 32);
1197
1198 function = LLVMGetNamedFunction(module, name);
1199 if (!function) {
1200 LLVMTypeRef ret_type;
1201 LLVMTypeRef arg_types[1];
1202 LLVMTypeRef function_type;
1203
1204 ret_type = LLVMFloatType();
1205 arg_types[0] = LLVMFloatType();
1206 function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
1207 function = LLVMAddFunction(module, name, function_type);
1208
1209 LLVMSetFunctionCallConv(function, LLVMCCallConv);
1210 LLVMSetLinkage(function, LLVMPrivateLinkage);
1211
1212 assert(LLVMIsDeclaration(function));
1213
1214 LLVMAddGlobalMapping(lp_build_engine, function, func);
1215 }
1216
1217 res = bld->undef;
1218
1219 for (i = 0; i < bld->type.length; ++i) {
1220 LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
1221 LLVMValueRef args[1];
1222 LLVMValueRef tmp;
1223
1224 args[0] = LLVMBuildExtractElement(bld->builder, a, index, "");
1225
1226 tmp = LLVMBuildCall(bld->builder, function, args, Elements(args), "");
1227
1228 res = LLVMBuildInsertElement(bld->builder, res, tmp, index, "");
1229 }
1230
1231 return res;
1232 }
1233
1234 static float c_cosf( float f )
1235 {
1236 return (float) cos( (double) f );
1237 }
1238
1239 static float c_sinf( float f )
1240 {
1241 return (float) sin( (double) f );
1242 }
1243
1244 LLVMValueRef
1245 lp_build_cos(struct lp_build_context *bld,
1246 LLVMValueRef a)
1247 {
1248 return lp_build_sincos(bld, "cosf", &c_cosf, a);
1249 }
1250
1251 LLVMValueRef
1252 lp_build_sin(struct lp_build_context *bld,
1253 LLVMValueRef a)
1254 {
1255 return lp_build_sincos(bld, "sinf", &c_sinf, a);
1256 }
1257
1258 #else /* !PIPE_OS_WINDOWS */
1259
1260 /**
1261 * Generate cos(a)
1262 */
1263 LLVMValueRef
1264 lp_build_cos(struct lp_build_context *bld,
1265 LLVMValueRef a)
1266 {
1267 const struct lp_type type = bld->type;
1268 LLVMTypeRef vec_type = lp_build_vec_type(type);
1269 char intrinsic[32];
1270
1271 /* TODO: optimize the constant case */
1272
1273 assert(type.floating);
1274 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
1275
1276 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1277 }
1278
1279
1280 /**
1281 * Generate sin(a)
1282 */
1283 LLVMValueRef
1284 lp_build_sin(struct lp_build_context *bld,
1285 LLVMValueRef a)
1286 {
1287 const struct lp_type type = bld->type;
1288 LLVMTypeRef vec_type = lp_build_vec_type(type);
1289 char intrinsic[32];
1290
1291 /* TODO: optimize the constant case */
1292
1293 assert(type.floating);
1294 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
1295
1296 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1297 }
1298
1299 #endif /* !PIPE_OS_WINDOWS */
1300
1301
1302 /**
1303 * Generate pow(x, y)
1304 */
1305 LLVMValueRef
1306 lp_build_pow(struct lp_build_context *bld,
1307 LLVMValueRef x,
1308 LLVMValueRef y)
1309 {
1310 /* TODO: optimize the constant case */
1311 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1312 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1313 __FUNCTION__);
1314
1315 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1316 }
1317
1318
1319 /**
1320 * Generate exp(x)
1321 */
1322 LLVMValueRef
1323 lp_build_exp(struct lp_build_context *bld,
1324 LLVMValueRef x)
1325 {
1326 /* log2(e) = 1/log(2) */
1327 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1328
1329 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1330 }
1331
1332
1333 /**
1334 * Generate log(x)
1335 */
1336 LLVMValueRef
1337 lp_build_log(struct lp_build_context *bld,
1338 LLVMValueRef x)
1339 {
1340 /* log(2) */
1341 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1342
1343 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1344 }
1345
1346
1347 #define EXP_POLY_DEGREE 3
1348 #define LOG_POLY_DEGREE 5
1349
1350
1351 /**
1352 * Generate polynomial.
1353 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2], evaluated via Horner's scheme.
1354 */
1355 static LLVMValueRef
1356 lp_build_polynomial(struct lp_build_context *bld,
1357 LLVMValueRef x,
1358 const double *coeffs,
1359 unsigned num_coeffs)
1360 {
1361 const struct lp_type type = bld->type;
1362 LLVMValueRef res = NULL;
1363 unsigned i;
1364
1365 /* TODO: optimize the constant case */
1366 if(LLVMIsConstant(x))
1367 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1368 __FUNCTION__);
1369
1370 for (i = num_coeffs; i--; ) {
1371 LLVMValueRef coeff;
1372
1373 coeff = lp_build_const_vec(type, coeffs[i]);
1374
1375 if(res)
1376 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1377 else
1378 res = coeff;
1379 }
1380
1381 if(res)
1382 return res;
1383 else
1384 return bld->undef;
1385 }
1386
1387
1388 /**
1389 * Minimax polynomial fit of 2**x, in range [0, 1[
1390 */
1391 const double lp_build_exp2_polynomial[] = {
1392 #if EXP_POLY_DEGREE == 5
1393 0.999999999690134838155,
1394 0.583974334321735217258,
1395 0.164553105719676828492,
1396 0.0292811063701710962255,
1397 0.00354944426657875141846,
1398 0.000296253726543423377365
1399 #elif EXP_POLY_DEGREE == 4
1400 1.00000001502262084505,
1401 0.563586057338685991394,
1402 0.150436017652442413623,
1403 0.0243220604213317927308,
1404 0.0025359088446580436489
1405 #elif EXP_POLY_DEGREE == 3
1406 0.999925218562710312959,
1407 0.695833540494823811697,
1408 0.226067155427249155588,
1409 0.0780245226406372992967
1410 #elif EXP_POLY_DEGREE == 2
1411 1.00172476321474503578,
1412 0.657636275736077639316,
1413 0.33718943461968720704
1414 #else
1415 #error
1416 #endif
1417 };
1418
1419
1420 void
1421 lp_build_exp2_approx(struct lp_build_context *bld,
1422 LLVMValueRef x,
1423 LLVMValueRef *p_exp2_int_part,
1424 LLVMValueRef *p_frac_part,
1425 LLVMValueRef *p_exp2)
1426 {
1427 const struct lp_type type = bld->type;
1428 LLVMTypeRef vec_type = lp_build_vec_type(type);
1429 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1430 LLVMValueRef ipart = NULL;
1431 LLVMValueRef fpart = NULL;
1432 LLVMValueRef expipart = NULL;
1433 LLVMValueRef expfpart = NULL;
1434 LLVMValueRef res = NULL;
1435
1436 if(p_exp2_int_part || p_frac_part || p_exp2) {
1437 /* TODO: optimize the constant case */
1438 if(LLVMIsConstant(x))
1439 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1440 __FUNCTION__);
1441
1442 assert(type.floating && type.width == 32);
1443
1444 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1445 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1446
1447 /* ipart = floor(x) */
1448 ipart = lp_build_floor(bld, x);
1449
1450 /* fpart = x - ipart */
1451 fpart = LLVMBuildSub(bld->builder, x, ipart, "");
1452 }
1453
1454 if(p_exp2_int_part || p_exp2) {
1455 /* expipart = (float) (1 << ipart) */
1456 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1457 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1458 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1459 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1460 }
1461
1462 if(p_exp2) {
1463 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1464 Elements(lp_build_exp2_polynomial));
1465
1466 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1467 }
1468
1469 if(p_exp2_int_part)
1470 *p_exp2_int_part = expipart;
1471
1472 if(p_frac_part)
1473 *p_frac_part = fpart;
1474
1475 if(p_exp2)
1476 *p_exp2 = res;
1477 }
1478
1479
1480 LLVMValueRef
1481 lp_build_exp2(struct lp_build_context *bld,
1482 LLVMValueRef x)
1483 {
1484 LLVMValueRef res;
1485 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1486 return res;
1487 }
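
/*
 * Scalar sketch of lp_build_exp2_approx() above, for illustration only
 * (assumes EXP_POLY_DEGREE == 3 and 32bit IEEE floats): build 2^ipart by
 * stuffing the biased exponent field, evaluate the minimax polynomial on
 * the fractional part, and multiply the two.
 */
#if 0
static float
exp2_ref(float x)
{
   const double *c = lp_build_exp2_polynomial;
   union { float f; int32_t i; } expipart;
   int ipart = (int)floorf(x);
   float fpart = x - (float)ipart;
   float expfpart;

   expipart.i = (ipart + 127) << 23;   /* (float)(1 << ipart) */
   expfpart = c[0] + fpart*(c[1] + fpart*(c[2] + fpart*c[3]));
   return expipart.f * expfpart;
}
#endif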
1488
1489
1490 /**
1491 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1492 * These coefficients can be generated with
1493 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1494 */
1495 const double lp_build_log2_polynomial[] = {
1496 #if LOG_POLY_DEGREE == 6
1497 3.11578814719469302614,
1498 -3.32419399085241980044,
1499 2.59883907202499966007,
1500 -1.23152682416275988241,
1501 0.318212422185251071475,
1502 -0.0344359067839062357313
1503 #elif LOG_POLY_DEGREE == 5
1504 2.8882704548164776201,
1505 -2.52074962577807006663,
1506 1.48116647521213171641,
1507 -0.465725644288844778798,
1508 0.0596515482674574969533
1509 #elif LOG_POLY_DEGREE == 4
1510 2.61761038894603480148,
1511 -1.75647175389045657003,
1512 0.688243882994381274313,
1513 -0.107254423828329604454
1514 #elif LOG_POLY_DEGREE == 3
1515 2.28330284476918490682,
1516 -1.04913055217340124191,
1517 0.204446009836232697516
1518 #else
1519 #error
1520 #endif
1521 };
1522
1523
1524 /**
1525 * See http://www.devmaster.net/forums/showthread.php?p=43580
1526 */
1527 void
1528 lp_build_log2_approx(struct lp_build_context *bld,
1529 LLVMValueRef x,
1530 LLVMValueRef *p_exp,
1531 LLVMValueRef *p_floor_log2,
1532 LLVMValueRef *p_log2)
1533 {
1534 const struct lp_type type = bld->type;
1535 LLVMTypeRef vec_type = lp_build_vec_type(type);
1536 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1537
1538 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
1539 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
1540 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1541
1542 LLVMValueRef i = NULL;
1543 LLVMValueRef exp = NULL;
1544 LLVMValueRef mant = NULL;
1545 LLVMValueRef logexp = NULL;
1546 LLVMValueRef logmant = NULL;
1547 LLVMValueRef res = NULL;
1548
1549 if(p_exp || p_floor_log2 || p_log2) {
1550 /* TODO: optimize the constant case */
1551 if(LLVMIsConstant(x))
1552 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1553 __FUNCTION__);
1554
1555 assert(type.floating && type.width == 32);
1556
1557 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1558
1559 /* exp = (float) exponent(x) */
1560 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1561 }
1562
1563 if(p_floor_log2 || p_log2) {
1564 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
1565 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
1566 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1567 }
1568
1569 if(p_log2) {
1570 /* mant = (float) mantissa(x) */
1571 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1572 mant = LLVMBuildOr(bld->builder, mant, one, "");
1573 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1574
1575 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1576 Elements(lp_build_log2_polynomial));
1577
1578 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
1579 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1580
1581 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1582 }
1583
1584 if(p_exp) {
1585 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
1586 *p_exp = exp;
1587 }
1588
1589 if(p_floor_log2)
1590 *p_floor_log2 = logexp;
1591
1592 if(p_log2)
1593 *p_log2 = res;
1594 }
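
/*
 * Scalar sketch of lp_build_log2_approx() above, for illustration only
 * (assumes LOG_POLY_DEGREE == 5 and 32bit IEEE floats): the exponent field
 * gives floor(log2(x)) and the polynomial approximates log2 of the
 * mantissa, which lies in [1, 2[.
 */
#if 0
static float
log2_ref(float x)
{
   const double *c = lp_build_log2_polynomial;
   union { float f; int32_t i; } fi;
   float logexp, mant, logmant;

   fi.f = x;
   logexp = (float)(((fi.i & 0x7f800000) >> 23) - 127);   /* floor(log2(x)) */
   fi.i = (fi.i & 0x007fffff) | 0x3f800000;               /* mant in [1, 2[ */
   mant = fi.f;
   logmant = c[0] + mant*(c[1] + mant*(c[2] + mant*(c[3] + mant*c[4])));
   logmant *= (mant - 1.0f);   /* ensures log2(1) == 0 */
   return logmant + logexp;
}
#endif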
1595
1596
1597 LLVMValueRef
1598 lp_build_log2(struct lp_build_context *bld,
1599 LLVMValueRef x)
1600 {
1601 LLVMValueRef res;
1602 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1603 return res;
1604 }