1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_string.h"
51
52 #include "lp_bld_type.h"
53 #include "lp_bld_const.h"
54 #include "lp_bld_intr.h"
55 #include "lp_bld_logic.h"
56 #include "lp_bld_arit.h"
57
58
59 /**
60 * Generate min(a, b)
61 * No checks for the special-case values of a or b (0 or 1) are done.
62 */
63 static LLVMValueRef
64 lp_build_min_simple(struct lp_build_context *bld,
65 LLVMValueRef a,
66 LLVMValueRef b)
67 {
68 const union lp_type type = bld->type;
69 const char *intrinsic = NULL;
70 LLVMValueRef cond;
71
72 /* TODO: optimize the constant case */
73
74 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
75 if(type.width * type.length == 128) {
76 if(type.floating) {
77 if(type.width == 32)
78 intrinsic = "llvm.x86.sse.min.ps";
79 if(type.width == 64)
80 intrinsic = "llvm.x86.sse2.min.pd";
81 }
82 else {
83 if(type.width == 8 && !type.sign)
84 intrinsic = "llvm.x86.sse2.pminu.b";
85 if(type.width == 8 && type.sign)
86 intrinsic = "llvm.x86.sse41.pminsb";
87 if(type.width == 16 && !type.sign)
88 intrinsic = "llvm.x86.sse41.pminuw";
89 if(type.width == 16 && type.sign)
90 intrinsic = "llvm.x86.sse2.pmins.w";
91 if(type.width == 32 && !type.sign)
92 intrinsic = "llvm.x86.sse41.pminud";
93 if(type.width == 32 && type.sign)
94 intrinsic = "llvm.x86.sse41.pminsd";
95 }
96 }
97 #endif
98
99 if(intrinsic)
100 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
101
102 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
103 return lp_build_select(bld, cond, a, b);
104 }
105
106
107 /**
108 * Generate max(a, b)
109 * No checks for the special-case values of a or b (0 or 1) are done.
110 */
111 static LLVMValueRef
112 lp_build_max_simple(struct lp_build_context *bld,
113 LLVMValueRef a,
114 LLVMValueRef b)
115 {
116 const union lp_type type = bld->type;
117 const char *intrinsic = NULL;
118 LLVMValueRef cond;
119
120 /* TODO: optimize the constant case */
121
122 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
123 if(type.width * type.length == 128) {
124 if(type.floating) {
125 if(type.width == 32)
126 intrinsic = "llvm.x86.sse.max.ps";
127 if(type.width == 64)
128 intrinsic = "llvm.x86.sse2.max.pd";
129 }
130 else {
131 if(type.width == 8 && !type.sign)
132 intrinsic = "llvm.x86.sse2.pmaxu.b";
133 if(type.width == 8 && type.sign)
134 intrinsic = "llvm.x86.sse41.pmaxsb";
135 if(type.width == 16 && !type.sign)
136 intrinsic = "llvm.x86.sse41.pmaxuw";
137 if(type.width == 16 && type.sign)
138 intrinsic = "llvm.x86.sse2.pmaxs.w";
139 if(type.width == 32 && !type.sign)
140 intrinsic = "llvm.x86.sse41.pmaxud";
141 if(type.width == 32 && type.sign)
142 intrinsic = "llvm.x86.sse41.pmaxsd";
143 }
144 }
145 #endif
146
147 if(intrinsic)
148 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
149
150 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
151 return lp_build_select(bld, cond, a, b);
152 }
153
154
155 /**
156 * Generate 1 - a, or ~a depending on bld->type.
157 */
158 LLVMValueRef
159 lp_build_comp(struct lp_build_context *bld,
160 LLVMValueRef a)
161 {
162 const union lp_type type = bld->type;
163
164 if(a == bld->one)
165 return bld->zero;
166 if(a == bld->zero)
167 return bld->one;
168
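/*
 * For unsigned normalized types, 1.0 is the all-ones bit pattern, so
 * 1 - a is simply the bitwise complement ~a.
 */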
169 if(type.norm && !type.floating && !type.fixed && !type.sign) {
170 if(LLVMIsConstant(a))
171 return LLVMConstNot(a);
172 else
173 return LLVMBuildNot(bld->builder, a, "");
174 }
175
176 if(LLVMIsConstant(a))
177 return LLVMConstSub(bld->one, a);
178 else
179 return LLVMBuildSub(bld->builder, bld->one, a, "");
180 }
181
182
183 /**
184 * Generate a + b
185 */
186 LLVMValueRef
187 lp_build_add(struct lp_build_context *bld,
188 LLVMValueRef a,
189 LLVMValueRef b)
190 {
191 const union lp_type type = bld->type;
192 LLVMValueRef res;
193
194 if(a == bld->zero)
195 return b;
196 if(b == bld->zero)
197 return a;
198 if(a == bld->undef || b == bld->undef)
199 return bld->undef;
200
201 if(bld->type.norm) {
202 const char *intrinsic = NULL;
203
204 if(a == bld->one || b == bld->one)
205 return bld->one;
206
207 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
208 if(type.width * type.length == 128 &&
209 !type.floating && !type.fixed) {
210 if(type.width == 8)
211 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
212 if(type.width == 16)
213 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
214 }
215 #endif
216
217 if(intrinsic)
218 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
219 }
220
221 if(LLVMIsConstant(a) && LLVMIsConstant(b))
222 res = LLVMConstAdd(a, b);
223 else
224 res = LLVMBuildAdd(bld->builder, a, b, "");
225
226 /* clamp to ceiling of 1.0 */
227 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
228 res = lp_build_min_simple(bld, res, bld->one);
229
230 /* XXX clamp to floor of -1 or 0??? */
231
232 return res;
233 }
234
235
236 /**
237 * Generate a - b
238 */
239 LLVMValueRef
240 lp_build_sub(struct lp_build_context *bld,
241 LLVMValueRef a,
242 LLVMValueRef b)
243 {
244 const union lp_type type = bld->type;
245 LLVMValueRef res;
246
247 if(b == bld->zero)
248 return a;
249 if(a == bld->undef || b == bld->undef)
250 return bld->undef;
251 if(a == b)
252 return bld->zero;
253
254 if(bld->type.norm) {
255 const char *intrinsic = NULL;
256
257 if(b == bld->one)
258 return bld->zero;
259
260 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
261 if(type.width * type.length == 128 &&
262 !type.floating && !type.fixed) {
263 if(type.width == 8)
264 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
265 if(type.width == 16)
266 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
267 }
268 #endif
269
270 if(intrinsic)
271 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
272 }
273
274 if(LLVMIsConstant(a) && LLVMIsConstant(b))
275 res = LLVMConstSub(a, b);
276 else
277 res = LLVMBuildSub(bld->builder, a, b, "");
278
279 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
280 res = lp_build_max_simple(bld, res, bld->zero);
281
282 return res;
283 }
284
285
286 /**
287 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
288 */
289 static LLVMValueRef
290 lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
291 {
292 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
293 unsigned i, j;
294
295 assert(n <= LP_MAX_VECTOR_LENGTH);
296 assert(lo_hi < 2);
297
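/*
 * For example, n = 16 gives the masks
 *   lo: {0, 16, 1, 17, ..., 7, 23}
 *   hi: {8, 24, 9, 25, ..., 15, 31}
 * i.e. the interleave patterns of PUNPCKLBW / PUNPCKHBW.
 */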
298 for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
299 elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
300 elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
301 }
302
303 return LLVMConstVector(elems, n);
304 }
305
306
307 /**
308 * Build a constant int vector of length 'n' with every element set to 'c'.
309 */
310 static LLVMValueRef
311 lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
312 {
313 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
314 unsigned i;
315
316 assert(n <= LP_MAX_VECTOR_LENGTH);
317
318 for(i = 0; i < n; ++i)
319 elems[i] = LLVMConstInt(type, c, 0);
320
321 return LLVMConstVector(elems, n);
322 }
323
324
325 /**
326 * Normalized 8bit multiplication.
327 *
328 * - alpha plus one
329 *
330 * makes the following approximation to the division (Sree)
331 *
332 * a*b/255 ~= (a*(b + 1)) >> 8
333 *
334 * which is the fastest method that satisfies the following OpenGL criteria
335 *
336 * 0*0 = 0 and 255*255 = 255
337 *
338 * - geometric series
339 *
340 * takes the geometric series approximation to the division
341 *
342 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
343 *
344 * in this case just the first two terms to fit in 16bit arithmetic
345 *
346 * t/255 ~= (t + (t >> 8)) >> 8
347 *
348 * note that by itself it doesn't satisfy the OpenGL criteria, as
349 * 255*255 = 254, so either the special case b = 255 must be accounted for
350 * or rounding must be used
351 *
352 * - geometric series plus rounding
353 *
354 * when using the geometric series division, instead of truncating the result,
355 * use rounding in the approximation (Jim Blinn)
356 *
357 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
358 *
359 * achieving the exact results
360 *
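 * As a quick sanity check of the three variants at the endpoints
 * (t = a*b, with a = b = 255, t = 65025):
 *
 *   alpha plus one:        (255*(255 + 1)) >> 8               = 65280 >> 8 = 255
 *   geometric series:      (65025 + (65025 >> 8)) >> 8        = 65279 >> 8 = 254
 *   series plus rounding:  (65025 + (65025 >> 8) + 0x80) >> 8 = 65407 >> 8 = 255
 *
 * and all three give 0 for a = b = 0.
 *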
361 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
362 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
363 * @sa Michael Herf, The "double blend trick", May 2000,
364 * http://www.stereopsis.com/doubleblend.html
365 */
366 static LLVMValueRef
367 lp_build_mul_u8n(LLVMBuilderRef builder,
368 LLVMValueRef a, LLVMValueRef b)
369 {
370 static LLVMValueRef c01 = NULL;
371 static LLVMValueRef c08 = NULL;
372 static LLVMValueRef c80 = NULL;
373 LLVMValueRef ab;
374
375 if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
376 if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
377 if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
378
379 #if 0
380
381 /* a*b/255 ~= (a*(b + 1)) >> 8 */
382 b = LLVMBuildAdd(builder, b, c01, "");
383 ab = LLVMBuildMul(builder, a, b, "");
384
385 #else
386
387 /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
388 ab = LLVMBuildMul(builder, a, b, "");
389 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
390 ab = LLVMBuildAdd(builder, ab, c80, "");
391
392 #endif
393
394 ab = LLVMBuildLShr(builder, ab, c08, "");
395
396 return ab;
397 }
398
399
400 /**
401 * Generate a * b
402 */
403 LLVMValueRef
404 lp_build_mul(struct lp_build_context *bld,
405 LLVMValueRef a,
406 LLVMValueRef b)
407 {
408 const union lp_type type = bld->type;
409
410 if(a == bld->zero)
411 return bld->zero;
412 if(a == bld->one)
413 return b;
414 if(b == bld->zero)
415 return bld->zero;
416 if(b == bld->one)
417 return a;
418 if(a == bld->undef || b == bld->undef)
419 return bld->undef;
420
421 if(!type.floating && !type.fixed && type.norm) {
422 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
423 if(type.width == 8 && type.length == 16) {
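/*
 * 8-bit normalized multiply: interleave each operand with zeros to widen
 * the low and high halves to 16 bits, do the scaled multiply there
 * (lp_build_mul_u8n), then pack the two halves back to 8 bits with
 * unsigned saturation (PACKUSWB).
 */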
424 LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
425 LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
426 static LLVMValueRef ml = NULL;
427 static LLVMValueRef mh = NULL;
428 LLVMValueRef al, ah, bl, bh;
429 LLVMValueRef abl, abh;
430 LLVMValueRef ab;
431
432 if(!ml) ml = lp_build_unpack_shuffle(16, 0);
433 if(!mh) mh = lp_build_unpack_shuffle(16, 1);
434
435 /* PUNPCKLBW, PUNPCKHBW */
436 al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
437 bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
438 ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
439 bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
440
441 /* NOP */
442 al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
443 bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
444 ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
445 bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
446
447 /* PMULLW, PSRLW, PADDW */
448 abl = lp_build_mul_u8n(bld->builder, al, bl);
449 abh = lp_build_mul_u8n(bld->builder, ah, bh);
450
451 /* PACKUSWB */
452 ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128", i16x8, abl, abh);
453
454 /* NOP */
455 ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
456
457 return ab;
458 }
459 #endif
460
461 /* FIXME */
462 assert(0);
463 }
464
465 if(LLVMIsConstant(a) && LLVMIsConstant(b))
466 return LLVMConstMul(a, b);
467
468 return LLVMBuildMul(bld->builder, a, b, "");
469 }
470
471
472 /**
473 * Generate a / b
474 */
475 LLVMValueRef
476 lp_build_div(struct lp_build_context *bld,
477 LLVMValueRef a,
478 LLVMValueRef b)
479 {
480 const union lp_type type = bld->type;
481
482 if(a == bld->zero)
483 return bld->zero;
484 if(a == bld->one)
485 return lp_build_rcp(bld, b);
486 if(b == bld->zero)
487 return bld->undef;
488 if(b == bld->one)
489 return a;
490 if(a == bld->undef || b == bld->undef)
491 return bld->undef;
492
493 if(LLVMIsConstant(a) && LLVMIsConstant(b))
494 return LLVMConstFDiv(a, b);
495
496 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
497 if(type.width == 32 && type.length == 4)
498 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
499 #endif
500
501 return LLVMBuildFDiv(bld->builder, a, b, "");
502 }
503
504
505 /**
506 * Generate min(a, b)
507 * Do checks for special cases.
508 */
509 LLVMValueRef
510 lp_build_min(struct lp_build_context *bld,
511 LLVMValueRef a,
512 LLVMValueRef b)
513 {
514 if(a == bld->undef || b == bld->undef)
515 return bld->undef;
516
517 if(a == b)
518 return a;
519
520 if(bld->type.norm) {
521 if(a == bld->zero || b == bld->zero)
522 return bld->zero;
523 if(a == bld->one)
524 return b;
525 if(b == bld->one)
526 return a;
527 }
528
529 return lp_build_min_simple(bld, a, b);
530 }
531
532
533 /**
534 * Generate max(a, b)
535 * Do checks for special cases.
536 */
537 LLVMValueRef
538 lp_build_max(struct lp_build_context *bld,
539 LLVMValueRef a,
540 LLVMValueRef b)
541 {
542 if(a == bld->undef || b == bld->undef)
543 return bld->undef;
544
545 if(a == b)
546 return a;
547
548 if(bld->type.norm) {
549 if(a == bld->one || b == bld->one)
550 return bld->one;
551 if(a == bld->zero)
552 return b;
553 if(b == bld->zero)
554 return a;
555 }
556
557 return lp_build_max_simple(bld, a, b);
558 }
559
560
561 /**
562 * Generate abs(a)
563 */
564 LLVMValueRef
565 lp_build_abs(struct lp_build_context *bld,
566 LLVMValueRef a)
567 {
568 const union lp_type type = bld->type;
569
570 if(!type.sign)
571 return a;
572
573 /* XXX: is this really necessary? */
574 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
575 if(!type.floating && type.width*type.length == 128) {
576 LLVMTypeRef vec_type = lp_build_vec_type(type);
577 if(type.width == 8)
578 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
579 if(type.width == 16)
580 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
581 if(type.width == 32)
582 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
583 }
584 #endif
585
586 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
587 }
588
589
590 LLVMValueRef
591 lp_build_sqrt(struct lp_build_context *bld,
592 LLVMValueRef a)
593 {
594 const union lp_type type = bld->type;
595 LLVMTypeRef vec_type = lp_build_vec_type(type);
596 char intrinsic[32];
597
598 /* TODO: optimize the constant case */
600
601 assert(type.floating);
602 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
603
604 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
605 }
606
607
608 LLVMValueRef
609 lp_build_rcp(struct lp_build_context *bld,
610 LLVMValueRef a)
611 {
612 const union lp_type type = bld->type;
613
614 if(a == bld->zero)
615 return bld->undef;
616 if(a == bld->one)
617 return bld->one;
618 if(a == bld->undef)
619 return bld->undef;
620
621 assert(type.floating);
622
623 if(LLVMIsConstant(a))
624 return LLVMConstFDiv(bld->one, a);
625
626 /* XXX: is this really necessary? */
627 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
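/* Note: rcpps is only an approximation, accurate to roughly 12 bits */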
628 if(type.width == 32 && type.length == 4)
629 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
630 #endif
631
632 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
633 }
634
635
636 /**
637 * Generate 1/sqrt(a)
638 */
639 LLVMValueRef
640 lp_build_rsqrt(struct lp_build_context *bld,
641 LLVMValueRef a)
642 {
643 const union lp_type type = bld->type;
644
645 assert(type.floating);
646
647 /* XXX: is this really necessary? */
648 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
649 if(type.width == 32 && type.length == 4)
650 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
651 #endif
652
653 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
654 }
655
656
657 /**
658 * Generate cos(a)
659 */
660 LLVMValueRef
661 lp_build_cos(struct lp_build_context *bld,
662 LLVMValueRef a)
663 {
664 const union lp_type type = bld->type;
665 LLVMTypeRef vec_type = lp_build_vec_type(type);
666 char intrinsic[32];
667
668 /* TODO: optimize the constant case */
669
670 assert(type.floating);
671 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
672
673 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
674 }
675
676
677 /**
678 * Generate sin(a)
679 */
680 LLVMValueRef
681 lp_build_sin(struct lp_build_context *bld,
682 LLVMValueRef a)
683 {
684 const union lp_type type = bld->type;
685 LLVMTypeRef vec_type = lp_build_vec_type(type);
686 char intrinsic[32];
687
688 /* TODO: optimize the constant case */
689
690 assert(type.floating);
691 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
692
693 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
694 }
695
696
697 /**
698 * Generate pow(x, y)
699 */
700 LLVMValueRef
701 lp_build_pow(struct lp_build_context *bld,
702 LLVMValueRef x,
703 LLVMValueRef y)
704 {
705 /* TODO: optimize the constant case */
706 if(LLVMIsConstant(x) && LLVMIsConstant(y))
707 debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__);
708
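/* x^y = 2^(y * log2(x)) */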
709 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
710 }
711
712
713 /**
714 * Generate exp(x)
715 */
716 LLVMValueRef
717 lp_build_exp(struct lp_build_context *bld,
718 LLVMValueRef x)
719 {
720 /* log2(e) = 1/log(2) */
721 LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
722
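/* e^x = 2^(x * log2(e)) */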
723 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
724 }
725
726
727 /**
728 * Generate log(x)
729 */
730 LLVMValueRef
731 lp_build_log(struct lp_build_context *bld,
732 LLVMValueRef x)
733 {
734 /* log(2) */
735 LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
736
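/* ln(x) = log2(x) * ln(2) */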
737 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
738 }
739
740
741 #define EXP_POLY_DEGREE 3
742 #define LOG_POLY_DEGREE 5
743
744
745 /**
746 * Generate polynomial.
747 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
748 */
749 static LLVMValueRef
750 lp_build_polynomial(struct lp_build_context *bld,
751 LLVMValueRef x,
752 const double *coeffs,
753 unsigned num_coeffs)
754 {
755 const union lp_type type = bld->type;
756 LLVMValueRef res = NULL;
757 unsigned i;
758
759 /* TODO: optimize the constant case */
760 if(LLVMIsConstant(x))
761 debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__);
762
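/*
 * Evaluate with Horner's scheme:
 *   coeffs[0] + x*(coeffs[1] + x*(coeffs[2] + ...))
 */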
763 for (i = num_coeffs; i--; ) {
764 LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
765 if(res)
766 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
767 else
768 res = coeff;
769 }
770
771 if(res)
772 return res;
773 else
774 return bld->undef;
775 }
776
777
778 /**
779 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
780 */
781 const double lp_build_exp2_polynomial[] = {
782 #if EXP_POLY_DEGREE == 5
783 9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
784 #elif EXP_POLY_DEGREE == 4
785 1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
786 #elif EXP_POLY_DEGREE == 3
787 9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
788 #elif EXP_POLY_DEGREE == 2
789 1.0017247, 6.5763628e-1, 3.3718944e-1
790 #else
791 #error
792 #endif
793 };
794
795
796 void
797 lp_build_exp2_approx(struct lp_build_context *bld,
798 LLVMValueRef x,
799 LLVMValueRef *p_exp2_int_part,
800 LLVMValueRef *p_frac_part,
801 LLVMValueRef *p_exp2)
802 {
803 const union lp_type type = bld->type;
804 LLVMTypeRef vec_type = lp_build_vec_type(type);
805 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
806 LLVMValueRef ipart = NULL;
807 LLVMValueRef fpart = NULL;
808 LLVMValueRef expipart = NULL;
809 LLVMValueRef expfpart = NULL;
810 LLVMValueRef res = NULL;
811
812 if(p_exp2_int_part || p_frac_part || p_exp2) {
813 /* TODO: optimize the constant case */
814 if(LLVMIsConstant(x))
815 debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__);
816
817 assert(type.floating && type.width == 32);
818
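/*
 * 2^x is computed as 2^ipart * 2^fpart: 2^ipart is built directly in the
 * exponent field of an IEEE-754 single ((ipart + 127) << 23), and 2^fpart
 * is approximated with the minimax polynomial above.
 */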
819 x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
820 x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
821
822 /* ipart = int(x - 0.5) */
823 ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
824 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
825
826 /* fpart = x - ipart */
827 fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
828 fpart = LLVMBuildSub(bld->builder, x, fpart, "");
829 }
830
831 if(p_exp2_int_part || p_exp2) {
832 /* expipart = (float) (1 << ipart) */
833 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
834 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
835 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
836 }
837
838 if(p_exp2) {
839 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
840 Elements(lp_build_exp2_polynomial));
841
842 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
843 }
844
845 if(p_exp2_int_part)
846 *p_exp2_int_part = expipart;
847
848 if(p_frac_part)
849 *p_frac_part = fpart;
850
851 if(p_exp2)
852 *p_exp2 = res;
853 }
854
855
856 LLVMValueRef
857 lp_build_exp2(struct lp_build_context *bld,
858 LLVMValueRef x)
859 {
860 LLVMValueRef res;
861 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
862 return res;
863 }
864
865
866 /**
867 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
868 * These coefficients can be generated with
869 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
870 */
871 const double lp_build_log2_polynomial[] = {
872 #if LOG_POLY_DEGREE == 6
873 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
874 #elif LOG_POLY_DEGREE == 5
875 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
876 #elif LOG_POLY_DEGREE == 4
877 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
878 #elif LOG_POLY_DEGREE == 3
879 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
880 #else
881 #error
882 #endif
883 };
884
885
886 /**
887 * See http://www.devmaster.net/forums/showthread.php?p=43580
888 */
889 void
890 lp_build_log2_approx(struct lp_build_context *bld,
891 LLVMValueRef x,
892 LLVMValueRef *p_exp,
893 LLVMValueRef *p_floor_log2,
894 LLVMValueRef *p_log2)
895 {
896 const union lp_type type = bld->type;
897 LLVMTypeRef vec_type = lp_build_vec_type(type);
898 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
899
900 LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
901 LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
902 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
903
904 LLVMValueRef i = NULL;
905 LLVMValueRef exp = NULL;
906 LLVMValueRef mant = NULL;
907 LLVMValueRef logexp = NULL;
908 LLVMValueRef logmant = NULL;
909 LLVMValueRef res = NULL;
910
911 if(p_exp || p_floor_log2 || p_log2) {
912 /* TODO: optimize the constant case */
913 if(LLVMIsConstant(x))
914 debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__);
915
916 assert(type.floating && type.width == 32);
917
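/*
 * Decompose x = 2^exp * mant with mant in [1, 2).  Then
 * log2(x) = exp + log2(mant), where log2(mant) is evaluated as
 * poly(mant) * (mant - 1) from the minimax fit of log2(x)/(x - 1) above.
 */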
918 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
919
920 /* exp = (float) exponent(x) */
921 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
922 }
923
924 if(p_floor_log2 || p_log2) {
925 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
926 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
927 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
928 }
929
930 if(p_log2) {
931 /* mant = 1.0 + mantissa(x), i.e. the significand of x, in [1, 2) */
932 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
933 mant = LLVMBuildOr(bld->builder, mant, one, "");
934 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
935
936 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
937 Elements(lp_build_log2_polynomial));
938
939 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
940 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
941
942 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
943 }
944
945 if(p_exp)
946 *p_exp = exp;
947
948 if(p_floor_log2)
949 *p_floor_log2 = logexp;
950
951 if(p_log2)
952 *p_log2 = res;
953 }
954
955
956 LLVMValueRef
957 lp_build_log2(struct lp_build_context *bld,
958 LLVMValueRef x)
959 {
960 LLVMValueRef res;
961 lp_build_log2_approx(bld, x, NULL, NULL, &res);
962 return res;
963 }