Merge branch 'master' into asm-shader-rework-2
[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - We often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_string.h"
51
52 #include "lp_bld_type.h"
53 #include "lp_bld_const.h"
54 #include "lp_bld_intr.h"
55 #include "lp_bld_logic.h"
56 #include "lp_bld_arit.h"
57
58
59 /**
60 * Generate min(a, b)
61 * No checks for special case values of a or b = 1 or 0 are done.
62 */
63 static LLVMValueRef
64 lp_build_min_simple(struct lp_build_context *bld,
65 LLVMValueRef a,
66 LLVMValueRef b)
67 {
68 const union lp_type type = bld->type;
69 const char *intrinsic = NULL;
70 LLVMValueRef cond;
71
72 /* TODO: optimize the constant case */
73
74 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
75 if(type.width * type.length == 128) {
76 if(type.floating) {
77 if(type.width == 32)
78 intrinsic = "llvm.x86.sse.min.ps";
79 if(type.width == 64)
80 intrinsic = "llvm.x86.sse2.min.pd";
81 }
82 else {
83 if(type.width == 8 && !type.sign)
84 intrinsic = "llvm.x86.sse2.pminu.b";
85 if(type.width == 8 && type.sign)
86 intrinsic = "llvm.x86.sse41.pminsb";
87 if(type.width == 16 && !type.sign)
88 intrinsic = "llvm.x86.sse41.pminuw";
89 if(type.width == 16 && type.sign)
90 intrinsic = "llvm.x86.sse2.pmins.w";
91 if(type.width == 32 && !type.sign)
92 intrinsic = "llvm.x86.sse41.pminud";
93 if(type.width == 32 && type.sign)
94 intrinsic = "llvm.x86.sse41.pminsd";
95 }
96 }
97 #endif
98
99 if(intrinsic)
100 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
101
102 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
103 return lp_build_select(bld, cond, a, b);
104 }
105
106
107 /**
108 * Generate max(a, b)
109 * No checks for special case values of a or b = 1 or 0 are done.
110 */
111 static LLVMValueRef
112 lp_build_max_simple(struct lp_build_context *bld,
113 LLVMValueRef a,
114 LLVMValueRef b)
115 {
116 const union lp_type type = bld->type;
117 const char *intrinsic = NULL;
118 LLVMValueRef cond;
119
120 /* TODO: optimize the constant case */
121
122 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
123 if(type.width * type.length == 128) {
124 if(type.floating) {
125 if(type.width == 32)
126 intrinsic = "llvm.x86.sse.max.ps";
127 if(type.width == 64)
128 intrinsic = "llvm.x86.sse2.max.pd";
129 }
130 else {
131 if(type.width == 8 && !type.sign)
132 intrinsic = "llvm.x86.sse2.pmaxu.b";
133 if(type.width == 8 && type.sign)
134 intrinsic = "llvm.x86.sse41.pmaxsb";
135 if(type.width == 16 && !type.sign)
136 intrinsic = "llvm.x86.sse41.pmaxuw";
137 if(type.width == 16 && type.sign)
138 intrinsic = "llvm.x86.sse2.pmaxs.w";
139 if(type.width == 32 && !type.sign)
140 intrinsic = "llvm.x86.sse41.pmaxud";
141 if(type.width == 32 && type.sign)
142 intrinsic = "llvm.x86.sse41.pmaxsd";
143 }
144 }
145 #endif
146
147 if(intrinsic)
148 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
149
150 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
151 return lp_build_select(bld, cond, a, b);
152 }
153
154
155 /**
156 * Generate 1 - a, or ~a depending on bld->type.
157 */
158 LLVMValueRef
159 lp_build_comp(struct lp_build_context *bld,
160 LLVMValueRef a)
161 {
162 const union lp_type type = bld->type;
163
164 if(a == bld->one)
165 return bld->zero;
166 if(a == bld->zero)
167 return bld->one;
168
169 if(type.norm && !type.floating && !type.fixed && !type.sign) {
170 if(LLVMIsConstant(a))
171 return LLVMConstNot(a);
172 else
173 return LLVMBuildNot(bld->builder, a, "");
174 }
175
176 if(LLVMIsConstant(a))
177 return LLVMConstSub(bld->one, a);
178 else
179 return LLVMBuildSub(bld->builder, bld->one, a, "");
180 }
181
182
183 /**
184 * Generate a + b
185 */
186 LLVMValueRef
187 lp_build_add(struct lp_build_context *bld,
188 LLVMValueRef a,
189 LLVMValueRef b)
190 {
191 const union lp_type type = bld->type;
192 LLVMValueRef res;
193
194 if(a == bld->zero)
195 return b;
196 if(b == bld->zero)
197 return a;
198 if(a == bld->undef || b == bld->undef)
199 return bld->undef;
200
201 if(bld->type.norm) {
202 const char *intrinsic = NULL;
203
204 if(a == bld->one || b == bld->one)
205 return bld->one;
206
207 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
208 if(type.width * type.length == 128 &&
209 !type.floating && !type.fixed) {
210 if(type.width == 8)
211 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
212 if(type.width == 16)
213 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
214 }
215 #endif
216
217 if(intrinsic)
218 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
219 }
220
221 if(LLVMIsConstant(a) && LLVMIsConstant(b))
222 res = LLVMConstAdd(a, b);
223 else
224 res = LLVMBuildAdd(bld->builder, a, b, "");
225
226 /* clamp to ceiling of 1.0 */
227 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
228 res = lp_build_min_simple(bld, res, bld->one);
229
230 /* XXX clamp to floor of -1 or 0??? */
231
232 return res;
233 }
234
235
236 /**
237 * Generate a - b
238 */
239 LLVMValueRef
240 lp_build_sub(struct lp_build_context *bld,
241 LLVMValueRef a,
242 LLVMValueRef b)
243 {
244 const union lp_type type = bld->type;
245 LLVMValueRef res;
246
247 if(b == bld->zero)
248 return a;
249 if(a == bld->undef || b == bld->undef)
250 return bld->undef;
251 if(a == b)
252 return bld->zero;
253
254 if(bld->type.norm) {
255 const char *intrinsic = NULL;
256
257 if(b == bld->one)
258 return bld->zero;
259
260 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
261 if(type.width * type.length == 128 &&
262 !type.floating && !type.fixed) {
263 if(type.width == 8)
264 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
265 if(type.width == 16)
266 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
267 }
268 #endif
269
270 if(intrinsic)
271 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
272 }
273
274 if(LLVMIsConstant(a) && LLVMIsConstant(b))
275 res = LLVMConstSub(a, b);
276 else
277 res = LLVMBuildSub(bld->builder, a, b, "");
278
279 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
280 res = lp_build_max_simple(bld, res, bld->zero);
281
282 return res;
283 }
284
285
286 /**
287 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
288 */
289 static LLVMValueRef
290 lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
291 {
292 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
293 unsigned i, j;
294
295 assert(n <= LP_MAX_VECTOR_LENGTH);
296 assert(lo_hi < 2);
297
298 for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
299 elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
300 elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
301 }
302
303 return LLVMConstVector(elems, n);
304 }
305
306
307 /**
308 * Build constant int vector of width 'n' and value 'c'.
309 */
310 static LLVMValueRef
311 lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
312 {
313 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
314 unsigned i;
315
316 assert(n <= LP_MAX_VECTOR_LENGTH);
317
318 for(i = 0; i < n; ++i)
319 elems[i] = LLVMConstInt(type, c, 0);
320
321 return LLVMConstVector(elems, n);
322 }
323
324
325 /**
326 * Normalized 8bit multiplication.
327 *
328 * - alpha plus one
329 *
330 * makes the following approximation to the division (Sree)
331 *
332 * a*b/255 ~= (a*(b + 1)) >> 256
333 *
334 * which is the fastest method that satisfies the following OpenGL criteria
335 *
336 * 0*0 = 0 and 255*255 = 255
337 *
338 * - geometric series
339 *
340 * takes the geometric series approximation to the division
341 *
342 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
343 *
344 * in this case just the first two terms to fit in 16bit arithmetic
345 *
346 * t/255 ~= (t + (t >> 8)) >> 8
347 *
348 * note that just by itself it doesn't satisfies the OpenGL criteria, as
349 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
350 * must be used
351 *
352 * - geometric series plus rounding
353 *
354 * when using a geometric series division instead of truncating the result
355 * use roundoff in the approximation (Jim Blinn)
356 *
357 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
358 *
359 * achieving the exact results
360 *
361 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
362 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
363 * @sa Michael Herf, The "double blend trick", May 2000,
364 * http://www.stereopsis.com/doubleblend.html
365 */
366 static LLVMValueRef
367 lp_build_mul_u8n(LLVMBuilderRef builder,
368 LLVMValueRef a, LLVMValueRef b)
369 {
370 static LLVMValueRef c01 = NULL;
371 static LLVMValueRef c08 = NULL;
372 static LLVMValueRef c80 = NULL;
373 LLVMValueRef ab;
374
375 if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
376 if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
377 if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
378
379 #if 0
380
381 /* a*b/255 ~= (a*(b + 1)) >> 256 */
382 b = LLVMBuildAdd(builder, b, c01, "");
383 ab = LLVMBuildMul(builder, a, b, "");
384
385 #else
386
387 /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
388 ab = LLVMBuildMul(builder, a, b, "");
389 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
390 ab = LLVMBuildAdd(builder, ab, c80, "");
391
392 #endif
393
394 ab = LLVMBuildLShr(builder, ab, c08, "");
395
396 return ab;
397 }
398
399
400 /**
401 * Generate a * b
402 */
403 LLVMValueRef
404 lp_build_mul(struct lp_build_context *bld,
405 LLVMValueRef a,
406 LLVMValueRef b)
407 {
408 const union lp_type type = bld->type;
409
410 if(a == bld->zero)
411 return bld->zero;
412 if(a == bld->one)
413 return b;
414 if(b == bld->zero)
415 return bld->zero;
416 if(b == bld->one)
417 return a;
418 if(a == bld->undef || b == bld->undef)
419 return bld->undef;
420
421 if(!type.floating && !type.fixed && type.norm) {
422 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
423 if(type.width == 8 && type.length == 16) {
424 LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
425 LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
426 static LLVMValueRef ml = NULL;
427 static LLVMValueRef mh = NULL;
428 LLVMValueRef al, ah, bl, bh;
429 LLVMValueRef abl, abh;
430 LLVMValueRef ab;
431
432 if(!ml) ml = lp_build_unpack_shuffle(16, 0);
433 if(!mh) mh = lp_build_unpack_shuffle(16, 1);
434
435 /* PUNPCKLBW, PUNPCKHBW */
436 al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
437 bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
438 ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
439 bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
440
441 /* NOP */
442 al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
443 bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
444 ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
445 bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
446
447 /* PMULLW, PSRLW, PADDW */
448 abl = lp_build_mul_u8n(bld->builder, al, bl);
449 abh = lp_build_mul_u8n(bld->builder, ah, bh);
450
451 /* PACKUSWB */
452 ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , i16x8, abl, abh);
453
454 /* NOP */
455 ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
456
457 return ab;
458 }
459 #endif
460
461 /* FIXME */
462 assert(0);
463 }
464
465 if(LLVMIsConstant(a) && LLVMIsConstant(b))
466 return LLVMConstMul(a, b);
467
468 return LLVMBuildMul(bld->builder, a, b, "");
469 }
470
471
472 /**
473 * Generate a / b
474 */
475 LLVMValueRef
476 lp_build_div(struct lp_build_context *bld,
477 LLVMValueRef a,
478 LLVMValueRef b)
479 {
480 const union lp_type type = bld->type;
481
482 if(a == bld->zero)
483 return bld->zero;
484 if(a == bld->one)
485 return lp_build_rcp(bld, b);
486 if(b == bld->zero)
487 return bld->undef;
488 if(b == bld->one)
489 return a;
490 if(a == bld->undef || b == bld->undef)
491 return bld->undef;
492
493 if(LLVMIsConstant(a) && LLVMIsConstant(b))
494 return LLVMConstFDiv(a, b);
495
496 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
497 if(type.width == 32 && type.length == 4)
498 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
499 #endif
500
501 return LLVMBuildFDiv(bld->builder, a, b, "");
502 }
503
504
505 LLVMValueRef
506 lp_build_lerp(struct lp_build_context *bld,
507 LLVMValueRef x,
508 LLVMValueRef v0,
509 LLVMValueRef v1)
510 {
511 return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
512 }
513
514
515 LLVMValueRef
516 lp_build_lerp_2d(struct lp_build_context *bld,
517 LLVMValueRef x,
518 LLVMValueRef y,
519 LLVMValueRef v00,
520 LLVMValueRef v01,
521 LLVMValueRef v10,
522 LLVMValueRef v11)
523 {
524 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
525 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
526 return lp_build_lerp(bld, y, v0, v1);
527 }
528
529
530 /**
531 * Generate min(a, b)
532 * Do checks for special cases.
533 */
534 LLVMValueRef
535 lp_build_min(struct lp_build_context *bld,
536 LLVMValueRef a,
537 LLVMValueRef b)
538 {
539 if(a == bld->undef || b == bld->undef)
540 return bld->undef;
541
542 if(a == b)
543 return a;
544
545 if(bld->type.norm) {
546 if(a == bld->zero || b == bld->zero)
547 return bld->zero;
548 if(a == bld->one)
549 return b;
550 if(b == bld->one)
551 return a;
552 }
553
554 return lp_build_min_simple(bld, a, b);
555 }
556
557
558 /**
559 * Generate max(a, b)
560 * Do checks for special cases.
561 */
562 LLVMValueRef
563 lp_build_max(struct lp_build_context *bld,
564 LLVMValueRef a,
565 LLVMValueRef b)
566 {
567 if(a == bld->undef || b == bld->undef)
568 return bld->undef;
569
570 if(a == b)
571 return a;
572
573 if(bld->type.norm) {
574 if(a == bld->one || b == bld->one)
575 return bld->one;
576 if(a == bld->zero)
577 return b;
578 if(b == bld->zero)
579 return a;
580 }
581
582 return lp_build_max_simple(bld, a, b);
583 }
584
585
586 /**
587 * Generate abs(a)
588 */
589 LLVMValueRef
590 lp_build_abs(struct lp_build_context *bld,
591 LLVMValueRef a)
592 {
593 const union lp_type type = bld->type;
594 LLVMTypeRef vec_type = lp_build_vec_type(type);
595
596 if(!type.sign)
597 return a;
598
599 if(type.floating) {
600 /* Mask out the sign bit */
601 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
602 LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << type.width) - 1);
603 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
604 a = LLVMBuildAnd(bld->builder, a, mask, "");
605 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
606 return a;
607 }
608
609 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
610 if(type.width*type.length == 128) {
611 switch(type.width) {
612 case 8:
613 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
614 case 16:
615 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
616 case 32:
617 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
618 }
619 }
620 #endif
621
622 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
623 }
624
625
/**
 * Rounding modes accepted by lp_build_round_sse41.
 *
 * The numeric value of each enumerator is passed unchanged as the second
 * (integer) argument of the llvm.x86.sse41.round.* intrinsics.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};
633
634
635 static INLINE LLVMValueRef
636 lp_build_round_sse41(struct lp_build_context *bld,
637 LLVMValueRef a,
638 enum lp_build_round_sse41_mode mode)
639 {
640 const union lp_type type = bld->type;
641 LLVMTypeRef vec_type = lp_build_vec_type(type);
642 const char *intrinsic;
643
644 assert(type.floating);
645 assert(type.width*type.length == 128);
646
647 switch(type.width) {
648 case 32:
649 intrinsic = "llvm.x86.sse41.round.ps";
650 break;
651 case 64:
652 intrinsic = "llvm.x86.sse41.round.pd";
653 break;
654 default:
655 assert(0);
656 return bld->undef;
657 }
658
659 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
660 LLVMConstInt(LLVMInt32Type(), mode, 0));
661 }
662
663
664 LLVMValueRef
665 lp_build_floor(struct lp_build_context *bld,
666 LLVMValueRef a)
667 {
668 const union lp_type type = bld->type;
669
670 assert(type.floating);
671
672 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
673 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
674 #endif
675
676 /* FIXME */
677 assert(0);
678 return bld->undef;
679 }
680
681
682 /**
683 * Convert to integer, through whichever rounding method that's fastest,
684 * typically truncating to zero.
685 */
686 LLVMValueRef
687 lp_build_int(struct lp_build_context *bld,
688 LLVMValueRef a)
689 {
690 const union lp_type type = bld->type;
691 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
692
693 assert(type.floating);
694
695 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
696 }
697
698
699 LLVMValueRef
700 lp_build_ifloor(struct lp_build_context *bld,
701 LLVMValueRef a)
702 {
703 a = lp_build_floor(bld, a);
704 a = lp_build_int(bld, a);
705 return a;
706 }
707
708
709 LLVMValueRef
710 lp_build_sqrt(struct lp_build_context *bld,
711 LLVMValueRef a)
712 {
713 const union lp_type type = bld->type;
714 LLVMTypeRef vec_type = lp_build_vec_type(type);
715 char intrinsic[32];
716
717 /* TODO: optimize the constant case */
718 /* TODO: optimize the constant case */
719
720 assert(type.floating);
721 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
722
723 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
724 }
725
726
727 LLVMValueRef
728 lp_build_rcp(struct lp_build_context *bld,
729 LLVMValueRef a)
730 {
731 const union lp_type type = bld->type;
732
733 if(a == bld->zero)
734 return bld->undef;
735 if(a == bld->one)
736 return bld->one;
737 if(a == bld->undef)
738 return bld->undef;
739
740 assert(type.floating);
741
742 if(LLVMIsConstant(a))
743 return LLVMConstFDiv(bld->one, a);
744
745 /* XXX: is this really necessary? */
746 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
747 if(type.width == 32 && type.length == 4)
748 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
749 #endif
750
751 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
752 }
753
754
755 /**
756 * Generate 1/sqrt(a)
757 */
758 LLVMValueRef
759 lp_build_rsqrt(struct lp_build_context *bld,
760 LLVMValueRef a)
761 {
762 const union lp_type type = bld->type;
763
764 assert(type.floating);
765
766 /* XXX: is this really necessary? */
767 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
768 if(type.width == 32 && type.length == 4)
769 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
770 #endif
771
772 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
773 }
774
775
776 /**
777 * Generate cos(a)
778 */
779 LLVMValueRef
780 lp_build_cos(struct lp_build_context *bld,
781 LLVMValueRef a)
782 {
783 const union lp_type type = bld->type;
784 LLVMTypeRef vec_type = lp_build_vec_type(type);
785 char intrinsic[32];
786
787 /* TODO: optimize the constant case */
788
789 assert(type.floating);
790 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
791
792 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
793 }
794
795
796 /**
797 * Generate sin(a)
798 */
799 LLVMValueRef
800 lp_build_sin(struct lp_build_context *bld,
801 LLVMValueRef a)
802 {
803 const union lp_type type = bld->type;
804 LLVMTypeRef vec_type = lp_build_vec_type(type);
805 char intrinsic[32];
806
807 /* TODO: optimize the constant case */
808
809 assert(type.floating);
810 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
811
812 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
813 }
814
815
816 /**
817 * Generate pow(x, y)
818 */
819 LLVMValueRef
820 lp_build_pow(struct lp_build_context *bld,
821 LLVMValueRef x,
822 LLVMValueRef y)
823 {
824 /* TODO: optimize the constant case */
825 if(LLVMIsConstant(x) && LLVMIsConstant(y))
826 debug_printf("%s: inefficient/imprecise constant arithmetic\n");
827
828 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
829 }
830
831
832 /**
833 * Generate exp(x)
834 */
835 LLVMValueRef
836 lp_build_exp(struct lp_build_context *bld,
837 LLVMValueRef x)
838 {
839 /* log2(e) = 1/log(2) */
840 LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
841
842 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
843 }
844
845
846 /**
847 * Generate log(x)
848 */
849 LLVMValueRef
850 lp_build_log(struct lp_build_context *bld,
851 LLVMValueRef x)
852 {
853 /* log(2) */
854 LLVMValueRef log2 = lp_build_const_scalar(bld->type, 1.4426950408889634);
855
856 return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
857 }
858
859
/*
 * Degrees of the polynomial approximations; they select which coefficient
 * set is compiled into lp_build_exp2_polynomial and lp_build_log2_polynomial
 * respectively.
 */
#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5
862
863
864 /**
865 * Generate polynomial.
866 * Ex: x^2 * coeffs[0] + x * coeffs[1] + coeffs[2].
867 */
868 static LLVMValueRef
869 lp_build_polynomial(struct lp_build_context *bld,
870 LLVMValueRef x,
871 const double *coeffs,
872 unsigned num_coeffs)
873 {
874 const union lp_type type = bld->type;
875 LLVMValueRef res = NULL;
876 unsigned i;
877
878 /* TODO: optimize the constant case */
879 if(LLVMIsConstant(x))
880 debug_printf("%s: inefficient/imprecise constant arithmetic\n");
881
882 for (i = num_coeffs; i--; ) {
883 LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
884 if(res)
885 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
886 else
887 res = coeff;
888 }
889
890 if(res)
891 return res;
892 else
893 return bld->undef;
894 }
895
896
/**
 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
 *
 * One coefficient set per supported EXP_POLY_DEGREE; any other degree is a
 * compile error. The coefficients are consumed by lp_build_polynomial, which
 * treats the first entry as the constant term.
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
#elif EXP_POLY_DEGREE == 4
   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
#elif EXP_POLY_DEGREE == 3
   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
#elif EXP_POLY_DEGREE == 2
   1.0017247, 6.5763628e-1, 3.3718944e-1
#else
#error
#endif
};
913
914
915 void
916 lp_build_exp2_approx(struct lp_build_context *bld,
917 LLVMValueRef x,
918 LLVMValueRef *p_exp2_int_part,
919 LLVMValueRef *p_frac_part,
920 LLVMValueRef *p_exp2)
921 {
922 const union lp_type type = bld->type;
923 LLVMTypeRef vec_type = lp_build_vec_type(type);
924 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
925 LLVMValueRef ipart = NULL;
926 LLVMValueRef fpart = NULL;
927 LLVMValueRef expipart = NULL;
928 LLVMValueRef expfpart = NULL;
929 LLVMValueRef res = NULL;
930
931 if(p_exp2_int_part || p_frac_part || p_exp2) {
932 /* TODO: optimize the constant case */
933 if(LLVMIsConstant(x))
934 debug_printf("%s: inefficient/imprecise constant arithmetic\n");
935
936 assert(type.floating && type.width == 32);
937
938 x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
939 x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
940
941 /* ipart = int(x - 0.5) */
942 ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
943 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
944
945 /* fpart = x - ipart */
946 fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
947 fpart = LLVMBuildSub(bld->builder, x, fpart, "");
948 }
949
950 if(p_exp2_int_part || p_exp2) {
951 /* expipart = (float) (1 << ipart) */
952 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
953 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
954 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
955 }
956
957 if(p_exp2) {
958 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
959 Elements(lp_build_exp2_polynomial));
960
961 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
962 }
963
964 if(p_exp2_int_part)
965 *p_exp2_int_part = expipart;
966
967 if(p_frac_part)
968 *p_frac_part = fpart;
969
970 if(p_exp2)
971 *p_exp2 = res;
972 }
973
974
975 LLVMValueRef
976 lp_build_exp2(struct lp_build_context *bld,
977 LLVMValueRef x)
978 {
979 LLVMValueRef res;
980 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
981 return res;
982 }
983
984
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 *
 * One coefficient set per supported LOG_POLY_DEGREE; any other degree is a
 * compile error. The coefficients are consumed by lp_build_polynomial, which
 * treats the first entry as the constant term.
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
#else
#error
#endif
};
1003
1004
1005 /**
1006 * See http://www.devmaster.net/forums/showthread.php?p=43580
1007 */
1008 void
1009 lp_build_log2_approx(struct lp_build_context *bld,
1010 LLVMValueRef x,
1011 LLVMValueRef *p_exp,
1012 LLVMValueRef *p_floor_log2,
1013 LLVMValueRef *p_log2)
1014 {
1015 const union lp_type type = bld->type;
1016 LLVMTypeRef vec_type = lp_build_vec_type(type);
1017 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1018
1019 LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
1020 LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
1021 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1022
1023 LLVMValueRef i = NULL;
1024 LLVMValueRef exp = NULL;
1025 LLVMValueRef mant = NULL;
1026 LLVMValueRef logexp = NULL;
1027 LLVMValueRef logmant = NULL;
1028 LLVMValueRef res = NULL;
1029
1030 if(p_exp || p_floor_log2 || p_log2) {
1031 /* TODO: optimize the constant case */
1032 if(LLVMIsConstant(x))
1033 debug_printf("%s: inefficient/imprecise constant arithmetic\n");
1034
1035 assert(type.floating && type.width == 32);
1036
1037 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1038
1039 /* exp = (float) exponent(x) */
1040 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1041 }
1042
1043 if(p_floor_log2 || p_log2) {
1044 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
1045 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
1046 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1047 }
1048
1049 if(p_log2) {
1050 /* mant = (float) mantissa(x) */
1051 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1052 mant = LLVMBuildOr(bld->builder, mant, one, "");
1053 mant = LLVMBuildSIToFP(bld->builder, mant, vec_type, "");
1054
1055 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1056 Elements(lp_build_log2_polynomial));
1057
1058 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1059 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildMul(bld->builder, mant, bld->one, ""), "");
1060
1061 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1062 }
1063
1064 if(p_exp)
1065 *p_exp = exp;
1066
1067 if(p_floor_log2)
1068 *p_floor_log2 = logexp;
1069
1070 if(p_log2)
1071 *p_log2 = res;
1072 }
1073
1074
1075 LLVMValueRef
1076 lp_build_log2(struct lp_build_context *bld,
1077 LLVMValueRef x)
1078 {
1079 LLVMValueRef res;
1080 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1081 return res;
1082 }