Merge branch 'mesa_7_6_branch'
[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_string.h"
51
52 #include "lp_bld_type.h"
53 #include "lp_bld_const.h"
54 #include "lp_bld_intr.h"
55 #include "lp_bld_logic.h"
56 #include "lp_bld_arit.h"
57
58
/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   /* TODO: optimize the constant case */

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /* Pick a native SSE min intrinsic for full 128-bit vectors.
    * NOTE(review): SSSE3/SSE4.1 intrinsics are selected with no runtime
    * CPU capability check -- confirm the target guarantees them. */
   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32)
            intrinsic = "llvm.x86.sse.min.ps";
         if(type.width == 64)
            intrinsic = "llvm.x86.sse2.min.pd";
      }
      else {
         if(type.width == 8 && !type.sign)
            intrinsic = "llvm.x86.sse2.pminu.b";
         if(type.width == 8 && type.sign)
            intrinsic = "llvm.x86.sse41.pminsb";
         if(type.width == 16 && !type.sign)
            intrinsic = "llvm.x86.sse41.pminuw";
         if(type.width == 16 && type.sign)
            intrinsic = "llvm.x86.sse2.pmins.w";
         if(type.width == 32 && !type.sign)
            intrinsic = "llvm.x86.sse41.pminud";
         if(type.width == 32 && type.sign)
            intrinsic = "llvm.x86.sse41.pminsd";
      }
   }
#endif

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   /* Generic fallback: compare-and-select */
   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}
105
106
/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   /* TODO: optimize the constant case */

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /* Pick a native SSE max intrinsic for full 128-bit vectors.
    * NOTE(review): SSSE3/SSE4.1 intrinsics are selected with no runtime
    * CPU capability check -- confirm the target guarantees them. */
   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32)
            intrinsic = "llvm.x86.sse.max.ps";
         if(type.width == 64)
            intrinsic = "llvm.x86.sse2.max.pd";
      }
      else {
         if(type.width == 8 && !type.sign)
            intrinsic = "llvm.x86.sse2.pmaxu.b";
         if(type.width == 8 && type.sign)
            intrinsic = "llvm.x86.sse41.pmaxsb";
         if(type.width == 16 && !type.sign)
            intrinsic = "llvm.x86.sse41.pmaxuw";
         if(type.width == 16 && type.sign)
            intrinsic = "llvm.x86.sse2.pmaxs.w";
         if(type.width == 32 && !type.sign)
            intrinsic = "llvm.x86.sse41.pmaxud";
         if(type.width == 32 && type.sign)
            intrinsic = "llvm.x86.sse41.pmaxsd";
      }
   }
#endif

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   /* Generic fallback: compare-and-select */
   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}
153
154
155 /**
156 * Generate 1 - a, or ~a depending on bld->type.
157 */
158 LLVMValueRef
159 lp_build_comp(struct lp_build_context *bld,
160 LLVMValueRef a)
161 {
162 const struct lp_type type = bld->type;
163
164 if(a == bld->one)
165 return bld->zero;
166 if(a == bld->zero)
167 return bld->one;
168
169 if(type.norm && !type.floating && !type.fixed && !type.sign) {
170 if(LLVMIsConstant(a))
171 return LLVMConstNot(a);
172 else
173 return LLVMBuildNot(bld->builder, a, "");
174 }
175
176 if(LLVMIsConstant(a))
177 return LLVMConstSub(bld->one, a);
178 else
179 return LLVMBuildSub(bld->builder, bld->one, a, "");
180 }
181
182
/**
 * Generate a + b, with saturation for normalized types.
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   /* Trivial operand simplifications */
   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* a + 1 saturates to 1 (assumes operands in [0, 1] -- for signed
       * norm in [-1, 1] this holds only for non-negative a; TODO confirm) */
      if(a == bld->one || b == bld->one)
         return bld->one;

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
      /* Use SSE2 saturated-add instructions for 8/16-bit integer vectors */
      if(type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }
#endif

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstAdd(a, b);
   else
      res = LLVMBuildAdd(bld->builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
234
235
/**
 * Generate a - b, with saturation for normalized types.
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   /* Trivial operand simplifications */
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* a - 1 saturates to 0 (assumes a in [0, 1] -- TODO confirm for
       * signed normalized types) */
      if(b == bld->one)
         return bld->zero;

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
      /* Use SSE2 saturated-subtract instructions for 8/16-bit integer vectors */
      if(type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }
#endif

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstSub(a, b);
   else
      res = LLVMBuildSub(bld->builder, a, b, "");

   /* clamp to floor of 0 for norm float/fixed types */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}
284
285
286 /**
287 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
288 */
289 static LLVMValueRef
290 lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
291 {
292 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
293 unsigned i, j;
294
295 assert(n <= LP_MAX_VECTOR_LENGTH);
296 assert(lo_hi < 2);
297
298 for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
299 elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
300 elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
301 }
302
303 return LLVMConstVector(elems, n);
304 }
305
306
307 /**
308 * Build constant int vector of width 'n' and value 'c'.
309 */
310 static LLVMValueRef
311 lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
312 {
313 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
314 unsigned i;
315
316 assert(n <= LP_MAX_VECTOR_LENGTH);
317
318 for(i = 0; i < n; ++i)
319 elems[i] = LLVMConstInt(type, c, 0);
320
321 return LLVMConstVector(elems, n);
322 }
323
324
/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted or roundoff
 *     must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(LLVMBuilderRef builder,
                 LLVMValueRef a, LLVMValueRef b)
{
   /* NOTE(review): constants are cached in function statics -- assumes a
    * single LLVM context and no concurrent compilation; confirm. */
   static LLVMValueRef c01 = NULL;
   static LLVMValueRef c08 = NULL;
   static LLVMValueRef c80 = NULL;
   LLVMValueRef ab;

   if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
   if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
   if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, c01, "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
   ab = LLVMBuildAdd(builder, ab, c80, "");

#endif

   /* final divide by 256 */
   ab = LLVMBuildLShr(builder, ab, c08, "");

   return ab;
}
398
399
/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;

   /* Trivial operand simplifications */
   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
      /* Normalized 8-bit multiply: unpack to 16 bits (interleave with
       * zero), multiply with rounding in 16-bit arithmetic, then repack
       * with unsigned saturation. */
      if(type.width == 8 && type.length == 16) {
         LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
         LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
         /* NOTE(review): shuffle masks cached in function statics --
          * assumes a single LLVM context / no concurrent compilation. */
         static LLVMValueRef ml = NULL;
         static LLVMValueRef mh = NULL;
         LLVMValueRef al, ah, bl, bh;
         LLVMValueRef abl, abh;
         LLVMValueRef ab;

         if(!ml) ml = lp_build_unpack_shuffle(16, 0);
         if(!mh) mh = lp_build_unpack_shuffle(16, 1);

         /* PUNPCKLBW, PUNPCKHBW */
         al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
         bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
         ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
         bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");

         /* NOP */
         al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
         bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
         ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
         bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->builder, al, bl);
         abh = lp_build_mul_u8n(bld->builder, ah, bh);

         /* PACKUSWB */
         ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , i16x8, abl, abh);

         /* NOP */
         ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");

         return ab;
      }
#endif

      /* FIXME: normalized multiply unimplemented for this type */
      assert(0);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      return LLVMConstMul(a, b);

   return LLVMBuildMul(bld->builder, a, b, "");
}
470
471
472 /**
473 * Generate a / b
474 */
475 LLVMValueRef
476 lp_build_div(struct lp_build_context *bld,
477 LLVMValueRef a,
478 LLVMValueRef b)
479 {
480 const struct lp_type type = bld->type;
481
482 if(a == bld->zero)
483 return bld->zero;
484 if(a == bld->one)
485 return lp_build_rcp(bld, b);
486 if(b == bld->zero)
487 return bld->undef;
488 if(b == bld->one)
489 return a;
490 if(a == bld->undef || b == bld->undef)
491 return bld->undef;
492
493 if(LLVMIsConstant(a) && LLVMIsConstant(b))
494 return LLVMConstFDiv(a, b);
495
496 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
497 if(type.width == 32 && type.length == 4)
498 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
499 #endif
500
501 return LLVMBuildFDiv(bld->builder, a, b, "");
502 }
503
504
505 LLVMValueRef
506 lp_build_lerp(struct lp_build_context *bld,
507 LLVMValueRef x,
508 LLVMValueRef v0,
509 LLVMValueRef v1)
510 {
511 return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
512 }
513
514
515 LLVMValueRef
516 lp_build_lerp_2d(struct lp_build_context *bld,
517 LLVMValueRef x,
518 LLVMValueRef y,
519 LLVMValueRef v00,
520 LLVMValueRef v01,
521 LLVMValueRef v10,
522 LLVMValueRef v11)
523 {
524 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
525 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
526 return lp_build_lerp(bld, y, v0, v1);
527 }
528
529
530 /**
531 * Generate min(a, b)
532 * Do checks for special cases.
533 */
534 LLVMValueRef
535 lp_build_min(struct lp_build_context *bld,
536 LLVMValueRef a,
537 LLVMValueRef b)
538 {
539 if(a == bld->undef || b == bld->undef)
540 return bld->undef;
541
542 if(a == b)
543 return a;
544
545 if(bld->type.norm) {
546 if(a == bld->zero || b == bld->zero)
547 return bld->zero;
548 if(a == bld->one)
549 return b;
550 if(b == bld->one)
551 return a;
552 }
553
554 return lp_build_min_simple(bld, a, b);
555 }
556
557
558 /**
559 * Generate max(a, b)
560 * Do checks for special cases.
561 */
562 LLVMValueRef
563 lp_build_max(struct lp_build_context *bld,
564 LLVMValueRef a,
565 LLVMValueRef b)
566 {
567 if(a == bld->undef || b == bld->undef)
568 return bld->undef;
569
570 if(a == b)
571 return a;
572
573 if(bld->type.norm) {
574 if(a == bld->one || b == bld->one)
575 return bld->one;
576 if(a == bld->zero)
577 return b;
578 if(b == bld->zero)
579 return a;
580 }
581
582 return lp_build_max_simple(bld, a, b);
583 }
584
585
586 /**
587 * Generate abs(a)
588 */
589 LLVMValueRef
590 lp_build_abs(struct lp_build_context *bld,
591 LLVMValueRef a)
592 {
593 const struct lp_type type = bld->type;
594 LLVMTypeRef vec_type = lp_build_vec_type(type);
595
596 if(!type.sign)
597 return a;
598
599 if(type.floating) {
600 /* Mask out the sign bit */
601 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
602 LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << type.width) - 1);
603 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
604 a = LLVMBuildAnd(bld->builder, a, mask, "");
605 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
606 return a;
607 }
608
609 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
610 if(type.width*type.length == 128) {
611 switch(type.width) {
612 case 8:
613 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
614 case 16:
615 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
616 case 32:
617 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
618 }
619 }
620 #endif
621
622 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
623 }
624
625
626 LLVMValueRef
627 lp_build_sgn(struct lp_build_context *bld,
628 LLVMValueRef a)
629 {
630 const struct lp_type type = bld->type;
631 LLVMTypeRef vec_type = lp_build_vec_type(type);
632 LLVMValueRef cond;
633 LLVMValueRef res;
634
635 /* Handle non-zero case */
636 if(!type.sign) {
637 /* if not zero then sign must be positive */
638 res = bld->one;
639 }
640 else if(type.floating) {
641 /* Take the sign bit and add it to 1 constant */
642 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
643 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
644 LLVMValueRef sign;
645 LLVMValueRef one;
646 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
647 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
648 one = LLVMConstBitCast(bld->one, int_vec_type);
649 res = LLVMBuildOr(bld->builder, sign, one, "");
650 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
651 }
652 else
653 {
654 LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
655 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
656 res = lp_build_select(bld, cond, bld->one, minus_one);
657 }
658
659 /* Handle zero */
660 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
661 res = lp_build_select(bld, cond, bld->zero, bld->one);
662
663 return res;
664 }
665
666
/**
 * Rounding modes, matching the immediate operand values of the SSE4.1
 * ROUNDPS/ROUNDPD instructions.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,   /* round to nearest even */
   LP_BUILD_ROUND_SSE41_FLOOR = 1,     /* round towards -inf */
   LP_BUILD_ROUND_SSE41_CEIL = 2,      /* round towards +inf */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3   /* round towards zero */
};
674
675
/**
 * Emit an SSE4.1 ROUNDPS/ROUNDPD with the given rounding mode.
 * Caller must guarantee a 128-bit floating-point vector type.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   const char *intrinsic;

   assert(type.floating);
   assert(type.width*type.length == 128);

   switch(type.width) {
   case 32:
      intrinsic = "llvm.x86.sse41.round.ps";
      break;
   case 64:
      intrinsic = "llvm.x86.sse41.round.pd";
      break;
   default:
      /* unreachable: floats are 32 or 64 bits wide */
      assert(0);
      return bld->undef;
   }

   /* The rounding mode is passed as the immediate second operand */
   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
}
703
704
/**
 * Round to nearest integer, result still as float vector.
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /* NOTE(review): SSE4.1 is used unconditionally -- there is no runtime
    * CPU feature check here; the fallback below is unreachable on x86. */
   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
#endif

   /* FIXME */
   assert(0);
   return bld->undef;
}
721
722
/**
 * Round towards -inf, result still as float vector.
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /* NOTE(review): SSE4.1 is used unconditionally -- there is no runtime
    * CPU feature check here; the fallback below is unreachable on x86. */
   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
#endif

   /* FIXME */
   assert(0);
   return bld->undef;
}
739
740
/**
 * Round towards +inf, result still as float vector.
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /* NOTE(review): SSE4.1 is used unconditionally -- there is no runtime
    * CPU feature check here; the fallback below is unreachable on x86. */
   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
#endif

   /* FIXME */
   assert(0);
   return bld->undef;
}
757
758
/**
 * Round towards zero, result still as float vector.
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /* NOTE(review): SSE4.1 is used unconditionally -- there is no runtime
    * CPU feature check here; the fallback below is unreachable on x86. */
   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
#endif

   /* FIXME */
   assert(0);
   return bld->undef;
}
775
776
777 /**
778 * Convert to integer, through whichever rounding method that's fastest,
779 * typically truncating to zero.
780 */
781 LLVMValueRef
782 lp_build_int(struct lp_build_context *bld,
783 LLVMValueRef a)
784 {
785 const struct lp_type type = bld->type;
786 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
787
788 assert(type.floating);
789
790 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
791 }
792
793
794 LLVMValueRef
795 lp_build_ifloor(struct lp_build_context *bld,
796 LLVMValueRef a)
797 {
798 a = lp_build_floor(bld, a);
799 a = lp_build_int(bld, a);
800 return a;
801 }
802
803
/**
 * Generate sqrt(a) via the generic llvm.sqrt.* intrinsic.
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   char intrinsic[32];

   /* TODO: optimize the constant case */

   assert(type.floating);
   /* e.g. "llvm.sqrt.v4f32" */
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
}
820
821
/**
 * Generate 1/a.
 */
LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   /* Trivial operand simplifications */
   if(a == bld->zero)
      return bld->undef;   /* 1/0 is undefined */
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /* XXX: is this really necessary? */
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /* NOTE(review): RCPPS is only an approximation of the reciprocal --
    * confirm its reduced precision is acceptable to all callers. */
   if(type.width == 32 && type.length == 4)
      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
#endif

   return LLVMBuildFDiv(bld->builder, bld->one, a, "");
}
848
849
/**
 * Generate 1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

   /* XXX: is this really necessary? */
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /* NOTE(review): RSQRTPS is only an approximation -- confirm its
    * reduced precision is acceptable to all callers. */
   if(type.width == 32 && type.length == 4)
      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
#endif

   /* Generic fallback: rcp(sqrt(a)) */
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}
869
870
871 /**
872 * Generate cos(a)
873 */
874 LLVMValueRef
875 lp_build_cos(struct lp_build_context *bld,
876 LLVMValueRef a)
877 {
878 const struct lp_type type = bld->type;
879 LLVMTypeRef vec_type = lp_build_vec_type(type);
880 char intrinsic[32];
881
882 /* TODO: optimize the constant case */
883
884 assert(type.floating);
885 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
886
887 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
888 }
889
890
891 /**
892 * Generate sin(a)
893 */
894 LLVMValueRef
895 lp_build_sin(struct lp_build_context *bld,
896 LLVMValueRef a)
897 {
898 const struct lp_type type = bld->type;
899 LLVMTypeRef vec_type = lp_build_vec_type(type);
900 char intrinsic[32];
901
902 /* TODO: optimize the constant case */
903
904 assert(type.floating);
905 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
906
907 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
908 }
909
910
911 /**
912 * Generate pow(x, y)
913 */
914 LLVMValueRef
915 lp_build_pow(struct lp_build_context *bld,
916 LLVMValueRef x,
917 LLVMValueRef y)
918 {
919 /* TODO: optimize the constant case */
920 if(LLVMIsConstant(x) && LLVMIsConstant(y))
921 debug_printf("%s: inefficient/imprecise constant arithmetic\n");
922
923 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
924 }
925
926
927 /**
928 * Generate exp(x)
929 */
930 LLVMValueRef
931 lp_build_exp(struct lp_build_context *bld,
932 LLVMValueRef x)
933 {
934 /* log2(e) = 1/log(2) */
935 LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
936
937 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
938 }
939
940
941 /**
942 * Generate log(x)
943 */
944 LLVMValueRef
945 lp_build_log(struct lp_build_context *bld,
946 LLVMValueRef x)
947 {
948 /* log(2) */
949 LLVMValueRef log2 = lp_build_const_scalar(bld->type, 1.4426950408889634);
950
951 return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
952 }
953
954
955 #define EXP_POLY_DEGREE 3
956 #define LOG_POLY_DEGREE 5
957
958
959 /**
960 * Generate polynomial.
961 * Ex: x^2 * coeffs[0] + x * coeffs[1] + coeffs[2].
962 */
963 static LLVMValueRef
964 lp_build_polynomial(struct lp_build_context *bld,
965 LLVMValueRef x,
966 const double *coeffs,
967 unsigned num_coeffs)
968 {
969 const struct lp_type type = bld->type;
970 LLVMValueRef res = NULL;
971 unsigned i;
972
973 /* TODO: optimize the constant case */
974 if(LLVMIsConstant(x))
975 debug_printf("%s: inefficient/imprecise constant arithmetic\n");
976
977 for (i = num_coeffs; i--; ) {
978 LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
979 if(res)
980 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
981 else
982 res = coeff;
983 }
984
985 if(res)
986 return res;
987 else
988 return bld->undef;
989 }
990
991
/**
 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
 * Coefficient count is EXP_POLY_DEGREE + 1; coeffs[0] is the constant
 * term (see lp_build_polynomial).
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
#elif EXP_POLY_DEGREE == 4
   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
#elif EXP_POLY_DEGREE == 3
   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
#elif EXP_POLY_DEGREE == 2
   1.0017247, 6.5763628e-1, 3.3718944e-1
#else
#error
#endif
};
1008
1009
1010 void
1011 lp_build_exp2_approx(struct lp_build_context *bld,
1012 LLVMValueRef x,
1013 LLVMValueRef *p_exp2_int_part,
1014 LLVMValueRef *p_frac_part,
1015 LLVMValueRef *p_exp2)
1016 {
1017 const struct lp_type type = bld->type;
1018 LLVMTypeRef vec_type = lp_build_vec_type(type);
1019 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1020 LLVMValueRef ipart = NULL;
1021 LLVMValueRef fpart = NULL;
1022 LLVMValueRef expipart = NULL;
1023 LLVMValueRef expfpart = NULL;
1024 LLVMValueRef res = NULL;
1025
1026 if(p_exp2_int_part || p_frac_part || p_exp2) {
1027 /* TODO: optimize the constant case */
1028 if(LLVMIsConstant(x))
1029 debug_printf("%s: inefficient/imprecise constant arithmetic\n");
1030
1031 assert(type.floating && type.width == 32);
1032
1033 x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
1034 x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
1035
1036 /* ipart = int(x - 0.5) */
1037 ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
1038 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1039
1040 /* fpart = x - ipart */
1041 fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
1042 fpart = LLVMBuildSub(bld->builder, x, fpart, "");
1043 }
1044
1045 if(p_exp2_int_part || p_exp2) {
1046 /* expipart = (float) (1 << ipart) */
1047 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
1048 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
1049 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1050 }
1051
1052 if(p_exp2) {
1053 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1054 Elements(lp_build_exp2_polynomial));
1055
1056 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1057 }
1058
1059 if(p_exp2_int_part)
1060 *p_exp2_int_part = expipart;
1061
1062 if(p_frac_part)
1063 *p_frac_part = fpart;
1064
1065 if(p_exp2)
1066 *p_exp2 = res;
1067 }
1068
1069
1070 LLVMValueRef
1071 lp_build_exp2(struct lp_build_context *bld,
1072 LLVMValueRef x)
1073 {
1074 LLVMValueRef res;
1075 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1076 return res;
1077 }
1078
1079
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 * Coefficient count is LOG_POLY_DEGREE; coeffs[0] is the constant term
 * (see lp_build_polynomial).
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
#else
#error
#endif
};
1098
1099
1100 /**
1101 * See http://www.devmaster.net/forums/showthread.php?p=43580
1102 */
1103 void
1104 lp_build_log2_approx(struct lp_build_context *bld,
1105 LLVMValueRef x,
1106 LLVMValueRef *p_exp,
1107 LLVMValueRef *p_floor_log2,
1108 LLVMValueRef *p_log2)
1109 {
1110 const struct lp_type type = bld->type;
1111 LLVMTypeRef vec_type = lp_build_vec_type(type);
1112 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1113
1114 LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
1115 LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
1116 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1117
1118 LLVMValueRef i = NULL;
1119 LLVMValueRef exp = NULL;
1120 LLVMValueRef mant = NULL;
1121 LLVMValueRef logexp = NULL;
1122 LLVMValueRef logmant = NULL;
1123 LLVMValueRef res = NULL;
1124
1125 if(p_exp || p_floor_log2 || p_log2) {
1126 /* TODO: optimize the constant case */
1127 if(LLVMIsConstant(x))
1128 debug_printf("%s: inefficient/imprecise constant arithmetic\n");
1129
1130 assert(type.floating && type.width == 32);
1131
1132 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1133
1134 /* exp = (float) exponent(x) */
1135 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1136 }
1137
1138 if(p_floor_log2 || p_log2) {
1139 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
1140 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
1141 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1142 }
1143
1144 if(p_log2) {
1145 /* mant = (float) mantissa(x) */
1146 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1147 mant = LLVMBuildOr(bld->builder, mant, one, "");
1148 mant = LLVMBuildSIToFP(bld->builder, mant, vec_type, "");
1149
1150 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1151 Elements(lp_build_log2_polynomial));
1152
1153 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1154 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildMul(bld->builder, mant, bld->one, ""), "");
1155
1156 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1157 }
1158
1159 if(p_exp)
1160 *p_exp = exp;
1161
1162 if(p_floor_log2)
1163 *p_floor_log2 = logexp;
1164
1165 if(p_log2)
1166 *p_log2 = res;
1167 }
1168
1169
1170 LLVMValueRef
1171 lp_build_log2(struct lp_build_context *bld,
1172 LLVMValueRef x)
1173 {
1174 LLVMValueRef res;
1175 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1176 return res;
1177 }