gallivm: Tell the JIT engine to use our sinf()/cosf() on Windows.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_init.h" /* for lp_build_engine */
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
62
63
64 /**
65 * Generate min(a, b)
66 * No checks for special case values of a or b = 1 or 0 are done.
67 */
68 static LLVMValueRef
69 lp_build_min_simple(struct lp_build_context *bld,
70 LLVMValueRef a,
71 LLVMValueRef b)
72 {
73 const struct lp_type type = bld->type;
74 const char *intrinsic = NULL;
75 LLVMValueRef cond;
76
77 /* TODO: optimize the constant case */
78
79 if(type.width * type.length == 128) {
80 if(type.floating) {
81 if(type.width == 32 && util_cpu_caps.has_sse)
82 intrinsic = "llvm.x86.sse.min.ps";
83 if(type.width == 64 && util_cpu_caps.has_sse2)
84 intrinsic = "llvm.x86.sse2.min.pd";
85 }
86 else {
87 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
88 intrinsic = "llvm.x86.sse2.pminu.b";
89 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
90 intrinsic = "llvm.x86.sse41.pminsb";
91 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
92 intrinsic = "llvm.x86.sse41.pminuw";
93 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
94 intrinsic = "llvm.x86.sse2.pmins.w";
95 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminud";
97 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
98 intrinsic = "llvm.x86.sse41.pminsd";
99 }
100 }
101
102 if(intrinsic)
103 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
104
105 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
106 return lp_build_select(bld, cond, a, b);
107 }
108
109
110 /**
111 * Generate max(a, b)
112 * No checks for special case values of a or b = 1 or 0 are done.
113 */
114 static LLVMValueRef
115 lp_build_max_simple(struct lp_build_context *bld,
116 LLVMValueRef a,
117 LLVMValueRef b)
118 {
119 const struct lp_type type = bld->type;
120 const char *intrinsic = NULL;
121 LLVMValueRef cond;
122
123 /* TODO: optimize the constant case */
124
125 if(type.width * type.length == 128) {
126 if(type.floating) {
127 if(type.width == 32 && util_cpu_caps.has_sse)
128 intrinsic = "llvm.x86.sse.max.ps";
129 if(type.width == 64 && util_cpu_caps.has_sse2)
130 intrinsic = "llvm.x86.sse2.max.pd";
131 }
132 else {
133 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
134 intrinsic = "llvm.x86.sse2.pmaxu.b";
135 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
136 intrinsic = "llvm.x86.sse41.pmaxsb";
137 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
138 intrinsic = "llvm.x86.sse41.pmaxuw";
139 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
140 intrinsic = "llvm.x86.sse2.pmaxs.w";
141 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
142 intrinsic = "llvm.x86.sse41.pmaxud";
143 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
144 intrinsic = "llvm.x86.sse41.pmaxsd";
145 }
146 }
147
148 if(intrinsic)
149 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
150
151 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
152 return lp_build_select(bld, cond, a, b);
153 }
154
155
156 /**
157 * Generate 1 - a, or ~a depending on bld->type.
158 */
159 LLVMValueRef
160 lp_build_comp(struct lp_build_context *bld,
161 LLVMValueRef a)
162 {
163 const struct lp_type type = bld->type;
164
165 if(a == bld->one)
166 return bld->zero;
167 if(a == bld->zero)
168 return bld->one;
169
170 if(type.norm && !type.floating && !type.fixed && !type.sign) {
171 if(LLVMIsConstant(a))
172 return LLVMConstNot(a);
173 else
174 return LLVMBuildNot(bld->builder, a, "");
175 }
176
177 if(LLVMIsConstant(a))
178 return LLVMConstSub(bld->one, a);
179 else
180 return LLVMBuildSub(bld->builder, bld->one, a, "");
181 }
182
183
/**
 * Generate a + b.
 *
 * For normalized types the result saturates: 8/16-bit 128-bit integer
 * vectors use the SSE2 saturated-add intrinsics, while float/fixed norm
 * types are clamped to 1.0 after the add.
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   /* Trivial identities. */
   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Saturated arithmetic: anything plus the maximum stays at the
       * maximum. */
      if(a == bld->one || b == bld->one)
         return bld->one;

      /* SSE2 saturated add for 128-bit integer vectors of 8/16-bit
       * elements. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstAdd(a, b);
   else
      res = LLVMBuildAdd(bld->builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
234
235
236 /** Return the sum of the elements of a */
237 LLVMValueRef
238 lp_build_sum_vector(struct lp_build_context *bld,
239 LLVMValueRef a)
240 {
241 const struct lp_type type = bld->type;
242 LLVMValueRef index, res;
243 int i;
244
245 if (a == bld->zero)
246 return bld->zero;
247 if (a == bld->undef)
248 return bld->undef;
249 assert(type.length > 1);
250
251 assert(!bld->type.norm);
252
253 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
254 res = LLVMBuildExtractElement(bld->builder, a, index, "");
255
256 for (i = 1; i < type.length; i++) {
257 index = LLVMConstInt(LLVMInt32Type(), i, 0);
258 res = LLVMBuildAdd(bld->builder, res,
259 LLVMBuildExtractElement(bld->builder, a, index, ""),
260 "");
261 }
262
263 return res;
264 }
265
266
/**
 * Generate a - b.
 *
 * For normalized types the result saturates: 8/16-bit 128-bit integer
 * vectors use the SSE2 saturated-subtract intrinsics, while float/fixed
 * norm types are clamped to 0 after the subtract.
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   /* Trivial identities. */
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      /* Subtracting the maximum from a norm value is treated as yielding
       * zero. */
      if(b == bld->one)
         return bld->zero;

      /* SSE2 saturated subtract for 128-bit integer vectors of 8/16-bit
       * elements. */
      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      res = LLVMConstSub(a, b);
   else
      res = LLVMBuildSub(bld->builder, a, b, "");

   /* Clamp to floor of 0 for norm float/fixed types. */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}
314
315
316 /**
317 * Normalized 8bit multiplication.
318 *
319 * - alpha plus one
320 *
321 * makes the following approximation to the division (Sree)
322 *
323 * a*b/255 ~= (a*(b + 1)) >> 8
324 *
325 * which is the fastest method that satisfies the following OpenGL criteria
326 *
327 * 0*0 = 0 and 255*255 = 255
328 *
329 * - geometric series
330 *
331 * takes the geometric series approximation to the division
332 *
333 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
334 *
335 * in this case just the first two terms to fit in 16bit arithmetic
336 *
337 * t/255 ~= (t + (t >> 8)) >> 8
338 *
339 * note that just by itself it doesn't satisfy the OpenGL criteria, as
340 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
341 * must be used
342 *
343 * - geometric series plus rounding
344 *
345 * when using a geometric series division instead of truncating the result
346 * use roundoff in the approximation (Jim Blinn)
347 *
348 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
349 *
350 * achieving the exact results
351 *
352 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
353 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
354 * @sa Michael Herf, The "double blend trick", May 2000,
355 * http://www.stereopsis.com/doubleblend.html
356 */
/**
 * Helper for normalized 8-bit multiply: operates on values already widened
 * to 16-bit lanes (see the large comment above for the derivation of the
 * division-by-255 approximations used here).
 */
static LLVMValueRef
lp_build_mul_u8n(LLVMBuilderRef builder,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef c8;
   LLVMValueRef ab;

   c8 = lp_build_const_int_vec(i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");

#endif

   /* Final shift back down to the 8-bit range. */
   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}
386
387
/**
 * Generate a * b.
 *
 * Normalized 8-bit integer types use a widen/multiply-with-rounding/pack
 * sequence; fixed point types shift the product back into range.
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   /* Trivial identities. */
   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         /* Widen each half to 16-bit lanes, multiply with rounding,
          * then pack back down to 8 bits. */
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);

         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME: only the 8-bit normalized multiply is implemented. */
      assert(0);
   }

   /* Fixed point: the double-width product must be shifted right by half
    * the type width afterwards. */
   if(type.fixed)
      shift = lp_build_const_int_vec(type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      res = LLVMBuildMul(bld->builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(bld->builder, res, shift, "");
         else
            res = LLVMBuildLShr(bld->builder, res, shift, "");
      }
   }

   return res;
}
458
459
/**
 * Small vector x scale multiplication optimization.
 *
 * Special-cases multiplication by small integer constants (0, 1, -1, 2,
 * powers of two); anything else falls back to a full lp_build_mul() with
 * a splatted constant.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMValueRef factor;

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return LLVMBuildNeg(bld->builder, a, "");

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_pot(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the mantissa.
          *
          * XXX: This might not be always faster, it will introduce a small error
          * for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(bld->builder, a, factor, "");
         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
         return a;
#endif
      }
      else {
         /* Integer power of two: the multiply becomes a left shift. */
         factor = lp_build_const_vec(bld->type, shift);
         return LLVMBuildShl(bld->builder, a, factor, "");
      }
   }

   /* General case: splat the constant and do a full multiply. */
   factor = lp_build_const_vec(bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
511
512
/**
 * Generate a / b.
 *
 * On SSE-capable 4x32 float types this is implemented as a multiply by
 * the (approximate) reciprocal rather than a true divide.
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;

   /* Trivial and degenerate cases. */
   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   /* Fold at build time when both operands are constants. */
   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      return LLVMConstFDiv(a, b);

   /* a * rcp(b) — trades precision for speed on SSE. */
   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   return LLVMBuildFDiv(bld->builder, a, b, "");
}
542
543
/**
 * Linear interpolation: computes v0 + x*(v1 - v0).
 *
 * This also works for integer values with a few caveats.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMValueRef delta;
   LLVMValueRef res;

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if(bld->type.fixed)
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");

   return res;
}
575
576
577 LLVMValueRef
578 lp_build_lerp_2d(struct lp_build_context *bld,
579 LLVMValueRef x,
580 LLVMValueRef y,
581 LLVMValueRef v00,
582 LLVMValueRef v01,
583 LLVMValueRef v10,
584 LLVMValueRef v11)
585 {
586 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
587 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
588 return lp_build_lerp(bld, y, v0, v1);
589 }
590
591
592 /**
593 * Generate min(a, b)
594 * Do checks for special cases.
595 */
596 LLVMValueRef
597 lp_build_min(struct lp_build_context *bld,
598 LLVMValueRef a,
599 LLVMValueRef b)
600 {
601 if(a == bld->undef || b == bld->undef)
602 return bld->undef;
603
604 if(a == b)
605 return a;
606
607 if(bld->type.norm) {
608 if(a == bld->zero || b == bld->zero)
609 return bld->zero;
610 if(a == bld->one)
611 return b;
612 if(b == bld->one)
613 return a;
614 }
615
616 return lp_build_min_simple(bld, a, b);
617 }
618
619
620 /**
621 * Generate max(a, b)
622 * Do checks for special cases.
623 */
624 LLVMValueRef
625 lp_build_max(struct lp_build_context *bld,
626 LLVMValueRef a,
627 LLVMValueRef b)
628 {
629 if(a == bld->undef || b == bld->undef)
630 return bld->undef;
631
632 if(a == b)
633 return a;
634
635 if(bld->type.norm) {
636 if(a == bld->one || b == bld->one)
637 return bld->one;
638 if(a == bld->zero)
639 return b;
640 if(b == bld->zero)
641 return a;
642 }
643
644 return lp_build_max_simple(bld, a, b);
645 }
646
647
648 /**
649 * Generate clamp(a, min, max)
650 * Do checks for special cases.
651 */
652 LLVMValueRef
653 lp_build_clamp(struct lp_build_context *bld,
654 LLVMValueRef a,
655 LLVMValueRef min,
656 LLVMValueRef max)
657 {
658 a = lp_build_min(bld, a, max);
659 a = lp_build_max(bld, a, min);
660 return a;
661 }
662
663
/**
 * Generate abs(a).
 *
 * Floats clear the sign bit via integer bit manipulation; 128-bit integer
 * vectors use SSSE3 PABS* when available; everything else falls back to
 * max(a, -a).
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);

   /* Unsigned values are their own absolute value. */
   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      if (type.length == 1) {
         /* Scalar float: bitcast to int, clear the top bit, cast back. */
         LLVMTypeRef int_type = LLVMIntType(type.width);
         LLVMTypeRef float_type = LLVMFloatType();
         unsigned long long absMask = ~(1ULL << (type.width - 1));
         LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0);
         a = LLVMBuildBitCast(bld->builder, a, int_type, "");
         a = LLVMBuildAnd(bld->builder, a, mask, "");
         a = LLVMBuildBitCast(bld->builder, a, float_type, "");
         return a;
      }
      else {
         /* vector of floats */
         LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
         unsigned long long absMask = ~(1ULL << (type.width - 1));
         LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
         a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
         a = LLVMBuildAnd(bld->builder, a, mask, "");
         a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
         return a;
      }
   }

   /* Integer case: use an SSSE3 PABS* when the vector fits in 128 bits. */
   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   /* Generic fallback: max(a, -a). */
   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
}
714
715
716 LLVMValueRef
717 lp_build_negate(struct lp_build_context *bld,
718 LLVMValueRef a)
719 {
720 return LLVMBuildNeg(bld->builder, a, "");
721 }
722
723
/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   /* Handle non-zero case first; the zero case is patched up with a
    * select at the end. */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      if (type.length == 1) {
         int_type = lp_build_int_elem_type(type);
         vec_type = lp_build_elem_type(type);
         mask = LLVMConstInt(int_type, maskBit, 0);
      }
      else {
         /* vector */
         int_type = lp_build_int_vec_type(type);
         vec_type = lp_build_vec_type(type);
         mask = lp_build_const_int_vec(type, maskBit);
      }

      /* Take the sign bit and OR it onto the constant 1.0, producing
       * +1.0 or -1.0. */
      sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(bld->builder, sign, one, "");
      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
   }
   else
   {
      /* Signed integer/fixed: compare against zero and select +1 / -1. */
      LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}
778
779
/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a);
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   /* Shift count that moves bit 0 of 'sign' into the sign-bit position. */
   LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
   /* All bits set except the sign bit. */
   LLVMValueRef mask = lp_build_const_int_vec(type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
   /* val = val & mask  (clear the existing sign bit) */
   val = LLVMBuildAnd(bld->builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(bld->builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(bld->builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(bld->builder, res, vec_type, "");

   return res;
}
813
814
815 /**
816 * Convert vector of (or scalar) int to vector of (or scalar) float.
817 */
818 LLVMValueRef
819 lp_build_int_to_float(struct lp_build_context *bld,
820 LLVMValueRef a)
821 {
822 const struct lp_type type = bld->type;
823
824 assert(type.floating);
825 /*assert(lp_check_value(type, a));*/
826
827 if (type.length == 1) {
828 LLVMTypeRef float_type = LLVMFloatType();
829 return LLVMBuildSIToFP(bld->builder, a, float_type, "");
830 }
831 else {
832 LLVMTypeRef vec_type = lp_build_vec_type(type);
833 /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
834 LLVMValueRef res;
835 res = LLVMBuildSIToFP(bld->builder, a, vec_type, "");
836 return res;
837 }
838 }
839
840
841
/**
 * Rounding modes for lp_build_round_sse41(); the numeric values are passed
 * straight through as the rounding-control immediate of the SSE4.1 round
 * intrinsics.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,   /* round to nearest */
   LP_BUILD_ROUND_SSE41_FLOOR = 1,     /* round toward -infinity */
   LP_BUILD_ROUND_SSE41_CEIL = 2,      /* round toward +infinity */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3   /* round toward zero */
};
849
850
/**
 * Round a 128-bit float vector with a single SSE4.1 round intrinsic.
 *
 * @param mode  rounding mode, forwarded as the intrinsic's
 *              rounding-control immediate.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   const char *intrinsic;

   assert(type.floating);
   assert(type.width*type.length == 128);
   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   switch(type.width) {
   case 32:
      intrinsic = "llvm.x86.sse41.round.ps";
      break;
   case 64:
      intrinsic = "llvm.x86.sse41.round.pd";
      break;
   default:
      assert(0);
      return bld->undef;
   }

   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
}
880
881
882 LLVMValueRef
883 lp_build_trunc(struct lp_build_context *bld,
884 LLVMValueRef a)
885 {
886 const struct lp_type type = bld->type;
887
888 assert(type.floating);
889 assert(lp_check_value(type, a));
890
891 if(util_cpu_caps.has_sse4_1)
892 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
893 else {
894 LLVMTypeRef vec_type = lp_build_vec_type(type);
895 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
896 LLVMValueRef res;
897 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
898 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
899 return res;
900 }
901 }
902
903
904 LLVMValueRef
905 lp_build_round(struct lp_build_context *bld,
906 LLVMValueRef a)
907 {
908 const struct lp_type type = bld->type;
909
910 assert(type.floating);
911 assert(lp_check_value(type, a));
912
913 if(util_cpu_caps.has_sse4_1)
914 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
915 else {
916 LLVMTypeRef vec_type = lp_build_vec_type(type);
917 LLVMValueRef res;
918 res = lp_build_iround(bld, a);
919 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
920 return res;
921 }
922 }
923
924
/** Compute floor(a), keeping the float type. */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);

   if (type.length == 1) {
      /* Scalar: go through ifloor and convert back to float. */
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, LLVMFloatType(), "");
      return res;
   }

   if(util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   else {
      /* Fallback: integer floor then convert back to a float vector. */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}
950
951
952 LLVMValueRef
953 lp_build_ceil(struct lp_build_context *bld,
954 LLVMValueRef a)
955 {
956 const struct lp_type type = bld->type;
957
958 assert(type.floating);
959 assert(lp_check_value(type, a));
960
961 if(util_cpu_caps.has_sse4_1)
962 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
963 else {
964 LLVMTypeRef vec_type = lp_build_vec_type(type);
965 LLVMValueRef res;
966 res = lp_build_iceil(bld, a);
967 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
968 return res;
969 }
970 }
971
972
973 /**
974 * Return fractional part of 'a' computed as a - floor(f)
975 * Typically used in texture coord arithmetic.
976 */
977 LLVMValueRef
978 lp_build_fract(struct lp_build_context *bld,
979 LLVMValueRef a)
980 {
981 assert(bld->type.floating);
982 return lp_build_sub(bld, a, lp_build_floor(bld, a));
983 }
984
985
986 /**
987 * Convert to integer, through whichever rounding method that's fastest,
988 * typically truncating toward zero.
989 */
990 LLVMValueRef
991 lp_build_itrunc(struct lp_build_context *bld,
992 LLVMValueRef a)
993 {
994 const struct lp_type type = bld->type;
995
996 assert(type.floating);
997
998 if (type.length == 1) {
999 LLVMTypeRef int_type = LLVMIntType(type.width);
1000 return LLVMBuildFPToSI(bld->builder, a, int_type, "");
1001 }
1002 else {
1003 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1004 assert(lp_check_value(type, a));
1005 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1006 }
1007 }
1008
1009
/**
 * Convert float[] to int[] with round().
 *
 * Without SSE4.1 the vector path emulates rounding by adding +/-0.5
 * (matching each element's sign) before the truncating conversion.
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);

   if (type.length == 1) {
      /* scalar float to int */
      LLVMTypeRef int_type = LLVMIntType(type.width);
      /* XXX we want rounding here! (FPToSI truncates toward zero) */
      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
      return res;
   }

   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef half;

      /* get sign bit */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");

      /* half = 0.5 carrying the same sign as a, so the final truncating
       * conversion rounds to nearest */
      half = lp_build_const_vec(type, 0.5);
      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
      half = LLVMBuildOr(bld->builder, sign, half, "");
      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");

      res = LLVMBuildAdd(bld->builder, a, half, "");
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}
1059
1060
/**
 * Convert float[] to int[] with floor().
 *
 * Without SSE4.1 the vector path emulates floor by adding -0.99999(9)
 * to negative elements before the truncating conversion.
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);

   if (type.length == 1) {
      /* scalar float to int */
      LLVMTypeRef int_type = LLVMIntType(type.width);
      /* NOTE(review): FPToSI truncates toward zero, so this scalar path is
       * only a true floor() for non-negative inputs — same caveat as the
       * XXX in lp_build_iround(). */
      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
      return res;
   }

   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      /* Take the sign bit and add it to 1 constant */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef offset;

      /* sign = a < 0 ? ~0 : 0 */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
      lp_build_name(sign, "floor.sign");

      /* offset = -0.99999(9)f */
      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
      offset = LLVMConstBitCast(offset, int_vec_type);

      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
      lp_build_name(offset, "floor.offset");

      res = LLVMBuildAdd(bld->builder, a, offset, "");
      lp_build_name(res, "floor.res");
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
   lp_build_name(res, "floor");

   return res;
}
1118
1119
/**
 * Convert float[] to int[] with ceil().
 *
 * Only implemented for SSE4.1; asserts (and yields undef) otherwise.
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      /* FIXME: no non-SSE4.1 fallback path implemented yet. */
      assert(0);
      res = bld->undef;
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}
1143
1144
1145 LLVMValueRef
1146 lp_build_sqrt(struct lp_build_context *bld,
1147 LLVMValueRef a)
1148 {
1149 const struct lp_type type = bld->type;
1150 LLVMTypeRef vec_type = lp_build_vec_type(type);
1151 char intrinsic[32];
1152
1153 /* TODO: optimize the constant case */
1154 /* TODO: optimize the constant case */
1155
1156 assert(type.floating);
1157 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1158
1159 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1160 }
1161
1162
1163 LLVMValueRef
1164 lp_build_rcp(struct lp_build_context *bld,
1165 LLVMValueRef a)
1166 {
1167 const struct lp_type type = bld->type;
1168
1169 if(a == bld->zero)
1170 return bld->undef;
1171 if(a == bld->one)
1172 return bld->one;
1173 if(a == bld->undef)
1174 return bld->undef;
1175
1176 assert(type.floating);
1177
1178 if(LLVMIsConstant(a))
1179 return LLVMConstFDiv(bld->one, a);
1180
1181 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1182 /*
1183 * XXX: Added precision is not always necessary, so only enable this
1184 * when we have a better system in place to track minimum precision.
1185 */
1186
1187 #if 0
1188 /*
1189 * Do one Newton-Raphson step to improve precision:
1190 *
1191 * x1 = (2 - a * rcp(a)) * rcp(a)
1192 */
1193
1194 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1195 LLVMValueRef rcp_a;
1196 LLVMValueRef res;
1197
1198 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1199
1200 res = LLVMBuildMul(bld->builder, a, rcp_a, "");
1201 res = LLVMBuildSub(bld->builder, two, res, "");
1202 res = LLVMBuildMul(bld->builder, res, rcp_a, "");
1203
1204 return rcp_a;
1205 #else
1206 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1207 #endif
1208 }
1209
1210 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1211 }
1212
1213
1214 /**
1215 * Generate 1/sqrt(a)
1216 */
1217 LLVMValueRef
1218 lp_build_rsqrt(struct lp_build_context *bld,
1219 LLVMValueRef a)
1220 {
1221 const struct lp_type type = bld->type;
1222
1223 assert(type.floating);
1224
1225 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1226 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1227
1228 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1229 }
1230
1231
1232 #ifdef PIPE_OS_WINDOWS
1233
/*
 * XXX: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf()
 * which is neither efficient nor does the CRT linkage work on Windows
 * causing segmentation fault.
 *
 * XXX: With LLVM 2.7 both schemes cause an assertion failure.
 */
/**
 * Emit element-wise calls to a scalar C runtime function (e.g. sinf/cosf).
 *
 * Declares "float name(float)" in the current module (once), maps that
 * declaration to the actual host function pointer in the JIT engine, and
 * then extracts each element of 'a', calls the function, and re-inserts
 * the result into an output vector.
 *
 * 'name' is the symbol name to declare; 'func' is the host-side function
 * the JIT resolves it to.  Only 32-bit float types are supported.
 */
static LLVMValueRef
lp_build_sincos(struct lp_build_context *bld,
                const char *name,
                float (*func)(float),
                LLVMValueRef a)
{
   /* Recover the module from the builder's current insertion point */
   LLVMModuleRef module =
      LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld->builder)));
   LLVMValueRef function;
   LLVMValueRef res;
   unsigned i;

   assert(bld->type.floating);
   assert(bld->type.width == 32);

   /* Declare the function in the module only if not already present */
   function = LLVMGetNamedFunction(module, name);
   if (!function) {
      LLVMTypeRef ret_type;
      LLVMTypeRef arg_types[1];
      LLVMTypeRef function_type;

      ret_type = LLVMFloatType();
      arg_types[0] = LLVMFloatType();
      function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
      function = LLVMAddFunction(module, name, function_type);

      LLVMSetFunctionCallConv(function, LLVMCCallConv);
      /* NOTE(review): private linkage on a symbol that is resolved through
       * the engine's global mapping looks unusual -- confirm the JIT still
       * honors the mapping with this linkage. */
      LLVMSetLinkage(function, LLVMPrivateLinkage);

      assert(LLVMIsDeclaration(function));

      /* Bind the declaration to the host function pointer for the JIT */
      LLVMAddGlobalMapping(lp_build_engine, function, func);
   }

   /* Start from undef and fill in one lane per scalar call */
   res = bld->undef;

   for (i = 0; i < bld->type.length; ++i) {
      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
      LLVMValueRef args[1];
      LLVMValueRef tmp;

      args[0] = LLVMBuildExtractElement(bld->builder, a, index, "");

      tmp = LLVMBuildCall(bld->builder, function, args, Elements(args), "");

      res = LLVMBuildInsertElement(bld->builder, res, tmp, index, "");
   }

   return res;
}
1291
/**
 * Generate cos(a), element-wise, through the CRT's cosf() -- see the
 * lp_build_sincos() comment above for why the llvm.cos intrinsic is
 * avoided on Windows.
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sincos(bld, "cosf", &cosf, a);
}
1298
/**
 * Generate sin(a), element-wise, through the CRT's sinf() -- see the
 * lp_build_sincos() comment above for why the llvm.sin intrinsic is
 * avoided on Windows.
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sincos(bld, "sinf", &sinf, a);
}
1305
1306 #else /* !PIPE_OS_WINDOWS */
1307
1308 /**
1309 * Generate cos(a)
1310 */
1311 LLVMValueRef
1312 lp_build_cos(struct lp_build_context *bld,
1313 LLVMValueRef a)
1314 {
1315 const struct lp_type type = bld->type;
1316 LLVMTypeRef vec_type = lp_build_vec_type(type);
1317 char intrinsic[32];
1318
1319 /* TODO: optimize the constant case */
1320
1321 assert(type.floating);
1322 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
1323
1324 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1325 }
1326
1327
1328 /**
1329 * Generate sin(a)
1330 */
1331 LLVMValueRef
1332 lp_build_sin(struct lp_build_context *bld,
1333 LLVMValueRef a)
1334 {
1335 const struct lp_type type = bld->type;
1336 LLVMTypeRef vec_type = lp_build_vec_type(type);
1337 char intrinsic[32];
1338
1339 /* TODO: optimize the constant case */
1340
1341 assert(type.floating);
1342 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
1343
1344 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1345 }
1346
1347 #endif /* !PIPE_OS_WINDOWS */
1348
1349
1350 /**
1351 * Generate pow(x, y)
1352 */
1353 LLVMValueRef
1354 lp_build_pow(struct lp_build_context *bld,
1355 LLVMValueRef x,
1356 LLVMValueRef y)
1357 {
1358 /* TODO: optimize the constant case */
1359 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1360 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1361 __FUNCTION__);
1362
1363 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1364 }
1365
1366
1367 /**
1368 * Generate exp(x)
1369 */
1370 LLVMValueRef
1371 lp_build_exp(struct lp_build_context *bld,
1372 LLVMValueRef x)
1373 {
1374 /* log2(e) = 1/log(2) */
1375 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1376
1377 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1378 }
1379
1380
1381 /**
1382 * Generate log(x)
1383 */
1384 LLVMValueRef
1385 lp_build_log(struct lp_build_context *bld,
1386 LLVMValueRef x)
1387 {
1388 /* log(2) */
1389 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1390
1391 return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1392 }
1393
1394
1395 #define EXP_POLY_DEGREE 3
1396 #define LOG_POLY_DEGREE 5
1397
1398
1399 /**
1400 * Generate polynomial.
1401 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1402 */
1403 static LLVMValueRef
1404 lp_build_polynomial(struct lp_build_context *bld,
1405 LLVMValueRef x,
1406 const double *coeffs,
1407 unsigned num_coeffs)
1408 {
1409 const struct lp_type type = bld->type;
1410 LLVMTypeRef float_type = LLVMFloatType();
1411 LLVMValueRef res = NULL;
1412 unsigned i;
1413
1414 /* TODO: optimize the constant case */
1415 if(LLVMIsConstant(x))
1416 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1417 __FUNCTION__);
1418
1419 for (i = num_coeffs; i--; ) {
1420 LLVMValueRef coeff;
1421
1422 if (type.length == 1)
1423 coeff = LLVMConstReal(float_type, coeffs[i]);
1424 else
1425 coeff = lp_build_const_vec(type, coeffs[i]);
1426
1427 if(res)
1428 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1429 else
1430 res = coeff;
1431 }
1432
1433 if(res)
1434 return res;
1435 else
1436 return bld->undef;
1437 }
1438
1439
/**
 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
 *
 * The degree in use is selected at compile time via EXP_POLY_DEGREE.
 * Coefficients are ordered lowest degree first, as consumed by
 * lp_build_polynomial().
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
#elif EXP_POLY_DEGREE == 4
   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
#elif EXP_POLY_DEGREE == 3
   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
#elif EXP_POLY_DEGREE == 2
   1.0017247, 6.5763628e-1, 3.3718944e-1
#else
#error
#endif
};
1456
1457
/**
 * Compute an approximation of 2^x, with optional byproducts.
 *
 * Only 32-bit float types are supported (asserted below).  x is clamped
 * to [-126.99999, 129.0] so the biased exponent built below stays valid.
 *
 * p_exp2_int_part: if non-NULL, receives 2^ipart as a float vector,
 *                  built by stuffing (ipart + 127) into the exponent bits.
 * p_frac_part:     if non-NULL, receives fpart = x - ipart.
 * p_exp2:          if non-NULL, receives 2^ipart * poly(fpart) ~= 2^x.
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      /* Clamp so that ipart + 127 below remains a valid biased exponent */
      x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));

      /* ipart = int(x - 0.5) */
      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_vec(type, 0.5f), "");
      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");

      /* fpart = x - ipart */
      fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
      fpart = LLVMBuildSub(bld->builder, x, fpart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart), by placing the biased exponent
       * (ipart + 127) directly into the float's exponent field (bit 23) */
      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via the minimax polynomial, then scale by 2^ipart */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
1517
1518
1519 LLVMValueRef
1520 lp_build_exp2(struct lp_build_context *bld,
1521 LLVMValueRef x)
1522 {
1523 LLVMValueRef res;
1524 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1525 return res;
1526 }
1527
1528
/**
 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 *
 * The degree in use is selected at compile time via LOG_POLY_DEGREE;
 * coefficients are ordered lowest degree first, as consumed by
 * lp_build_polynomial().
 *
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 6
   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
#elif LOG_POLY_DEGREE == 5
   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
#elif LOG_POLY_DEGREE == 4
   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
#elif LOG_POLY_DEGREE == 3
   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
#else
#error
#endif
};
1547
1548
/**
 * Compute an approximation of log2(x), with optional byproducts.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 *
 * Only 32-bit float types are supported (asserted below).  The input is
 * decomposed through its IEEE-754 bit pattern: 0x7f800000 masks the
 * exponent bits, 0x007fffff the mantissa bits.
 *
 * p_exp:        if non-NULL, receives the raw (still biased, still
 *               shifted) exponent bits of x, bitcast back to float.
 * p_floor_log2: if non-NULL, receives (float)(exponent(x) - 127),
 *               i.e. floor(log2(x)) for normalized inputs.
 * p_log2:       if non-NULL, receives floor_log2 plus a polynomial
 *               correction of the mantissa, approximating log2(x).
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      /* Reinterpret the float bits as an integer vector */
      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* Unbias: shift the exponent field down and subtract 127 */
      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x): OR in 1.0's exponent bits so the
       * reinterpreted value lies in [1, 2[ */
      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
      mant = LLVMBuildOr(bld->builder, mant, one, "");
      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");

      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
1620
1621
/**
 * Scalar version of lp_build_log2_approx(): same IEEE-754 bit
 * decomposition and polynomial correction, but operating on a single
 * float instead of a vector.  See that function for the meaning of the
 * p_exp / p_floor_log2 / p_log2 output parameters.
 */
static void
lp_build_float_log2_approx(struct lp_build_context *bld,
                           LLVMValueRef x,
                           LLVMValueRef *p_exp,
                           LLVMValueRef *p_floor_log2,
                           LLVMValueRef *p_log2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef float_type = LLVMFloatType();
   LLVMTypeRef int_type = LLVMIntType(type.width);

   /* Exponent and mantissa bit masks of an IEEE-754 single */
   LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
   LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);

   LLVMValueRef i = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      assert(type.floating && type.width == 32);

      /* Reinterpret the float bits as an integer */
      i = LLVMBuildBitCast(bld->builder, x, int_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      /* Unbias: shift the exponent field down and subtract 127 */
      LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
      LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
      logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
      logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
      logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
   }

   if(p_log2) {
      /* mant = (float) mantissa(x): OR in 1.0's exponent bits so the
       * reinterpreted value lies in [1, 2[ */
      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
      mant = LLVMBuildOr(bld->builder, mant, one, "");
      mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");

      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");

      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(bld->builder, exp, float_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
1693
1694
1695 LLVMValueRef
1696 lp_build_log2(struct lp_build_context *bld,
1697 LLVMValueRef x)
1698 {
1699 LLVMValueRef res;
1700 if (bld->type.length == 1) {
1701 lp_build_float_log2_approx(bld, x, NULL, NULL, &res);
1702 }
1703 else {
1704 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1705 }
1706 return res;
1707 }