gallivm: Even more type checking
src/gallium/auxiliary/gallivm/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 /**
63 * Generate min(a, b)
64 * No special-case handling of a or b being 0 or 1 is done.
65 */
66 static LLVMValueRef
67 lp_build_min_simple(struct lp_build_context *bld,
68 LLVMValueRef a,
69 LLVMValueRef b)
70 {
71 const struct lp_type type = bld->type;
72 const char *intrinsic = NULL;
73 LLVMValueRef cond;
74
75 assert(lp_check_value(type, a));
76 assert(lp_check_value(type, b));
77
78 /* TODO: optimize the constant case */
79
80 if(type.width * type.length == 128) {
81 if(type.floating) {
82 if(type.width == 32 && util_cpu_caps.has_sse)
83 intrinsic = "llvm.x86.sse.min.ps";
84 if(type.width == 64 && util_cpu_caps.has_sse2)
85 intrinsic = "llvm.x86.sse2.min.pd";
86 }
87 else {
88 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
89 intrinsic = "llvm.x86.sse2.pminu.b";
90 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
91 intrinsic = "llvm.x86.sse41.pminsb";
92 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
93 intrinsic = "llvm.x86.sse41.pminuw";
94 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
95 intrinsic = "llvm.x86.sse2.pmins.w";
96 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
97 intrinsic = "llvm.x86.sse41.pminud";
98 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
99 intrinsic = "llvm.x86.sse41.pminsd";
100 }
101 }
102
103 if(intrinsic)
104 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
105
106 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
107 return lp_build_select(bld, cond, a, b);
108 }
109
110
111 /**
112 * Generate max(a, b)
113 * No special-case handling of a or b being 0 or 1 is done.
114 */
115 static LLVMValueRef
116 lp_build_max_simple(struct lp_build_context *bld,
117 LLVMValueRef a,
118 LLVMValueRef b)
119 {
120 const struct lp_type type = bld->type;
121 const char *intrinsic = NULL;
122 LLVMValueRef cond;
123
124 assert(lp_check_value(type, a));
125 assert(lp_check_value(type, b));
126
127 /* TODO: optimize the constant case */
128
129 if(type.width * type.length == 128) {
130 if(type.floating) {
131 if(type.width == 32 && util_cpu_caps.has_sse)
132 intrinsic = "llvm.x86.sse.max.ps";
133 if(type.width == 64 && util_cpu_caps.has_sse2)
134 intrinsic = "llvm.x86.sse2.max.pd";
135 }
136 else {
137 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
138 intrinsic = "llvm.x86.sse2.pmaxu.b";
139 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
140 intrinsic = "llvm.x86.sse41.pmaxsb";
141 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
142 intrinsic = "llvm.x86.sse41.pmaxuw";
143 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
144 intrinsic = "llvm.x86.sse2.pmaxs.w";
145 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
146 intrinsic = "llvm.x86.sse41.pmaxud";
147 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
148 intrinsic = "llvm.x86.sse41.pmaxsd";
149 }
150 }
151
152 if(intrinsic)
153 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
154
155 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
156 return lp_build_select(bld, cond, a, b);
157 }
158
159
160 /**
161 * Generate 1 - a, or ~a depending on bld->type.
162 */
163 LLVMValueRef
164 lp_build_comp(struct lp_build_context *bld,
165 LLVMValueRef a)
166 {
167 const struct lp_type type = bld->type;
168
169 assert(lp_check_value(type, a));
170
171 if(a == bld->one)
172 return bld->zero;
173 if(a == bld->zero)
174 return bld->one;
175
176 if(type.norm && !type.floating && !type.fixed && !type.sign) {
177 if(LLVMIsConstant(a))
178 return LLVMConstNot(a);
179 else
180 return LLVMBuildNot(bld->builder, a, "");
181 }
182
183 if(LLVMIsConstant(a))
184 if (type.floating)
185 return LLVMConstFSub(bld->one, a);
186 else
187 return LLVMConstSub(bld->one, a);
188 else
189 if (type.floating)
190 return LLVMBuildFSub(bld->builder, bld->one, a, "");
191 else
192 return LLVMBuildSub(bld->builder, bld->one, a, "");
193 }
194
195
196 /**
197 * Generate a + b
198 */
199 LLVMValueRef
200 lp_build_add(struct lp_build_context *bld,
201 LLVMValueRef a,
202 LLVMValueRef b)
203 {
204 const struct lp_type type = bld->type;
205 LLVMValueRef res;
206
207 assert(lp_check_value(type, a));
208 assert(lp_check_value(type, b));
209
210 if(a == bld->zero)
211 return b;
212 if(b == bld->zero)
213 return a;
214 if(a == bld->undef || b == bld->undef)
215 return bld->undef;
216
217 if(bld->type.norm) {
218 const char *intrinsic = NULL;
219
220 if(a == bld->one || b == bld->one)
221 return bld->one;
222
223 if(util_cpu_caps.has_sse2 &&
224 type.width * type.length == 128 &&
225 !type.floating && !type.fixed) {
226 if(type.width == 8)
227 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
228 if(type.width == 16)
229 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
230 }
231
232 if(intrinsic)
233 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
234 }
235
236 if(LLVMIsConstant(a) && LLVMIsConstant(b))
237 if (type.floating)
238 res = LLVMConstFAdd(a, b);
239 else
240 res = LLVMConstAdd(a, b);
241 else
242 if (type.floating)
243 res = LLVMBuildFAdd(bld->builder, a, b, "");
244 else
245 res = LLVMBuildAdd(bld->builder, a, b, "");
246
247 /* clamp to ceiling of 1.0 */
248 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
249 res = lp_build_min_simple(bld, res, bld->one);
250
251 /* XXX clamp to floor of -1 or 0??? */
252
253 return res;
254 }
255
256
257 /** Return the sum of the elements of a */
258 LLVMValueRef
259 lp_build_sum_vector(struct lp_build_context *bld,
260 LLVMValueRef a)
261 {
262 const struct lp_type type = bld->type;
263 LLVMValueRef index, res;
264 unsigned i;
265
266 assert(lp_check_value(type, a));
267
268 if (a == bld->zero)
269 return bld->zero;
270 if (a == bld->undef)
271 return bld->undef;
272 assert(type.length > 1);
273
274 assert(!bld->type.norm);
275
276 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
277 res = LLVMBuildExtractElement(bld->builder, a, index, "");
278
279 for (i = 1; i < type.length; i++) {
280 index = LLVMConstInt(LLVMInt32Type(), i, 0);
281 if (type.floating)
282 res = LLVMBuildFAdd(bld->builder, res,
283 LLVMBuildExtractElement(bld->builder,
284 a, index, ""),
285 "");
286 else
287 res = LLVMBuildAdd(bld->builder, res,
288 LLVMBuildExtractElement(bld->builder,
289 a, index, ""),
290 "");
291 }
292
293 return res;
294 }
295
296
297 /**
298 * Generate a - b
299 */
300 LLVMValueRef
301 lp_build_sub(struct lp_build_context *bld,
302 LLVMValueRef a,
303 LLVMValueRef b)
304 {
305 const struct lp_type type = bld->type;
306 LLVMValueRef res;
307
308 assert(lp_check_value(type, a));
309 assert(lp_check_value(type, b));
310
311 if(b == bld->zero)
312 return a;
313 if(a == bld->undef || b == bld->undef)
314 return bld->undef;
315 if(a == b)
316 return bld->zero;
317
318 if(bld->type.norm) {
319 const char *intrinsic = NULL;
320
321 if(b == bld->one)
322 return bld->zero;
323
324 if(util_cpu_caps.has_sse2 &&
325 type.width * type.length == 128 &&
326 !type.floating && !type.fixed) {
327 if(type.width == 8)
328 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
329 if(type.width == 16)
330 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
331 }
332
333 if(intrinsic)
334 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
335 }
336
337 if(LLVMIsConstant(a) && LLVMIsConstant(b))
338 if (type.floating)
339 res = LLVMConstFSub(a, b);
340 else
341 res = LLVMConstSub(a, b);
342 else
343 if (type.floating)
344 res = LLVMBuildFSub(bld->builder, a, b, "");
345 else
346 res = LLVMBuildSub(bld->builder, a, b, "");
347
348 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
349 res = lp_build_max_simple(bld, res, bld->zero);
350
351 return res;
352 }
353
354
355 /**
356 * Normalized 8bit multiplication.
357 *
358 * - alpha plus one
359 *
360 * makes the following approximation to the division (Sree)
361 *
362 * a*b/255 ~= (a*(b + 1)) >> 8
363 *
364 * which is the fastest method that satisfies the following OpenGL criteria
365 *
366 * 0*0 = 0 and 255*255 = 255
367 *
368 * - geometric series
369 *
370 * takes the geometric series approximation to the division
371 *
372 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
373 *
374 * in this case just the first two terms to fit in 16bit arithmetic
375 *
376 * t/255 ~= (t + (t >> 8)) >> 8
377 *
378 * note that by itself it doesn't satisfy the OpenGL criteria, as
379 * 255*255 = 254, so either the special case b = 255 must be accounted
380 * for, or rounding must be used
381 *
382 * - geometric series plus rounding
383 *
384 * when using a geometric series division instead of truncating the result
385 * use roundoff in the approximation (Jim Blinn)
386 *
387 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
388 *
389 * achieving exact results
390 *
391 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
392 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
393 * @sa Michael Herf, The "double blend trick", May 2000,
394 * http://www.stereopsis.com/doubleblend.html
395 */
396 static LLVMValueRef
397 lp_build_mul_u8n(LLVMBuilderRef builder,
398 struct lp_type i16_type,
399 LLVMValueRef a, LLVMValueRef b)
400 {
401 LLVMValueRef c8;
402 LLVMValueRef ab;
403
404 assert(!i16_type.floating);
405 assert(lp_check_value(i16_type, a));
406 assert(lp_check_value(i16_type, b));
407
408 c8 = lp_build_const_int_vec(i16_type, 8);
409
410 #if 0
411
412 /* a*b/255 ~= (a*(b + 1)) >> 8 */
413 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
414 ab = LLVMBuildMul(builder, a, b, "");
415
416 #else
417
418 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
419 ab = LLVMBuildMul(builder, a, b, "");
420 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
421 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
422
423 #endif
424
425 ab = LLVMBuildLShr(builder, ab, c8, "");
426
427 return ab;
428 }
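
/*
 * Worked example for the rounding variant used above, at the endpoint
 * a = b = 255:
 *
 *    t = 255*255 = 65025
 *    (t + (t >> 8) + 0x80) >> 8 = (65025 + 254 + 128) >> 8
 *                               = 65407 >> 8
 *                               = 255
 *
 * and trivially 0*0 = 0, so both OpenGL criteria hold without
 * special-casing b = 255.
 */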
429
430
431 /**
432 * Generate a * b
433 */
434 LLVMValueRef
435 lp_build_mul(struct lp_build_context *bld,
436 LLVMValueRef a,
437 LLVMValueRef b)
438 {
439 const struct lp_type type = bld->type;
440 LLVMValueRef shift;
441 LLVMValueRef res;
442
443 assert(lp_check_value(type, a));
444 assert(lp_check_value(type, b));
445
446 if(a == bld->zero)
447 return bld->zero;
448 if(a == bld->one)
449 return b;
450 if(b == bld->zero)
451 return bld->zero;
452 if(b == bld->one)
453 return a;
454 if(a == bld->undef || b == bld->undef)
455 return bld->undef;
456
457 if(!type.floating && !type.fixed && type.norm) {
458 if(type.width == 8) {
459 struct lp_type i16_type = lp_wider_type(type);
460 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
461
462 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
463 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
464
465 /* PMULLW, PSRLW, PADDW */
466 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
467 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
468
469 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
470
471 return ab;
472 }
473
474 /* FIXME */
475 assert(0);
476 }
477
478 if(type.fixed)
479 shift = lp_build_const_int_vec(type, type.width/2);
480 else
481 shift = NULL;
482
483 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
484 if (type.floating)
485 res = LLVMConstFMul(a, b);
486 else
487 res = LLVMConstMul(a, b);
488 if(shift) {
489 if(type.sign)
490 res = LLVMConstAShr(res, shift);
491 else
492 res = LLVMConstLShr(res, shift);
493 }
494 }
495 else {
496 if (type.floating)
497 res = LLVMBuildFMul(bld->builder, a, b, "");
498 else
499 res = LLVMBuildMul(bld->builder, a, b, "");
500 if(shift) {
501 if(type.sign)
502 res = LLVMBuildAShr(bld->builder, res, shift, "");
503 else
504 res = LLVMBuildLShr(bld->builder, res, shift, "");
505 }
506 }
507
508 return res;
509 }
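
/*
 * For the fixed point path above, e.g. an 8.8 format (type.width = 16,
 * shift = 8), the raw product carries twice the fractional bits and the
 * post-shift restores the format:
 *
 *    (a * 2^8) * (b * 2^8) = a*b * 2^16
 *    (a*b * 2^16) >> 8     = a*b * 2^8
 *
 * The intermediate product wraps modulo 2^type.width, so the operands
 * must be small enough for it not to overflow.
 */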
510
511
512 /**
513 * Small vector x scale multiplication optimization.
514 */
515 LLVMValueRef
516 lp_build_mul_imm(struct lp_build_context *bld,
517 LLVMValueRef a,
518 int b)
519 {
520 LLVMValueRef factor;
521
522 assert(lp_check_value(bld->type, a));
523
524 if(b == 0)
525 return bld->zero;
526
527 if(b == 1)
528 return a;
529
530 if(b == -1)
531 return lp_build_negate(bld, a);
532
533 if(b == 2 && bld->type.floating)
534 return lp_build_add(bld, a, a);
535
536 if(util_is_pot(b)) {
537 unsigned shift = ffs(b) - 1;
538
539 if(bld->type.floating) {
540 #if 0
541 /*
542 * Power of two multiplication by directly manipulating the mantissa.
543 *
544 * XXX: This might not be always faster, it will introduce a small error
545 * for multiplication by zero, and it will produce wrong results
546 * for Inf and NaN.
547 */
548 unsigned mantissa = lp_mantissa(bld->type);
549 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
550 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
551 a = LLVMBuildAdd(bld->builder, a, factor, "");
552 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
553 return a;
554 #endif
555 }
556 else {
557 factor = lp_build_const_vec(bld->type, shift);
558 return LLVMBuildShl(bld->builder, a, factor, "");
559 }
560 }
561
562 factor = lp_build_const_vec(bld->type, (double)b);
563 return lp_build_mul(bld, a, factor);
564 }
565
566
567 /**
568 * Generate a / b
569 */
570 LLVMValueRef
571 lp_build_div(struct lp_build_context *bld,
572 LLVMValueRef a,
573 LLVMValueRef b)
574 {
575 const struct lp_type type = bld->type;
576
577 assert(lp_check_value(type, a));
578 assert(lp_check_value(type, b));
579
580 if(a == bld->zero)
581 return bld->zero;
582 if(a == bld->one)
583 return lp_build_rcp(bld, b);
584 if(b == bld->zero)
585 return bld->undef;
586 if(b == bld->one)
587 return a;
588 if(a == bld->undef || b == bld->undef)
589 return bld->undef;
590
591 if(LLVMIsConstant(a) && LLVMIsConstant(b))
592 return LLVMConstFDiv(a, b);
593
594 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
595 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
596
597 return LLVMBuildFDiv(bld->builder, a, b, "");
598 }
599
600
601 /**
602 * Linear interpolation.
603 *
604 * This also works for integer values with a few caveats.
605 *
606 * @sa http://www.stereopsis.com/doubleblend.html
607 */
608 LLVMValueRef
609 lp_build_lerp(struct lp_build_context *bld,
610 LLVMValueRef x,
611 LLVMValueRef v0,
612 LLVMValueRef v1)
613 {
614 LLVMValueRef delta;
615 LLVMValueRef res;
616
617 assert(lp_check_value(bld->type, x));
618 assert(lp_check_value(bld->type, v0));
619 assert(lp_check_value(bld->type, v1));
620
621 delta = lp_build_sub(bld, v1, v0);
622
623 res = lp_build_mul(bld, x, delta);
624
625 res = lp_build_add(bld, v0, res);
626
627 if(bld->type.fixed)
628 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
629 * but it will be wrong for other uses. Basically we need a more
630 * powerful lp_type, capable of further distinguishing the values
631 * interpretation from the value storage. */
632 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
633
634 return res;
635 }
636
637
638 LLVMValueRef
639 lp_build_lerp_2d(struct lp_build_context *bld,
640 LLVMValueRef x,
641 LLVMValueRef y,
642 LLVMValueRef v00,
643 LLVMValueRef v01,
644 LLVMValueRef v10,
645 LLVMValueRef v11)
646 {
647 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
648 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
649 return lp_build_lerp(bld, y, v0, v1);
650 }
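
/*
 * Expanding the two passes above gives the usual bilinear weighting:
 *
 *    lerp_2d(x, y, v00, v01, v10, v11) =
 *       (1-x)*(1-y)*v00 + x*(1-y)*v01 + (1-x)*y*v10 + x*y*v11
 */
#if 0
   /* Minimal usage sketch, assuming 'bld' is an initialized float
    * lp_build_context and s, t are coordinates in [0, 1]: */
   LLVMValueRef texel = lp_build_lerp_2d(bld, s, t, c00, c01, c10, c11);
#endif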
651
652
653 /**
654 * Generate min(a, b)
655 * Do checks for special cases.
656 */
657 LLVMValueRef
658 lp_build_min(struct lp_build_context *bld,
659 LLVMValueRef a,
660 LLVMValueRef b)
661 {
662 assert(lp_check_value(bld->type, a));
663 assert(lp_check_value(bld->type, b));
664
665 if(a == bld->undef || b == bld->undef)
666 return bld->undef;
667
668 if(a == b)
669 return a;
670
671 if(bld->type.norm) {
672 if(a == bld->zero || b == bld->zero)
673 return bld->zero;
674 if(a == bld->one)
675 return b;
676 if(b == bld->one)
677 return a;
678 }
679
680 return lp_build_min_simple(bld, a, b);
681 }
682
683
684 /**
685 * Generate max(a, b)
686 * Do checks for special cases.
687 */
688 LLVMValueRef
689 lp_build_max(struct lp_build_context *bld,
690 LLVMValueRef a,
691 LLVMValueRef b)
692 {
693 assert(lp_check_value(bld->type, a));
694 assert(lp_check_value(bld->type, b));
695
696 if(a == bld->undef || b == bld->undef)
697 return bld->undef;
698
699 if(a == b)
700 return a;
701
702 if(bld->type.norm) {
703 if(a == bld->one || b == bld->one)
704 return bld->one;
705 if(a == bld->zero)
706 return b;
707 if(b == bld->zero)
708 return a;
709 }
710
711 return lp_build_max_simple(bld, a, b);
712 }
713
714
715 /**
716 * Generate clamp(a, min, max)
717 * Do checks for special cases.
718 */
719 LLVMValueRef
720 lp_build_clamp(struct lp_build_context *bld,
721 LLVMValueRef a,
722 LLVMValueRef min,
723 LLVMValueRef max)
724 {
725 assert(lp_check_value(bld->type, a));
726 assert(lp_check_value(bld->type, min));
727 assert(lp_check_value(bld->type, max));
728
729 a = lp_build_min(bld, a, max);
730 a = lp_build_max(bld, a, min);
731 return a;
732 }
733
734
735 /**
736 * Generate abs(a)
737 */
738 LLVMValueRef
739 lp_build_abs(struct lp_build_context *bld,
740 LLVMValueRef a)
741 {
742 const struct lp_type type = bld->type;
743 LLVMTypeRef vec_type = lp_build_vec_type(type);
744
745 assert(lp_check_value(type, a));
746
747 if(!type.sign)
748 return a;
749
750 if(type.floating) {
751 /* Mask out the sign bit */
752 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
753 unsigned long long absMask = ~(1ULL << (type.width - 1));
754 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
755 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
756 a = LLVMBuildAnd(bld->builder, a, mask, "");
757 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
758 return a;
759 }
760
761 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
762 switch(type.width) {
763 case 8:
764 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
765 case 16:
766 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
767 case 32:
768 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
769 }
770 }
771
772 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
773 }
774
775
776 LLVMValueRef
777 lp_build_negate(struct lp_build_context *bld,
778 LLVMValueRef a)
779 {
780 assert(lp_check_value(bld->type, a));
781
782 #if HAVE_LLVM >= 0x0207
783 if (bld->type.floating)
784 a = LLVMBuildFNeg(bld->builder, a, "");
785 else
786 #endif
787 a = LLVMBuildNeg(bld->builder, a, "");
788
789 return a;
790 }
791
792
793 /** Return -1, 0 or +1 depending on the sign of a */
794 LLVMValueRef
795 lp_build_sgn(struct lp_build_context *bld,
796 LLVMValueRef a)
797 {
798 const struct lp_type type = bld->type;
799 LLVMValueRef cond;
800 LLVMValueRef res;
801
802 assert(lp_check_value(type, a));
803
804 /* Handle non-zero case */
805 if(!type.sign) {
806 /* if not zero then sign must be positive */
807 res = bld->one;
808 }
809 else if(type.floating) {
810 LLVMTypeRef vec_type;
811 LLVMTypeRef int_type;
812 LLVMValueRef mask;
813 LLVMValueRef sign;
814 LLVMValueRef one;
815 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
816
817 int_type = lp_build_int_vec_type(type);
818 vec_type = lp_build_vec_type(type);
819 mask = lp_build_const_int_vec(type, maskBit);
820
821 /* Take the sign bit and add it to 1 constant */
822 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
823 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
824 one = LLVMConstBitCast(bld->one, int_type);
825 res = LLVMBuildOr(bld->builder, sign, one, "");
826 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
827 }
828 else
829 {
830 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
831 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
832 res = lp_build_select(bld, cond, bld->one, minus_one);
833 }
834
835 /* Handle zero */
836 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
837 res = lp_build_select(bld, cond, bld->zero, res);
838
839 return res;
840 }
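
/*
 * For floats, the non-zero case above simply OR's the sign bit of 'a'
 * into the constant 1.0.  E.g. for a = -2.5f:
 *
 *    sign       = 0x80000000
 *    one (1.0f) = 0x3f800000
 *    sign | one = 0xbf800000 = -1.0f
 *
 * so only the zero case needs an explicit select.
 */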
841
842
843 /**
844 * Set the sign of float vector 'a' according to 'sign'.
845 * If sign==0, return abs(a).
846 * If sign==1, return -abs(a);
847 * Other values for sign produce undefined results.
848 */
849 LLVMValueRef
850 lp_build_set_sign(struct lp_build_context *bld,
851 LLVMValueRef a, LLVMValueRef sign)
852 {
853 const struct lp_type type = bld->type;
854 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
855 LLVMTypeRef vec_type = lp_build_vec_type(type);
856 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
857 LLVMValueRef mask = lp_build_const_int_vec(type,
858 ~((unsigned long long) 1 << (type.width - 1)));
859 LLVMValueRef val, res;
860
861 assert(type.floating);
862 assert(lp_check_value(type, a));
863
864 /* val = reinterpret_cast<int>(a) */
865 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
866 /* val = val & mask */
867 val = LLVMBuildAnd(bld->builder, val, mask, "");
868 /* sign = sign << shift */
869 sign = LLVMBuildShl(bld->builder, sign, shift, "");
870 /* res = val | sign */
871 res = LLVMBuildOr(bld->builder, val, sign, "");
872 /* res = reinterpret_cast<float>(res) */
873 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
874
875 return res;
876 }
877
878
879 /**
880 * Convert vector of (or scalar) int to vector of (or scalar) float.
881 */
882 LLVMValueRef
883 lp_build_int_to_float(struct lp_build_context *bld,
884 LLVMValueRef a)
885 {
886 const struct lp_type type = bld->type;
887 LLVMTypeRef vec_type = lp_build_vec_type(type);
888
889 assert(type.floating);
890
891 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
892 }
893
894
895
896 enum lp_build_round_sse41_mode
897 {
898 LP_BUILD_ROUND_SSE41_NEAREST = 0,
899 LP_BUILD_ROUND_SSE41_FLOOR = 1,
900 LP_BUILD_ROUND_SSE41_CEIL = 2,
901 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
902 };
903
904
905 static INLINE LLVMValueRef
906 lp_build_round_sse41(struct lp_build_context *bld,
907 LLVMValueRef a,
908 enum lp_build_round_sse41_mode mode)
909 {
910 const struct lp_type type = bld->type;
911 LLVMTypeRef vec_type = lp_build_vec_type(type);
912 const char *intrinsic;
913
914 assert(type.floating);
915 assert(type.width*type.length == 128);
916 assert(lp_check_value(type, a));
917 assert(util_cpu_caps.has_sse4_1);
918
919 switch(type.width) {
920 case 32:
921 intrinsic = "llvm.x86.sse41.round.ps";
922 break;
923 case 64:
924 intrinsic = "llvm.x86.sse41.round.pd";
925 break;
926 default:
927 assert(0);
928 return bld->undef;
929 }
930
931 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
932 LLVMConstInt(LLVMInt32Type(), mode, 0));
933 }
934
935
936 /**
937 * Return the integer part of a float (vector) value. The returned value is
938 * a float (vector).
939 * Ex: trunc(-1.5) = -1.0
940 */
941 LLVMValueRef
942 lp_build_trunc(struct lp_build_context *bld,
943 LLVMValueRef a)
944 {
945 const struct lp_type type = bld->type;
946
947 assert(type.floating);
948 assert(lp_check_value(type, a));
949
950 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
951 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
952 else {
953 LLVMTypeRef vec_type = lp_build_vec_type(type);
954 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
955 LLVMValueRef res;
956 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
957 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
958 return res;
959 }
960 }
961
962
963 /**
964 * Return float (vector) rounded to nearest integer (vector). The returned
965 * value is a float (vector).
966 * Ex: round(0.9) = 1.0
967 * Ex: round(-1.5) = -2.0
968 */
969 LLVMValueRef
970 lp_build_round(struct lp_build_context *bld,
971 LLVMValueRef a)
972 {
973 const struct lp_type type = bld->type;
974
975 assert(type.floating);
976 assert(lp_check_value(type, a));
977
978 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
979 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
980 else {
981 LLVMTypeRef vec_type = lp_build_vec_type(type);
982 LLVMValueRef res;
983 res = lp_build_iround(bld, a);
984 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
985 return res;
986 }
987 }
988
989
990 /**
991 * Return floor of float (vector), result is a float (vector)
992 * Ex: floor(1.1) = 1.0
993 * Ex: floor(-1.1) = -2.0
994 */
995 LLVMValueRef
996 lp_build_floor(struct lp_build_context *bld,
997 LLVMValueRef a)
998 {
999 const struct lp_type type = bld->type;
1000
1001 assert(type.floating);
1002 assert(lp_check_value(type, a));
1003
1004 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1005 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1006 else {
1007 LLVMTypeRef vec_type = lp_build_vec_type(type);
1008 LLVMValueRef res;
1009 res = lp_build_ifloor(bld, a);
1010 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1011 return res;
1012 }
1013 }
1014
1015
1016 /**
1017 * Return ceiling of float (vector), returning float (vector).
1018 * Ex: ceil( 1.1) = 2.0
1019 * Ex: ceil(-1.1) = -1.0
1020 */
1021 LLVMValueRef
1022 lp_build_ceil(struct lp_build_context *bld,
1023 LLVMValueRef a)
1024 {
1025 const struct lp_type type = bld->type;
1026
1027 assert(type.floating);
1028 assert(lp_check_value(type, a));
1029
1030 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1031 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1032 else {
1033 LLVMTypeRef vec_type = lp_build_vec_type(type);
1034 LLVMValueRef res;
1035 res = lp_build_iceil(bld, a);
1036 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1037 return res;
1038 }
1039 }
1040
1041
1042 /**
1043 * Return fractional part of 'a' computed as a - floor(a)
1044 * Typically used in texture coord arithmetic.
1045 */
1046 LLVMValueRef
1047 lp_build_fract(struct lp_build_context *bld,
1048 LLVMValueRef a)
1049 {
1050 assert(bld->type.floating);
1051 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1052 }
1053
1054
1055 /**
1056 * Return the integer part of a float (vector) value. The returned value is
1057 * an integer (vector).
1058 * Ex: itrunc(-1.5) = -1
1059 */
1060 LLVMValueRef
1061 lp_build_itrunc(struct lp_build_context *bld,
1062 LLVMValueRef a)
1063 {
1064 const struct lp_type type = bld->type;
1065 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1066
1067 assert(type.floating);
1068 assert(lp_check_value(type, a));
1069
1070 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1071 }
1072
1073
1074 /**
1075 * Return float (vector) rounded to nearest integer (vector). The returned
1076 * value is an integer (vector).
1077 * Ex: iround(0.9) = 1
1078 * Ex: iround(-1.5) = -2
1079 */
1080 LLVMValueRef
1081 lp_build_iround(struct lp_build_context *bld,
1082 LLVMValueRef a)
1083 {
1084 const struct lp_type type = bld->type;
1085 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1086 LLVMValueRef res;
1087
1088 assert(type.floating);
1089
1090 assert(lp_check_value(type, a));
1091
1092 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1093 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1094 }
1095 else {
1096 LLVMTypeRef vec_type = lp_build_vec_type(type);
1097 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1098 LLVMValueRef sign;
1099 LLVMValueRef half;
1100
1101 /* get sign bit */
1102 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1103 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1104
1105 /* sign * 0.5 */
1106 half = lp_build_const_vec(type, 0.5);
1107 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1108 half = LLVMBuildOr(bld->builder, sign, half, "");
1109 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1110
1111 res = LLVMBuildFAdd(bld->builder, a, half, "");
1112 }
1113
1114 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1115
1116 return res;
1117 }
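
/*
 * Worked example for the non-SSE4.1 path above, with a = -1.5:
 *
 *    half     = -0.5    (sign bit of a OR'ed into 0.5)
 *    a + half = -2.0
 *    fptosi   = -2      = iround(-1.5)
 *
 * i.e. halfway cases round away from zero.
 */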
1118
1119
1120 /**
1121 * Return floor of float (vector), result is an int (vector)
1122 * Ex: ifloor(1.1) = 1
1123 * Ex: ifloor(-1.1) = -2
1124 */
1125 LLVMValueRef
1126 lp_build_ifloor(struct lp_build_context *bld,
1127 LLVMValueRef a)
1128 {
1129 const struct lp_type type = bld->type;
1130 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1131 LLVMValueRef res;
1132
1133 assert(type.floating);
1134 assert(lp_check_value(type, a));
1135
1136 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1137 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1138 }
1139 else {
1140 /* Add a small negative offset to negative values so truncation floors */
1141 LLVMTypeRef vec_type = lp_build_vec_type(type);
1142 unsigned mantissa = lp_mantissa(type);
1143 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1144 LLVMValueRef sign;
1145 LLVMValueRef offset;
1146
1147 /* sign = a < 0 ? ~0 : 0 */
1148 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1149 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1150 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1151
1152 /* offset = -0.99999(9)f */
1153 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1154 offset = LLVMConstBitCast(offset, int_vec_type);
1155
1156 /* offset = a < 0 ? offset : 0.0f */
1157 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1158 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1159
1160 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1161 }
1162
1163 /* truncate toward zero */
1164 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1165
1166 return res;
1167 }
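
/*
 * Worked example for the non-SSE4.1 path above, with a = -1.1:
 *
 *    sign       = ~0           (a is negative)
 *    offset     = -0.99999...
 *    a + offset = -2.0999...
 *    fptosi     = -2           = ifloor(-1.1)
 *
 * For a >= 0 the offset is masked to 0.0 and plain truncation already
 * equals floor.
 */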
1168
1169
1170 /**
1171 * Return ceiling of float (vector), returning int (vector).
1172 * Ex: iceil( 1.1) = 2
1173 * Ex: iceil(-1.1) = -1
1174 */
1175 LLVMValueRef
1176 lp_build_iceil(struct lp_build_context *bld,
1177 LLVMValueRef a)
1178 {
1179 const struct lp_type type = bld->type;
1180 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1181 LLVMValueRef res;
1182
1183 assert(type.floating);
1184 assert(lp_check_value(type, a));
1185
1186 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1187 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1188 }
1189 else {
1190 LLVMTypeRef vec_type = lp_build_vec_type(type);
1191 unsigned mantissa = lp_mantissa(type);
1192 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1193 LLVMValueRef sign;
1194 LLVMValueRef offset;
1195
1196 /* sign = a < 0 ? 0 : ~0 */
1197 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1198 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1199 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1200 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1201
1202 /* offset = 0.99999(9)f */
1203 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1204 offset = LLVMConstBitCast(offset, int_vec_type);
1205
1206 /* offset = a < 0 ? 0.0 : offset */
1207 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1208 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1209
1210 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1211 }
1212
1213 /* truncate toward zero */
1214 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1215
1216 return res;
1217 }
1218
1219
1220 LLVMValueRef
1221 lp_build_sqrt(struct lp_build_context *bld,
1222 LLVMValueRef a)
1223 {
1224 const struct lp_type type = bld->type;
1225 LLVMTypeRef vec_type = lp_build_vec_type(type);
1226 char intrinsic[32];
1227
1228 assert(lp_check_value(type, a));
1229
1230 /* TODO: optimize the constant case */
1232
1233 assert(type.floating);
1234 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1235
1236 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1237 }
1238
1239
1240 LLVMValueRef
1241 lp_build_rcp(struct lp_build_context *bld,
1242 LLVMValueRef a)
1243 {
1244 const struct lp_type type = bld->type;
1245
1246 assert(lp_check_value(type, a));
1247
1248 if(a == bld->zero)
1249 return bld->undef;
1250 if(a == bld->one)
1251 return bld->one;
1252 if(a == bld->undef)
1253 return bld->undef;
1254
1255 assert(type.floating);
1256
1257 if(LLVMIsConstant(a))
1258 return LLVMConstFDiv(bld->one, a);
1259
1260 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1261 /*
1262 * XXX: Added precision is not always necessary, so only enable this
1263 * when we have a better system in place to track minimum precision.
1264 */
1265
1266 #if 0
1267 /*
1268 * Do one Newton-Raphson step to improve precision:
1269 *
1270 * x1 = (2 - a * rcp(a)) * rcp(a)
1271 */
1272
1273 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1274 LLVMValueRef rcp_a;
1275 LLVMValueRef res;
1276
1277 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1278
1279 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1280 res = LLVMBuildFSub(bld->builder, two, res, "");
1281 res = LLVMBuildFMul(bld->builder, res, rcp_a, "");
1282
1283 return res;
1284 #else
1285 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1286 #endif
1287 }
1288
1289 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1290 }
1291
1292
1293 /**
1294 * Generate 1/sqrt(a)
1295 */
1296 LLVMValueRef
1297 lp_build_rsqrt(struct lp_build_context *bld,
1298 LLVMValueRef a)
1299 {
1300 const struct lp_type type = bld->type;
1301
1302 assert(lp_check_value(type, a));
1303
1304 assert(type.floating);
1305
1306 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1307 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1308
1309 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1310 }
1311
1312
1313 static INLINE LLVMValueRef
1314 lp_build_const_v4si(unsigned long value)
1315 {
1316 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1317 LLVMValueRef elements[4] = { element, element, element, element };
1318 return LLVMConstVector(elements, 4);
1319 }
1320
1321 static INLINE LLVMValueRef
1322 lp_build_const_v4sf(float value)
1323 {
1324 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1325 LLVMValueRef elements[4] = { element, element, element, element };
1326 return LLVMConstVector(elements, 4);
1327 }
1328
1329
1330 /**
1331 * Generate sin(a) using SSE2
1332 */
1333 LLVMValueRef
1334 lp_build_sin(struct lp_build_context *bld,
1335 LLVMValueRef a)
1336 {
1337 struct lp_type int_type = lp_int_type(bld->type);
1338 LLVMBuilderRef b = bld->builder;
1339 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1340 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1341
1342 /*
1343 * take the absolute value,
1344 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1345 */
1346
1347 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1348 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1349
1350 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1351 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1352
1353 /*
1354 * extract the sign bit (upper one)
1355 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1356 */
1357 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1358 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1359
1360 /*
1361 * scale by 4/Pi
1362 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1363 */
1364
1365 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1366 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1367
1368 /*
1369 * store the integer part of y in mm0
1370 * emm2 = _mm_cvttps_epi32(y);
1371 */
1372
1373 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1374
1375 /*
1376 * j=(j+1) & (~1) (see the cephes sources)
1377 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1378 */
1379
1380 LLVMValueRef all_one = lp_build_const_v4si(1);
1381 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1382 /*
1383 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1384 */
1385 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1386 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1387
1388 /*
1389 * y = _mm_cvtepi32_ps(emm2);
1390 */
1391 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1392
1393 /* get the swap sign flag
1394 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1395 */
1396 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1397 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1398
1399 /*
1400 * emm2 = _mm_slli_epi32(emm0, 29);
1401 */
1402 LLVMValueRef const_29 = lp_build_const_v4si(29);
1403 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1404
1405 /*
1406 * get the polynomial selection mask
1407 * there is one polynomial for 0 <= x <= Pi/4
1408 * and another one for Pi/4 < x <= Pi/2
1409 * Both branches will be computed.
1410 *
1411 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1412 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1413 */
1414
1415 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1416 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1417 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1418 emm2_3, lp_build_const_v4si(0));
1419 /*
1420 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1421 */
1422 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1423
1424 /*
1425 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1426 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1427 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1428 */
1429 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1430 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1431 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1432
1433 /*
1434 * The magic pass: "Extended precision modular arithmetic"
1435 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1436 * xmm1 = _mm_mul_ps(y, xmm1);
1437 * xmm2 = _mm_mul_ps(y, xmm2);
1438 * xmm3 = _mm_mul_ps(y, xmm3);
1439 */
1440 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1441 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1442 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1443
1444 /*
1445 * x = _mm_add_ps(x, xmm1);
1446 * x = _mm_add_ps(x, xmm2);
1447 * x = _mm_add_ps(x, xmm3);
1448 */
1449
1450 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1451 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1452 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1453
1454 /*
1455 * Evaluate the first polynomial (0 <= x <= Pi/4)
1456 *
1457 * z = _mm_mul_ps(x,x);
1458 */
1459 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1460
1461 /*
1462 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1463 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1464 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1465 */
1466 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1467 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1468 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1469
1470 /*
1471 * y = *(v4sf*)_ps_coscof_p0;
1472 * y = _mm_mul_ps(y, z);
1473 */
1474 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1475 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1476 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1477 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1478 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1479 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1480
1481
1482 /*
1483 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1484 * y = _mm_sub_ps(y, tmp);
1485 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1486 */
1487 LLVMValueRef half = lp_build_const_v4sf(0.5);
1488 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1489 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1490 LLVMValueRef one = lp_build_const_v4sf(1.0);
1491 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1492
1493 /*
1494 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1495 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1496 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1497 */
1498 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1499 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1500 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1501
1502 /*
1503 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1504 *
1505 * y2 = *(v4sf*)_ps_sincof_p0;
1506 * y2 = _mm_mul_ps(y2, z);
1507 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1508 * y2 = _mm_mul_ps(y2, z);
1509 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1510 * y2 = _mm_mul_ps(y2, z);
1511 * y2 = _mm_mul_ps(y2, x);
1512 * y2 = _mm_add_ps(y2, x);
1513 */
1514
1515 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1516 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1517 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1518 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1519 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1520 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1521 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1522
1523 /*
1524 * select the correct result from the two polynomials
1525 * xmm3 = poly_mask;
1526 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1527 * y = _mm_andnot_ps(xmm3, y);
1528 * y = _mm_add_ps(y,y2);
1529 */
1530 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1531 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1532 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1533 LLVMValueRef inv = lp_build_const_v4si(~0);
1534 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1535 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1536 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1537
1538 /*
1539 * update the sign
1540 * y = _mm_xor_ps(y, sign_bit);
1541 */
1542 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1543 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1544 return y_result;
1545 }
1546
1547
1548 /**
1549 * Generate cos(a) using SSE2
1550 */
1551 LLVMValueRef
1552 lp_build_cos(struct lp_build_context *bld,
1553 LLVMValueRef a)
1554 {
1555 struct lp_type int_type = lp_int_type(bld->type);
1556 LLVMBuilderRef b = bld->builder;
1557 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1558 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1559
1560 /*
1561 * take the absolute value,
1562 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1563 */
1564
1565 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1566 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1567
1568 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1569 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1570
1571 /*
1572 * scale by 4/Pi
1573 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1574 */
1575
1576 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1577 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1578
1579 /*
1580 * store the integer part of y in mm0
1581 * emm2 = _mm_cvttps_epi32(y);
1582 */
1583
1584 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1585
1586 /*
1587 * j=(j+1) & (~1) (see the cephes sources)
1588 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1589 */
1590
1591 LLVMValueRef all_one = lp_build_const_v4si(1);
1592 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1593 /*
1594 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1595 */
1596 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1597 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1598
1599 /*
1600 * y = _mm_cvtepi32_ps(emm2);
1601 */
1602 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1603
1604
1605 /*
1606 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1607 */
1608 LLVMValueRef const_2 = lp_build_const_v4si(2);
1609 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1610
1611
1612 /* get the swap sign flag
1613 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1614 */
1615 LLVMValueRef inv = lp_build_const_v4si(~0);
1616 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1617 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1618 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1619
1620 /*
1621 * emm2 = _mm_slli_epi32(emm0, 29);
1622 */
1623 LLVMValueRef const_29 = lp_build_const_v4si(29);
1624 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1625
1626 /*
1627 * get the polynomial selection mask
1628 * there is one polynomial for 0 <= x <= Pi/4
1629 * and another one for Pi/4 < x <= Pi/2
1630 * Both branches will be computed.
1631 *
1632 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1633 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1634 */
1635
1636 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1637 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1638 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1639 emm2_3, lp_build_const_v4si(0));
1640
1641 /*
1642 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1643 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1644 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1645 */
1646 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1647 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1648 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1649
1650 /*
1651 * The magic pass: "Extended precision modular arithmetic"
1652 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1653 * xmm1 = _mm_mul_ps(y, xmm1);
1654 * xmm2 = _mm_mul_ps(y, xmm2);
1655 * xmm3 = _mm_mul_ps(y, xmm3);
1656 */
1657 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1658 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1659 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1660
1661 /*
1662 * x = _mm_add_ps(x, xmm1);
1663 * x = _mm_add_ps(x, xmm2);
1664 * x = _mm_add_ps(x, xmm3);
1665 */
1666
1667 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1668 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1669 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1670
1671 /*
1672 * Evaluate the first polynomial (0 <= x <= Pi/4)
1673 *
1674 * z = _mm_mul_ps(x,x);
1675 */
1676 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1677
1678 /*
1679 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1680 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1681 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1682 */
1683 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1684 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1685 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1686
1687 /*
1688 * y = *(v4sf*)_ps_coscof_p0;
1689 * y = _mm_mul_ps(y, z);
1690 */
1691 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1692 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1693 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1694 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1695 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1696 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1697
1698
1699 /*
1700 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1701 * y = _mm_sub_ps(y, tmp);
1702 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1703 */
1704 LLVMValueRef half = lp_build_const_v4sf(0.5);
1705 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1706 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1707 LLVMValueRef one = lp_build_const_v4sf(1.0);
1708 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1709
1710 /*
1711 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1712 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1713 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1714 */
1715 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1716 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1717 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1718
1719 /*
1720 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1721 *
1722 * y2 = *(v4sf*)_ps_sincof_p0;
1723 * y2 = _mm_mul_ps(y2, z);
1724 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1725 * y2 = _mm_mul_ps(y2, z);
1726 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1727 * y2 = _mm_mul_ps(y2, z);
1728 * y2 = _mm_mul_ps(y2, x);
1729 * y2 = _mm_add_ps(y2, x);
1730 */
1731
1732 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1733 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1734 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1735 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1736 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1737 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1738 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1739
1740 /*
1741 * select the correct result from the two polynomials
1742 * xmm3 = poly_mask;
1743 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1744 * y = _mm_andnot_ps(xmm3, y);
1745 * y = _mm_add_ps(y,y2);
1746 */
1747 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1748 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1749 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1750 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1751 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1752 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1753
1754 /*
1755 * update the sign
1756 * y = _mm_xor_ps(y, sign_bit);
1757 */
1758 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1759 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1760 return y_result;
1761 }
1762
1763
1764 /**
1765 * Generate pow(x, y)
1766 */
1767 LLVMValueRef
1768 lp_build_pow(struct lp_build_context *bld,
1769 LLVMValueRef x,
1770 LLVMValueRef y)
1771 {
1772 /* TODO: optimize the constant case */
1773 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1774 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1775 __FUNCTION__);
1776
1777 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1778 }
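
/*
 * This relies on the identity x^y = 2^(y * log2(x)), which only holds
 * for x > 0; as in most shader pow() implementations, the result for
 * x <= 0 is undefined here.
 */
#if 0
   /* Minimal usage sketch with hypothetical values: a specular term
    * pow(n_dot_h, shininess), both vectors of positive floats in an
    * initialized 'bld': */
   LLVMValueRef spec = lp_build_pow(bld, n_dot_h, shininess);
#endif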
1779
1780
1781 /**
1782 * Generate exp(x)
1783 */
1784 LLVMValueRef
1785 lp_build_exp(struct lp_build_context *bld,
1786 LLVMValueRef x)
1787 {
1788 /* log2(e) = 1/log(2) */
1789 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1790
1791 assert(lp_check_value(bld->type, x));
1792
1793 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1794 }
1795
1796
1797 /**
1798 * Generate log(x)
1799 */
1800 LLVMValueRef
1801 lp_build_log(struct lp_build_context *bld,
1802 LLVMValueRef x)
1803 {
1804 /* log(2) */
1805 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1806
1807 assert(lp_check_value(bld->type, x));
1808
1809 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1810 }
1811
1812
1813 #define EXP_POLY_DEGREE 3
1814 #define LOG_POLY_DEGREE 5
1815
1816
1817 /**
1818 * Generate polynomial.
1819 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1820 */
1821 static LLVMValueRef
1822 lp_build_polynomial(struct lp_build_context *bld,
1823 LLVMValueRef x,
1824 const double *coeffs,
1825 unsigned num_coeffs)
1826 {
1827 const struct lp_type type = bld->type;
1828 LLVMValueRef res = NULL;
1829 unsigned i;
1830
1831 assert(lp_check_value(bld->type, x));
1832
1833 /* TODO: optimize the constant case */
1834 if(LLVMIsConstant(x))
1835 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1836 __FUNCTION__);
1837
1838 for (i = num_coeffs; i--; ) {
1839 LLVMValueRef coeff;
1840
1841 coeff = lp_build_const_vec(type, coeffs[i]);
1842
1843 if(res)
1844 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1845 else
1846 res = coeff;
1847 }
1848
1849 if(res)
1850 return res;
1851 else
1852 return bld->undef;
1853 }
1854
1855
1856 /**
1857 * Minimax polynomial fit of 2**x, in range [0, 1[
1858 */
1859 const double lp_build_exp2_polynomial[] = {
1860 #if EXP_POLY_DEGREE == 5
1861 0.999999999690134838155,
1862 0.583974334321735217258,
1863 0.164553105719676828492,
1864 0.0292811063701710962255,
1865 0.00354944426657875141846,
1866 0.000296253726543423377365
1867 #elif EXP_POLY_DEGREE == 4
1868 1.00000001502262084505,
1869 0.563586057338685991394,
1870 0.150436017652442413623,
1871 0.0243220604213317927308,
1872 0.0025359088446580436489
1873 #elif EXP_POLY_DEGREE == 3
1874 0.999925218562710312959,
1875 0.695833540494823811697,
1876 0.226067155427249155588,
1877 0.0780245226406372992967
1878 #elif EXP_POLY_DEGREE == 2
1879 1.00172476321474503578,
1880 0.657636275736077639316,
1881 0.33718943461968720704
1882 #else
1883 #error
1884 #endif
1885 };
1886
1887
1888 void
1889 lp_build_exp2_approx(struct lp_build_context *bld,
1890 LLVMValueRef x,
1891 LLVMValueRef *p_exp2_int_part,
1892 LLVMValueRef *p_frac_part,
1893 LLVMValueRef *p_exp2)
1894 {
1895 const struct lp_type type = bld->type;
1896 LLVMTypeRef vec_type = lp_build_vec_type(type);
1897 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1898 LLVMValueRef ipart = NULL;
1899 LLVMValueRef fpart = NULL;
1900 LLVMValueRef expipart = NULL;
1901 LLVMValueRef expfpart = NULL;
1902 LLVMValueRef res = NULL;
1903
1904 assert(lp_check_value(bld->type, x));
1905
1906 if(p_exp2_int_part || p_frac_part || p_exp2) {
1907 /* TODO: optimize the constant case */
1908 if(LLVMIsConstant(x))
1909 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1910 __FUNCTION__);
1911
1912 assert(type.floating && type.width == 32);
1913
1914 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1915 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1916
1917 /* ipart = floor(x) */
1918 ipart = lp_build_floor(bld, x);
1919
1920 /* fpart = x - ipart */
1921 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
1922 }
1923
1924 if(p_exp2_int_part || p_exp2) {
1925 /* expipart = (float) (1 << ipart) */
1926 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1927 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1928 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1929 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1930 }
1931
1932 if(p_exp2) {
1933 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1934 Elements(lp_build_exp2_polynomial));
1935
1936 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
1937 }
1938
1939 if(p_exp2_int_part)
1940 *p_exp2_int_part = expipart;
1941
1942 if(p_frac_part)
1943 *p_frac_part = fpart;
1944
1945 if(p_exp2)
1946 *p_exp2 = res;
1947 }
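
/*
 * The approximation above splits x as
 *
 *    2^x = 2^ipart * 2^fpart,   ipart = floor(x),   fpart in [0, 1[
 *
 * builds 2^ipart exactly by stuffing (ipart + 127) into the exponent
 * bits of an IEEE-754 single, and approximates 2^fpart with the
 * polynomial; hence the clamp of x to [-126.99999, 129].
 */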
1948
1949
1950 LLVMValueRef
1951 lp_build_exp2(struct lp_build_context *bld,
1952 LLVMValueRef x)
1953 {
1954 LLVMValueRef res;
1955 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1956 return res;
1957 }
1958
1959
1960 /**
1961 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1962 * These coefficients can be generated with
1963 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1964 */
1965 const double lp_build_log2_polynomial[] = {
1966 #if LOG_POLY_DEGREE == 6
1967 3.11578814719469302614,
1968 -3.32419399085241980044,
1969 2.59883907202499966007,
1970 -1.23152682416275988241,
1971 0.318212422185251071475,
1972 -0.0344359067839062357313
1973 #elif LOG_POLY_DEGREE == 5
1974 2.8882704548164776201,
1975 -2.52074962577807006663,
1976 1.48116647521213171641,
1977 -0.465725644288844778798,
1978 0.0596515482674574969533
1979 #elif LOG_POLY_DEGREE == 4
1980 2.61761038894603480148,
1981 -1.75647175389045657003,
1982 0.688243882994381274313,
1983 -0.107254423828329604454
1984 #elif LOG_POLY_DEGREE == 3
1985 2.28330284476918490682,
1986 -1.04913055217340124191,
1987 0.204446009836232697516
1988 #else
1989 #error
1990 #endif
1991 };
1992
1993
1994 /**
1995 * See http://www.devmaster.net/forums/showthread.php?p=43580
1996 */
1997 void
1998 lp_build_log2_approx(struct lp_build_context *bld,
1999 LLVMValueRef x,
2000 LLVMValueRef *p_exp,
2001 LLVMValueRef *p_floor_log2,
2002 LLVMValueRef *p_log2)
2003 {
2004 const struct lp_type type = bld->type;
2005 LLVMTypeRef vec_type = lp_build_vec_type(type);
2006 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2007
2008 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2009 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2010 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2011
2012 LLVMValueRef i = NULL;
2013 LLVMValueRef exp = NULL;
2014 LLVMValueRef mant = NULL;
2015 LLVMValueRef logexp = NULL;
2016 LLVMValueRef logmant = NULL;
2017 LLVMValueRef res = NULL;
2018
2019 assert(lp_check_value(bld->type, x));
2020
2021 if(p_exp || p_floor_log2 || p_log2) {
2022 /* TODO: optimize the constant case */
2023 if(LLVMIsConstant(x))
2024 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2025 __FUNCTION__);
2026
2027 assert(type.floating && type.width == 32);
2028
2029 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2030
2031 /* exp = (float) exponent(x) */
2032 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2033 }
2034
2035 if(p_floor_log2 || p_log2) {
2036 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2037 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2038 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2039 }
2040
2041 if(p_log2) {
2042 /* mant = (float) mantissa(x) */
2043 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2044 mant = LLVMBuildOr(bld->builder, mant, one, "");
2045 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2046
2047 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2048 Elements(lp_build_log2_polynomial));
2049
2050 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2051 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2052
2053 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2054 }
2055
2056 if(p_exp) {
2057 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2058 *p_exp = exp;
2059 }
2060
2061 if(p_floor_log2)
2062 *p_floor_log2 = logexp;
2063
2064 if(p_log2)
2065 *p_log2 = res;
2066 }
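
/*
 * Here log2(x) is decomposed as exponent(x) + log2(mant(x)), with
 * mant(x) in [1, 2[.  E.g. x = 8.0 has exponent 3 and mantissa 1.0:
 *
 *    log2(8.0) = 3 + log2(1.0) = 3 + 0 = 3
 *
 * The polynomial is evaluated as p(m) * (m - 1), so the mantissa term
 * vanishes exactly at m = 1, making log2(1) == 0 exact.
 */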
2067
2068
2069 LLVMValueRef
2070 lp_build_log2(struct lp_build_context *bld,
2071 LLVMValueRef x)
2072 {
2073 LLVMValueRef res;
2074 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2075 return res;
2076 }