gallivm: Refactor the Newton-Raphson steps, and disable once again.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 /*
63 * XXX: Increasing eliminates some artifacts, but adds others, most
64 * noticeably corruption in the Earth halo in Google Earth.
65 */
66 #define RCP_NEWTON_STEPS 0
67
68 #define RSQRT_NEWTON_STEPS 0
69
70 #define EXP_POLY_DEGREE 3
71
72 #define LOG_POLY_DEGREE 5
73
74
75 /**
76 * Generate min(a, b)
77 * No checks for the special-case values of a or b (0 or 1) are done.
78 */
79 static LLVMValueRef
80 lp_build_min_simple(struct lp_build_context *bld,
81 LLVMValueRef a,
82 LLVMValueRef b)
83 {
84 const struct lp_type type = bld->type;
85 const char *intrinsic = NULL;
86 LLVMValueRef cond;
87
88 assert(lp_check_value(type, a));
89 assert(lp_check_value(type, b));
90
91 /* TODO: optimize the constant case */
92
93 if(type.width * type.length == 128) {
94 if(type.floating) {
95 if(type.width == 32 && util_cpu_caps.has_sse)
96 intrinsic = "llvm.x86.sse.min.ps";
97 if(type.width == 64 && util_cpu_caps.has_sse2)
98 intrinsic = "llvm.x86.sse2.min.pd";
99 }
100 else {
101 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
102 intrinsic = "llvm.x86.sse2.pminu.b";
103 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
104 intrinsic = "llvm.x86.sse41.pminsb";
105 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
106 intrinsic = "llvm.x86.sse41.pminuw";
107 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
108 intrinsic = "llvm.x86.sse2.pmins.w";
109 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
110 intrinsic = "llvm.x86.sse41.pminud";
111 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
112 intrinsic = "llvm.x86.sse41.pminsd";
113 }
114 }
115
116 if(intrinsic)
117 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
118
119 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
120 return lp_build_select(bld, cond, a, b);
121 }
122
123
124 /**
125 * Generate max(a, b)
126 * No checks for the special-case values of a or b (0 or 1) are done.
127 */
128 static LLVMValueRef
129 lp_build_max_simple(struct lp_build_context *bld,
130 LLVMValueRef a,
131 LLVMValueRef b)
132 {
133 const struct lp_type type = bld->type;
134 const char *intrinsic = NULL;
135 LLVMValueRef cond;
136
137 assert(lp_check_value(type, a));
138 assert(lp_check_value(type, b));
139
140 /* TODO: optimize the constant case */
141
142 if(type.width * type.length == 128) {
143 if(type.floating) {
144 if(type.width == 32 && util_cpu_caps.has_sse)
145 intrinsic = "llvm.x86.sse.max.ps";
146 if(type.width == 64 && util_cpu_caps.has_sse2)
147 intrinsic = "llvm.x86.sse2.max.pd";
148 }
149 else {
150 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
151 intrinsic = "llvm.x86.sse2.pmaxu.b";
152 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
153 intrinsic = "llvm.x86.sse41.pmaxsb";
154 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
155 intrinsic = "llvm.x86.sse41.pmaxuw";
156 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
157 intrinsic = "llvm.x86.sse2.pmaxs.w";
158 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
159 intrinsic = "llvm.x86.sse41.pmaxud";
160 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
161 intrinsic = "llvm.x86.sse41.pmaxsd";
162 }
163 }
164
165 if(intrinsic)
166 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
167
168 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
169 return lp_build_select(bld, cond, a, b);
170 }
171
172
173 /**
174 * Generate 1 - a, or ~a depending on bld->type.
175 */
176 LLVMValueRef
177 lp_build_comp(struct lp_build_context *bld,
178 LLVMValueRef a)
179 {
180 const struct lp_type type = bld->type;
181
182 assert(lp_check_value(type, a));
183
184 if(a == bld->one)
185 return bld->zero;
186 if(a == bld->zero)
187 return bld->one;
188
189 if(type.norm && !type.floating && !type.fixed && !type.sign) {
190 if(LLVMIsConstant(a))
191 return LLVMConstNot(a);
192 else
193 return LLVMBuildNot(bld->builder, a, "");
194 }
195
196 if(LLVMIsConstant(a))
197 if (type.floating)
198 return LLVMConstFSub(bld->one, a);
199 else
200 return LLVMConstSub(bld->one, a);
201 else
202 if (type.floating)
203 return LLVMBuildFSub(bld->builder, bld->one, a, "");
204 else
205 return LLVMBuildSub(bld->builder, bld->one, a, "");
206 }
207
208
209 /**
210 * Generate a + b
211 */
212 LLVMValueRef
213 lp_build_add(struct lp_build_context *bld,
214 LLVMValueRef a,
215 LLVMValueRef b)
216 {
217 const struct lp_type type = bld->type;
218 LLVMValueRef res;
219
220 assert(lp_check_value(type, a));
221 assert(lp_check_value(type, b));
222
223 if(a == bld->zero)
224 return b;
225 if(b == bld->zero)
226 return a;
227 if(a == bld->undef || b == bld->undef)
228 return bld->undef;
229
230 if(bld->type.norm) {
231 const char *intrinsic = NULL;
232
233 if(a == bld->one || b == bld->one)
234 return bld->one;
235
236 if(util_cpu_caps.has_sse2 &&
237 type.width * type.length == 128 &&
238 !type.floating && !type.fixed) {
239 if(type.width == 8)
240 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
241 if(type.width == 16)
242 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
243 }
244
245 if(intrinsic)
246 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
247 }
248
249 if(LLVMIsConstant(a) && LLVMIsConstant(b))
250 if (type.floating)
251 res = LLVMConstFAdd(a, b);
252 else
253 res = LLVMConstAdd(a, b);
254 else
255 if (type.floating)
256 res = LLVMBuildFAdd(bld->builder, a, b, "");
257 else
258 res = LLVMBuildAdd(bld->builder, a, b, "");
259
260 /* clamp to ceiling of 1.0 */
261 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
262 res = lp_build_min_simple(bld, res, bld->one);
263
264 /* XXX clamp to floor of -1 or 0??? */
265
266 return res;
267 }
268
269
270 /** Return the sum of the elements of a */
271 LLVMValueRef
272 lp_build_sum_vector(struct lp_build_context *bld,
273 LLVMValueRef a)
274 {
275 const struct lp_type type = bld->type;
276 LLVMValueRef index, res;
277 unsigned i;
278
279 assert(lp_check_value(type, a));
280
281 if (a == bld->zero)
282 return bld->zero;
283 if (a == bld->undef)
284 return bld->undef;
285 assert(type.length > 1);
286
287 assert(!bld->type.norm);
288
289 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
290 res = LLVMBuildExtractElement(bld->builder, a, index, "");
291
292 for (i = 1; i < type.length; i++) {
293 index = LLVMConstInt(LLVMInt32Type(), i, 0);
294 if (type.floating)
295 res = LLVMBuildFAdd(bld->builder, res,
296 LLVMBuildExtractElement(bld->builder,
297 a, index, ""),
298 "");
299 else
300 res = LLVMBuildAdd(bld->builder, res,
301 LLVMBuildExtractElement(bld->builder,
302 a, index, ""),
303 "");
304 }
305
306 return res;
307 }
308
309
310 /**
311 * Generate a - b
312 */
313 LLVMValueRef
314 lp_build_sub(struct lp_build_context *bld,
315 LLVMValueRef a,
316 LLVMValueRef b)
317 {
318 const struct lp_type type = bld->type;
319 LLVMValueRef res;
320
321 assert(lp_check_value(type, a));
322 assert(lp_check_value(type, b));
323
324 if(b == bld->zero)
325 return a;
326 if(a == bld->undef || b == bld->undef)
327 return bld->undef;
328 if(a == b)
329 return bld->zero;
330
331 if(bld->type.norm) {
332 const char *intrinsic = NULL;
333
334 if(b == bld->one)
335 return bld->zero;
336
337 if(util_cpu_caps.has_sse2 &&
338 type.width * type.length == 128 &&
339 !type.floating && !type.fixed) {
340 if(type.width == 8)
341 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
342 if(type.width == 16)
343 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
344 }
345
346 if(intrinsic)
347 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
348 }
349
350 if(LLVMIsConstant(a) && LLVMIsConstant(b))
351 if (type.floating)
352 res = LLVMConstFSub(a, b);
353 else
354 res = LLVMConstSub(a, b);
355 else
356 if (type.floating)
357 res = LLVMBuildFSub(bld->builder, a, b, "");
358 else
359 res = LLVMBuildSub(bld->builder, a, b, "");
360
361 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
362 res = lp_build_max_simple(bld, res, bld->zero);
363
364 return res;
365 }
366
367
368 /**
369 * Normalized 8bit multiplication.
370 *
371 * - alpha plus one
372 *
373 * makes the following approximation to the division (Sree)
374 *
375 * a*b/255 ~= (a*(b + 1)) >> 8
376 *
377 * which is the fastest method that satisfies the following OpenGL criteria
378 *
379 * 0*0 = 0 and 255*255 = 255
380 *
381 * - geometric series
382 *
383 * takes the geometric series approximation to the division
384 *
385 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) + ...
386 *
387 * in this case just the first two terms to fit in 16bit arithmetic
388 *
389 * t/255 ~= (t + (t >> 8)) >> 8
390 *
391 * note that by itself it doesn't satisfy the OpenGL criteria, as it gives
392 * 255*255 = 254, so the special case b = 255 must be accounted for, or
393 * rounding must be used
394 *
395 * - geometric series plus rounding
396 *
397 * when using a geometric series division instead of truncating the result
398 * use roundoff in the approximation (Jim Blinn)
399 *
400 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
401 *
402 * achieving exact results
403 *
404 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
405 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
406 * @sa Michael Herf, The "double blend trick", May 2000,
407 * http://www.stereopsis.com/doubleblend.html
408 */
409 static LLVMValueRef
410 lp_build_mul_u8n(LLVMBuilderRef builder,
411 struct lp_type i16_type,
412 LLVMValueRef a, LLVMValueRef b)
413 {
414 LLVMValueRef c8;
415 LLVMValueRef ab;
416
417 assert(!i16_type.floating);
418 assert(lp_check_value(i16_type, a));
419 assert(lp_check_value(i16_type, b));
420
421 c8 = lp_build_const_int_vec(i16_type, 8);
422
423 #if 0
424
425 /* a*b/255 ~= (a*(b + 1)) >> 8 */
426 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
427 ab = LLVMBuildMul(builder, a, b, "");
428
429 #else
430
431 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
432 ab = LLVMBuildMul(builder, a, b, "");
433 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
434 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
435
436 #endif
437
438 ab = LLVMBuildLShr(builder, ab, c8, "");
439
440 return ab;
441 }
442
443
444 /**
445 * Generate a * b
446 */
447 LLVMValueRef
448 lp_build_mul(struct lp_build_context *bld,
449 LLVMValueRef a,
450 LLVMValueRef b)
451 {
452 const struct lp_type type = bld->type;
453 LLVMValueRef shift;
454 LLVMValueRef res;
455
456 assert(lp_check_value(type, a));
457 assert(lp_check_value(type, b));
458
459 if(a == bld->zero)
460 return bld->zero;
461 if(a == bld->one)
462 return b;
463 if(b == bld->zero)
464 return bld->zero;
465 if(b == bld->one)
466 return a;
467 if(a == bld->undef || b == bld->undef)
468 return bld->undef;
469
470 if(!type.floating && !type.fixed && type.norm) {
471 if(type.width == 8) {
472 struct lp_type i16_type = lp_wider_type(type);
473 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
474
475 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
476 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
477
478 /* PMULLW, PSRLW, PADDW */
479 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
480 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
481
482 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
483
484 return ab;
485 }
486
487 /* FIXME */
488 assert(0);
489 }
490
491 if(type.fixed)
492 shift = lp_build_const_int_vec(type, type.width/2);
493 else
494 shift = NULL;
495
496 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
497 if (type.floating)
498 res = LLVMConstFMul(a, b);
499 else
500 res = LLVMConstMul(a, b);
501 if(shift) {
502 if(type.sign)
503 res = LLVMConstAShr(res, shift);
504 else
505 res = LLVMConstLShr(res, shift);
506 }
507 }
508 else {
509 if (type.floating)
510 res = LLVMBuildFMul(bld->builder, a, b, "");
511 else
512 res = LLVMBuildMul(bld->builder, a, b, "");
513 if(shift) {
514 if(type.sign)
515 res = LLVMBuildAShr(bld->builder, res, shift, "");
516 else
517 res = LLVMBuildLShr(bld->builder, res, shift, "");
518 }
519 }
520
521 return res;
522 }
523
524
525 /**
526 * Small vector x scale multiplication optimization.
527 */
528 LLVMValueRef
529 lp_build_mul_imm(struct lp_build_context *bld,
530 LLVMValueRef a,
531 int b)
532 {
533 LLVMValueRef factor;
534
535 assert(lp_check_value(bld->type, a));
536
537 if(b == 0)
538 return bld->zero;
539
540 if(b == 1)
541 return a;
542
543 if(b == -1)
544 return lp_build_negate(bld, a);
545
546 if(b == 2 && bld->type.floating)
547 return lp_build_add(bld, a, a);
548
549 if(util_is_pot(b)) {
550 unsigned shift = ffs(b) - 1;
551
552 if(bld->type.floating) {
553 #if 0
554 /*
555 * Power of two multiplication by directly manipulating the mantissa.
556 *
557 * XXX: This might not be always faster, it will introduce a small error
558 * for multiplication by zero, and it will produce wrong results
559 * for Inf and NaN.
560 */
561 unsigned mantissa = lp_mantissa(bld->type);
562 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
563 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
564 a = LLVMBuildAdd(bld->builder, a, factor, "");
565 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
566 return a;
567 #endif
568 }
569 else {
570 factor = lp_build_const_vec(bld->type, shift);
571 return LLVMBuildShl(bld->builder, a, factor, "");
572 }
573 }
574
575 factor = lp_build_const_vec(bld->type, (double)b);
576 return lp_build_mul(bld, a, factor);
577 }
578
579
580 /**
581 * Generate a / b
582 */
583 LLVMValueRef
584 lp_build_div(struct lp_build_context *bld,
585 LLVMValueRef a,
586 LLVMValueRef b)
587 {
588 const struct lp_type type = bld->type;
589
590 assert(lp_check_value(type, a));
591 assert(lp_check_value(type, b));
592
593 if(a == bld->zero)
594 return bld->zero;
595 if(a == bld->one)
596 return lp_build_rcp(bld, b);
597 if(b == bld->zero)
598 return bld->undef;
599 if(b == bld->one)
600 return a;
601 if(a == bld->undef || b == bld->undef)
602 return bld->undef;
603
604 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
605 if (type.floating)
606 return LLVMConstFDiv(a, b);
607 else if (type.sign)
608 return LLVMConstSDiv(a, b);
609 else
610 return LLVMConstUDiv(a, b);
611 }
612
613 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
614 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
615
616 if (type.floating)
617 return LLVMBuildFDiv(bld->builder, a, b, "");
618 else if (type.sign)
619 return LLVMBuildSDiv(bld->builder, a, b, "");
620 else
621 return LLVMBuildUDiv(bld->builder, a, b, "");
622 }
623
624
625 /**
626 * Linear interpolation.
627 *
628 * This also works for integer values with a few caveats.
629 *
630 * @sa http://www.stereopsis.com/doubleblend.html
631 */
632 LLVMValueRef
633 lp_build_lerp(struct lp_build_context *bld,
634 LLVMValueRef x,
635 LLVMValueRef v0,
636 LLVMValueRef v1)
637 {
638 LLVMValueRef delta;
639 LLVMValueRef res;
640
641 assert(lp_check_value(bld->type, x));
642 assert(lp_check_value(bld->type, v0));
643 assert(lp_check_value(bld->type, v1));
644
645 delta = lp_build_sub(bld, v1, v0);
646
647 res = lp_build_mul(bld, x, delta);
648
649 res = lp_build_add(bld, v0, res);
650
651 if(bld->type.fixed)
652 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
653 * but it will be wrong for other uses. Basically we need a more
654 * powerful lp_type, capable of further distinguishing the values
655 * interpretation from the value storage. */
656 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
657
658 return res;
659 }
660
661
662 LLVMValueRef
663 lp_build_lerp_2d(struct lp_build_context *bld,
664 LLVMValueRef x,
665 LLVMValueRef y,
666 LLVMValueRef v00,
667 LLVMValueRef v01,
668 LLVMValueRef v10,
669 LLVMValueRef v11)
670 {
671 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
672 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
673 return lp_build_lerp(bld, y, v0, v1);
674 }
675
676
677 /**
678 * Generate min(a, b)
679 * Do checks for special cases.
680 */
681 LLVMValueRef
682 lp_build_min(struct lp_build_context *bld,
683 LLVMValueRef a,
684 LLVMValueRef b)
685 {
686 assert(lp_check_value(bld->type, a));
687 assert(lp_check_value(bld->type, b));
688
689 if(a == bld->undef || b == bld->undef)
690 return bld->undef;
691
692 if(a == b)
693 return a;
694
695 if(bld->type.norm) {
696 if(a == bld->zero || b == bld->zero)
697 return bld->zero;
698 if(a == bld->one)
699 return b;
700 if(b == bld->one)
701 return a;
702 }
703
704 return lp_build_min_simple(bld, a, b);
705 }
706
707
708 /**
709 * Generate max(a, b)
710 * Do checks for special cases.
711 */
712 LLVMValueRef
713 lp_build_max(struct lp_build_context *bld,
714 LLVMValueRef a,
715 LLVMValueRef b)
716 {
717 assert(lp_check_value(bld->type, a));
718 assert(lp_check_value(bld->type, b));
719
720 if(a == bld->undef || b == bld->undef)
721 return bld->undef;
722
723 if(a == b)
724 return a;
725
726 if(bld->type.norm) {
727 if(a == bld->one || b == bld->one)
728 return bld->one;
729 if(a == bld->zero)
730 return b;
731 if(b == bld->zero)
732 return a;
733 }
734
735 return lp_build_max_simple(bld, a, b);
736 }
737
738
739 /**
740 * Generate clamp(a, min, max)
741 * Do checks for special cases.
742 */
743 LLVMValueRef
744 lp_build_clamp(struct lp_build_context *bld,
745 LLVMValueRef a,
746 LLVMValueRef min,
747 LLVMValueRef max)
748 {
749 assert(lp_check_value(bld->type, a));
750 assert(lp_check_value(bld->type, min));
751 assert(lp_check_value(bld->type, max));
752
753 a = lp_build_min(bld, a, max);
754 a = lp_build_max(bld, a, min);
755 return a;
756 }
757
758
759 /**
760 * Generate abs(a)
761 */
762 LLVMValueRef
763 lp_build_abs(struct lp_build_context *bld,
764 LLVMValueRef a)
765 {
766 const struct lp_type type = bld->type;
767 LLVMTypeRef vec_type = lp_build_vec_type(type);
768
769 assert(lp_check_value(type, a));
770
771 if(!type.sign)
772 return a;
773
774 if(type.floating) {
775 /* Mask out the sign bit */
776 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
777 unsigned long long absMask = ~(1ULL << (type.width - 1));
778 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
779 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
780 a = LLVMBuildAnd(bld->builder, a, mask, "");
781 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
782 return a;
783 }
784
785 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
786 switch(type.width) {
787 case 8:
788 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
789 case 16:
790 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
791 case 32:
792 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
793 }
794 }
795
796 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
797 }
798
799
800 LLVMValueRef
801 lp_build_negate(struct lp_build_context *bld,
802 LLVMValueRef a)
803 {
804 assert(lp_check_value(bld->type, a));
805
806 #if HAVE_LLVM >= 0x0207
807 if (bld->type.floating)
808 a = LLVMBuildFNeg(bld->builder, a, "");
809 else
810 #endif
811 a = LLVMBuildNeg(bld->builder, a, "");
812
813 return a;
814 }
815
816
817 /** Return -1, 0 or +1 depending on the sign of a */
818 LLVMValueRef
819 lp_build_sgn(struct lp_build_context *bld,
820 LLVMValueRef a)
821 {
822 const struct lp_type type = bld->type;
823 LLVMValueRef cond;
824 LLVMValueRef res;
825
826 assert(lp_check_value(type, a));
827
828 /* Handle non-zero case */
829 if(!type.sign) {
830 /* if not zero then sign must be positive */
831 res = bld->one;
832 }
833 else if(type.floating) {
834 LLVMTypeRef vec_type;
835 LLVMTypeRef int_type;
836 LLVMValueRef mask;
837 LLVMValueRef sign;
838 LLVMValueRef one;
839 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
840
841 int_type = lp_build_int_vec_type(type);
842 vec_type = lp_build_vec_type(type);
843 mask = lp_build_const_int_vec(type, maskBit);
844
845 /* Take the sign bit and add it to 1 constant */
846 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
847 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
848 one = LLVMConstBitCast(bld->one, int_type);
849 res = LLVMBuildOr(bld->builder, sign, one, "");
850 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
851 }
852 else
853 {
854 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
855 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
856 res = lp_build_select(bld, cond, bld->one, minus_one);
857 }
858
859 /* Handle zero */
860 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
861 res = lp_build_select(bld, cond, bld->zero, res);
862
863 return res;
864 }
865
866
867 /**
868 * Set the sign of float vector 'a' according to 'sign'.
869 * If sign==0, return abs(a).
870 * If sign==1, return -abs(a);
871 * Other values for sign produce undefined results.
872 */
873 LLVMValueRef
874 lp_build_set_sign(struct lp_build_context *bld,
875 LLVMValueRef a, LLVMValueRef sign)
876 {
877 const struct lp_type type = bld->type;
878 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
879 LLVMTypeRef vec_type = lp_build_vec_type(type);
880 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
881 LLVMValueRef mask = lp_build_const_int_vec(type,
882 ~((unsigned long long) 1 << (type.width - 1)));
883 LLVMValueRef val, res;
884
885 assert(type.floating);
886 assert(lp_check_value(type, a));
887
888 /* val = reinterpret_cast<int>(a) */
889 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
890 /* val = val & mask */
891 val = LLVMBuildAnd(bld->builder, val, mask, "");
892 /* sign = sign << shift */
893 sign = LLVMBuildShl(bld->builder, sign, shift, "");
894 /* res = val | sign */
895 res = LLVMBuildOr(bld->builder, val, sign, "");
896 /* res = reinterpret_cast<float>(res) */
897 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
898
899 return res;
900 }
901
902
903 /**
904 * Convert vector of (or scalar) int to vector of (or scalar) float.
905 */
906 LLVMValueRef
907 lp_build_int_to_float(struct lp_build_context *bld,
908 LLVMValueRef a)
909 {
910 const struct lp_type type = bld->type;
911 LLVMTypeRef vec_type = lp_build_vec_type(type);
912
913 assert(type.floating);
914
915 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
916 }
917
918
919
920 enum lp_build_round_sse41_mode
921 {
922 LP_BUILD_ROUND_SSE41_NEAREST = 0,
923 LP_BUILD_ROUND_SSE41_FLOOR = 1,
924 LP_BUILD_ROUND_SSE41_CEIL = 2,
925 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
926 };
927
928
929 static INLINE LLVMValueRef
930 lp_build_round_sse41(struct lp_build_context *bld,
931 LLVMValueRef a,
932 enum lp_build_round_sse41_mode mode)
933 {
934 const struct lp_type type = bld->type;
935 LLVMTypeRef vec_type = lp_build_vec_type(type);
936 const char *intrinsic;
937
938 assert(type.floating);
939 assert(type.width*type.length == 128);
940 assert(lp_check_value(type, a));
941 assert(util_cpu_caps.has_sse4_1);
942
943 switch(type.width) {
944 case 32:
945 intrinsic = "llvm.x86.sse41.round.ps";
946 break;
947 case 64:
948 intrinsic = "llvm.x86.sse41.round.pd";
949 break;
950 default:
951 assert(0);
952 return bld->undef;
953 }
954
955 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
956 LLVMConstInt(LLVMInt32Type(), mode, 0));
957 }
958
959
960 /**
961 * Return the integer part of a float (vector) value. The returned value is
962 * a float (vector).
963 * Ex: trunc(-1.5) = -1.0
964 */
965 LLVMValueRef
966 lp_build_trunc(struct lp_build_context *bld,
967 LLVMValueRef a)
968 {
969 const struct lp_type type = bld->type;
970
971 assert(type.floating);
972 assert(lp_check_value(type, a));
973
974 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
975 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
976 else {
977 LLVMTypeRef vec_type = lp_build_vec_type(type);
978 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
979 LLVMValueRef res;
980 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
981 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
982 return res;
983 }
984 }
985
986
987 /**
988 * Return float (vector) rounded to nearest integer (vector). The returned
989 * value is a float (vector).
990 * Ex: round(0.9) = 1.0
991 * Ex: round(-1.5) = -2.0
992 */
993 LLVMValueRef
994 lp_build_round(struct lp_build_context *bld,
995 LLVMValueRef a)
996 {
997 const struct lp_type type = bld->type;
998
999 assert(type.floating);
1000 assert(lp_check_value(type, a));
1001
1002 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1003 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1004 else {
1005 LLVMTypeRef vec_type = lp_build_vec_type(type);
1006 LLVMValueRef res;
1007 res = lp_build_iround(bld, a);
1008 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1009 return res;
1010 }
1011 }
1012
1013
1014 /**
1015 * Return floor of float (vector), result is a float (vector)
1016 * Ex: floor(1.1) = 1.0
1017 * Ex: floor(-1.1) = -2.0
1018 */
1019 LLVMValueRef
1020 lp_build_floor(struct lp_build_context *bld,
1021 LLVMValueRef a)
1022 {
1023 const struct lp_type type = bld->type;
1024
1025 assert(type.floating);
1026 assert(lp_check_value(type, a));
1027
1028 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1029 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1030 else {
1031 LLVMTypeRef vec_type = lp_build_vec_type(type);
1032 LLVMValueRef res;
1033 res = lp_build_ifloor(bld, a);
1034 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1035 return res;
1036 }
1037 }
1038
1039
1040 /**
1041 * Return ceiling of float (vector), returning float (vector).
1042 * Ex: ceil( 1.1) = 2.0
1043 * Ex: ceil(-1.1) = -1.0
1044 */
1045 LLVMValueRef
1046 lp_build_ceil(struct lp_build_context *bld,
1047 LLVMValueRef a)
1048 {
1049 const struct lp_type type = bld->type;
1050
1051 assert(type.floating);
1052 assert(lp_check_value(type, a));
1053
1054 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1055 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1056 else {
1057 LLVMTypeRef vec_type = lp_build_vec_type(type);
1058 LLVMValueRef res;
1059 res = lp_build_iceil(bld, a);
1060 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1061 return res;
1062 }
1063 }
1064
1065
1066 /**
1067 * Return fractional part of 'a' computed as a - floor(a)
1068 * Typically used in texture coord arithmetic.
1069 */
1070 LLVMValueRef
1071 lp_build_fract(struct lp_build_context *bld,
1072 LLVMValueRef a)
1073 {
1074 assert(bld->type.floating);
1075 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1076 }
1077
1078
1079 /**
1080 * Return the integer part of a float (vector) value. The returned value is
1081 * an integer (vector).
1082 * Ex: itrunc(-1.5) = -1
1083 */
1084 LLVMValueRef
1085 lp_build_itrunc(struct lp_build_context *bld,
1086 LLVMValueRef a)
1087 {
1088 const struct lp_type type = bld->type;
1089 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1090
1091 assert(type.floating);
1092 assert(lp_check_value(type, a));
1093
1094 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1095 }
1096
1097
1098 /**
1099 * Return float (vector) rounded to nearest integer (vector). The returned
1100 * value is an integer (vector).
1101 * Ex: iround(0.9) = 1
1102 * Ex: iround(-1.5) = -2
1103 */
1104 LLVMValueRef
1105 lp_build_iround(struct lp_build_context *bld,
1106 LLVMValueRef a)
1107 {
1108 const struct lp_type type = bld->type;
1109 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1110 LLVMValueRef res;
1111
1112 assert(type.floating);
1113
1114 assert(lp_check_value(type, a));
1115
1116 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1117 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1118 }
1119 else {
1120 LLVMTypeRef vec_type = lp_build_vec_type(type);
1121 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1122 LLVMValueRef sign;
1123 LLVMValueRef half;
1124
1125 /* get sign bit */
1126 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1127 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1128
1129 /* sign * 0.5 */
1130 half = lp_build_const_vec(type, 0.5);
1131 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1132 half = LLVMBuildOr(bld->builder, sign, half, "");
1133 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1134
1135 res = LLVMBuildFAdd(bld->builder, a, half, "");
1136 }
1137
1138 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1139
1140 return res;
1141 }
1142
1143
1144 /**
1145 * Return floor of float (vector), result is an int (vector)
1146 * Ex: ifloor(1.1) = 1
1147 * Ex: ifloor(-1.1) = -2
1148 */
1149 LLVMValueRef
1150 lp_build_ifloor(struct lp_build_context *bld,
1151 LLVMValueRef a)
1152 {
1153 const struct lp_type type = bld->type;
1154 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1155 LLVMValueRef res;
1156
1157 assert(type.floating);
1158 assert(lp_check_value(type, a));
1159
1160 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1161 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1162 }
1163 else {
1164 /* Take the sign bit and add it to 1 constant */
1165 LLVMTypeRef vec_type = lp_build_vec_type(type);
1166 unsigned mantissa = lp_mantissa(type);
1167 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1168 LLVMValueRef sign;
1169 LLVMValueRef offset;
1170
1171 /* sign = a < 0 ? ~0 : 0 */
1172 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1173 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1174 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1175
1176 /* offset = -0.99999(9)f */
1177 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1178 offset = LLVMConstBitCast(offset, int_vec_type);
1179
1180 /* offset = a < 0 ? offset : 0.0f */
1181 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1182 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1183
1184 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1185 }
1186
1187 /* truncate (round toward zero) */
1188 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1189
1190 return res;
1191 }
1192
1193
1194 /**
1195 * Return ceiling of float (vector), returning int (vector).
1196 * Ex: iceil( 1.1) = 2
1197 * Ex: iceil(-1.1) = -1
1198 */
1199 LLVMValueRef
1200 lp_build_iceil(struct lp_build_context *bld,
1201 LLVMValueRef a)
1202 {
1203 const struct lp_type type = bld->type;
1204 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1205 LLVMValueRef res;
1206
1207 assert(type.floating);
1208 assert(lp_check_value(type, a));
1209
1210 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1211 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1212 }
1213 else {
1214 LLVMTypeRef vec_type = lp_build_vec_type(type);
1215 unsigned mantissa = lp_mantissa(type);
1216 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1217 LLVMValueRef sign;
1218 LLVMValueRef offset;
1219
1220 /* sign = a < 0 ? 0 : ~0 */
1221 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1222 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1223 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1224 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1225
1226 /* offset = 0.99999(9)f */
1227 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1228 offset = LLVMConstBitCast(offset, int_vec_type);
1229
1230 /* offset = a < 0 ? 0.0 : offset */
1231 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1232 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1233
1234 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1235 }
1236
1237 /* truncate (round toward zero) */
1238 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1239
1240 return res;
1241 }
1242
1243
1244 LLVMValueRef
1245 lp_build_sqrt(struct lp_build_context *bld,
1246 LLVMValueRef a)
1247 {
1248 const struct lp_type type = bld->type;
1249 LLVMTypeRef vec_type = lp_build_vec_type(type);
1250 char intrinsic[32];
1251
1252 assert(lp_check_value(type, a));
1253
1254 /* TODO: optimize the constant case */
1256
1257 assert(type.floating);
1258 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1259
1260 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1261 }
1262
1263
1264 /**
1265 * Do one Newton-Raphson step to improve reciprocal precision:
1266 *
1267 * x_{i+1} = x_i * (2 - a * x_i)
1268 *
1269 * See also:
1270 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1271 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1272 */
1273 static INLINE LLVMValueRef
1274 lp_build_rcp_refine(struct lp_build_context *bld,
1275 LLVMValueRef a,
1276 LLVMValueRef rcp_a)
1277 {
1278 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1279 LLVMValueRef res;
1280
1281 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1282 res = LLVMBuildFSub(bld->builder, two, res, "");
1283 res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
1284
1285 return res;
1286 }
1287
1288
1289 LLVMValueRef
1290 lp_build_rcp(struct lp_build_context *bld,
1291 LLVMValueRef a)
1292 {
1293 const struct lp_type type = bld->type;
1294
1295 assert(lp_check_value(type, a));
1296
1297 if(a == bld->zero)
1298 return bld->undef;
1299 if(a == bld->one)
1300 return bld->one;
1301 if(a == bld->undef)
1302 return bld->undef;
1303
1304 assert(type.floating);
1305
1306 if(LLVMIsConstant(a))
1307 return LLVMConstFDiv(bld->one, a);
1308
1309 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1310 LLVMValueRef res;
1311 unsigned i;
1312
1313 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1314
1315 for (i = 0; i < RCP_NEWTON_STEPS; ++i) {
1316 res = lp_build_rcp_refine(bld, a, res);
1317 }
1318
1319 return res;
1320 }
1321
1322 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1323 }
1324
1325
1326 /**
1327 * Do one Newton-Raphson step to improve rsqrt precision:
1328 *
1329 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1330 *
1331 * See also:
1332 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1333 */
1334 static INLINE LLVMValueRef
1335 lp_build_rsqrt_refine(struct lp_build_context *bld,
1336 LLVMValueRef a,
1337 LLVMValueRef rsqrt_a)
1338 {
1339 LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
1340 LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
1341 LLVMValueRef res;
1342
1343 res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
1344 res = LLVMBuildFMul(bld->builder, a, res, "");
1345 res = LLVMBuildFSub(bld->builder, three, res, "");
1346 res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
1347 res = LLVMBuildFMul(bld->builder, half, res, "");
1348
1349 return res;
1350 }
1351
1352
1353 /**
1354 * Generate 1/sqrt(a)
1355 */
1356 LLVMValueRef
1357 lp_build_rsqrt(struct lp_build_context *bld,
1358 LLVMValueRef a)
1359 {
1360 const struct lp_type type = bld->type;
1361
1362 assert(lp_check_value(type, a));
1363
1364 assert(type.floating);
1365
1366 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1367 LLVMValueRef res;
1368 unsigned i;
1369
1370 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1371
1372 for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) {
1373 res = lp_build_rsqrt_refine(bld, a, res);
1374 }
1375
1376 return res;
1377 }
1378
1379 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1380 }
1381
1382
1383 static inline LLVMValueRef
1384 lp_build_const_v4si(unsigned long value)
1385 {
1386 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1387 LLVMValueRef elements[4] = { element, element, element, element };
1388 return LLVMConstVector(elements, 4);
1389 }
1390
1391 static inline LLVMValueRef
1392 lp_build_const_v4sf(float value)
1393 {
1394 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1395 LLVMValueRef elements[4] = { element, element, element, element };
1396 return LLVMConstVector(elements, 4);
1397 }
1398
1399
1400 /**
1401 * Generate sin(a) using SSE2
1402 */
1403 LLVMValueRef
1404 lp_build_sin(struct lp_build_context *bld,
1405 LLVMValueRef a)
1406 {
1407 struct lp_type int_type = lp_int_type(bld->type);
1408 LLVMBuilderRef b = bld->builder;
1409 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1410 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1411
1412 /*
1413 * take the absolute value,
1414 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1415 */
1416
1417 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1418 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1419
1420 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1421 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1422
1423 /*
1424 * extract the sign bit (upper one)
1425 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1426 */
1427 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1428 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1429
1430 /*
1431 * scale by 4/Pi
1432 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1433 */
1434
1435 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1436 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1437
1438 /*
1439 * store the integer part of y in mm0
1440 * emm2 = _mm_cvttps_epi32(y);
1441 */
1442
1443 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1444
1445 /*
1446 * j=(j+1) & (~1) (see the cephes sources)
1447 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1448 */
1449
1450 LLVMValueRef all_one = lp_build_const_v4si(1);
1451 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1452 /*
1453 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1454 */
1455 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1456 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1457
1458 /*
1459 * y = _mm_cvtepi32_ps(emm2);
1460 */
1461 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1462
1463 /* get the swap sign flag
1464 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1465 */
1466 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1467 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1468
1469 /*
1470 * emm2 = _mm_slli_epi32(emm0, 29);
1471 */
1472 LLVMValueRef const_29 = lp_build_const_v4si(29);
1473 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1474
1475 /*
1476 * get the polynomial selection mask
1477 * there is one polynomial for 0 <= x <= Pi/4
1478 * and another one for Pi/4 < x <= Pi/2
1479 * Both branches will be computed.
1480 *
1481 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1482 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1483 */
1484
1485 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1486 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1487 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1488 emm2_3, lp_build_const_v4si(0));
1489 /*
1490 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1491 */
1492 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1493
1494 /*
1495 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1496 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1497 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1498 */
1499 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1500 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1501 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1502
1503 /*
1504 * The magic pass: "Extended precision modular arithmetic"
1505 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1506 * xmm1 = _mm_mul_ps(y, xmm1);
1507 * xmm2 = _mm_mul_ps(y, xmm2);
1508 * xmm3 = _mm_mul_ps(y, xmm3);
1509 */
1510 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1511 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1512 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1513
1514 /*
1515 * x = _mm_add_ps(x, xmm1);
1516 * x = _mm_add_ps(x, xmm2);
1517 * x = _mm_add_ps(x, xmm3);
1518 */
1519
1520 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1521 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1522 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1523
1524 /*
1525 * Evaluate the first polynomial (0 <= x <= Pi/4)
1526 *
1527 * z = _mm_mul_ps(x,x);
1528 */
1529 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1530
1531 /*
1532 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1533 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1534 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1535 */
1536 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1537 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1538 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1539
1540 /*
1541 * y = *(v4sf*)_ps_coscof_p0;
1542 * y = _mm_mul_ps(y, z);
1543 */
1544 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1545 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1546 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1547 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1548 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1549 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1550
1551
1552 /*
1553 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1554 * y = _mm_sub_ps(y, tmp);
1555 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1556 */
1557 LLVMValueRef half = lp_build_const_v4sf(0.5);
1558 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1559 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1560 LLVMValueRef one = lp_build_const_v4sf(1.0);
1561 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1562
1563 /*
1564 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1565 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1566 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1567 */
1568 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1569 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1570 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1571
1572 /*
1573 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1574 *
1575 * y2 = *(v4sf*)_ps_sincof_p0;
1576 * y2 = _mm_mul_ps(y2, z);
1577 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1578 * y2 = _mm_mul_ps(y2, z);
1579 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1580 * y2 = _mm_mul_ps(y2, z);
1581 * y2 = _mm_mul_ps(y2, x);
1582 * y2 = _mm_add_ps(y2, x);
1583 */
1584
1585 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1586 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1587 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1588 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1589 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1590 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1591 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1592
1593 /*
1594 * select the correct result from the two polynomials
1595 * xmm3 = poly_mask;
1596 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1597 * y = _mm_andnot_ps(xmm3, y);
1598 * y = _mm_add_ps(y,y2);
1599 */
1600 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1601 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1602 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1603 LLVMValueRef inv = lp_build_const_v4si(~0);
1604 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1605 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1606 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1607
1608 /*
1609 * update the sign
1610 * y = _mm_xor_ps(y, sign_bit);
1611 */
1612 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1613 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1614 return y_result;
1615 }
1616
1617
1618 /**
1619 * Generate cos(a) using SSE2
1620 */
1621 LLVMValueRef
1622 lp_build_cos(struct lp_build_context *bld,
1623 LLVMValueRef a)
1624 {
1625 struct lp_type int_type = lp_int_type(bld->type);
1626 LLVMBuilderRef b = bld->builder;
1627 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1628 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1629
1630 /*
1631 * take the absolute value,
1632 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1633 */
1634
1635 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1636 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1637
1638 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1639 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1640
1641 /*
1642 * scale by 4/Pi
1643 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1644 */
1645
1646 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1647 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1648
1649 /*
1650 * store the integer part of y in mm0
1651 * emm2 = _mm_cvttps_epi32(y);
1652 */
1653
1654 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1655
1656 /*
1657 * j=(j+1) & (~1) (see the cephes sources)
1658 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1659 */
1660
1661 LLVMValueRef all_one = lp_build_const_v4si(1);
1662 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1663 /*
1664 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1665 */
1666 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1667 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1668
1669 /*
1670 * y = _mm_cvtepi32_ps(emm2);
1671 */
1672 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1673
1674
1675 /*
1676 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1677 */
1678 LLVMValueRef const_2 = lp_build_const_v4si(2);
1679 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1680
1681
1682 /* get the swap sign flag
1683 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1684 */
1685 LLVMValueRef inv = lp_build_const_v4si(~0);
1686 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1687 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1688 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1689
1690 /*
1691 * emm2 = _mm_slli_epi32(emm0, 29);
1692 */
1693 LLVMValueRef const_29 = lp_build_const_v4si(29);
1694 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1695
1696 /*
1697 * get the polynomial selection mask
1698 * there is one polynomial for 0 <= x <= Pi/4
1699 * and another one for Pi/4 < x <= Pi/2
1700 * Both branches will be computed.
1701 *
1702 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1703 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1704 */
1705
1706 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1707 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1708 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1709 emm2_3, lp_build_const_v4si(0));
1710
1711 /*
1712 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1713 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1714 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1715 */
1716 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1717 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1718 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1719
1720 /*
1721 * The magic pass: "Extended precision modular arithmetic"
1722 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1723 * xmm1 = _mm_mul_ps(y, xmm1);
1724 * xmm2 = _mm_mul_ps(y, xmm2);
1725 * xmm3 = _mm_mul_ps(y, xmm3);
1726 */
1727 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1728 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1729 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1730
1731 /*
1732 * x = _mm_add_ps(x, xmm1);
1733 * x = _mm_add_ps(x, xmm2);
1734 * x = _mm_add_ps(x, xmm3);
1735 */
1736
1737 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1738 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1739 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1740
1741 /*
1742 * Evaluate the first polynomial (0 <= x <= Pi/4)
1743 *
1744 * z = _mm_mul_ps(x,x);
1745 */
1746 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1747
1748 /*
1749 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1750 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1751 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1752 */
1753 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1754 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1755 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1756
1757 /*
1758 * y = *(v4sf*)_ps_coscof_p0;
1759 * y = _mm_mul_ps(y, z);
1760 */
1761 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1762 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1763 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1764 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1765 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1766 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1767
1768
1769 /*
1770 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1771 * y = _mm_sub_ps(y, tmp);
1772 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1773 */
1774 LLVMValueRef half = lp_build_const_v4sf(0.5);
1775 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1776 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1777 LLVMValueRef one = lp_build_const_v4sf(1.0);
1778 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1779
1780 /*
1781 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1782 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1783 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1784 */
1785 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1786 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1787 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1788
1789 /*
1790 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1791 *
1792 * y2 = *(v4sf*)_ps_sincof_p0;
1793 * y2 = _mm_mul_ps(y2, z);
1794 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1795 * y2 = _mm_mul_ps(y2, z);
1796 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1797 * y2 = _mm_mul_ps(y2, z);
1798 * y2 = _mm_mul_ps(y2, x);
1799 * y2 = _mm_add_ps(y2, x);
1800 */
1801
1802 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1803 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1804 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1805 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1806 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1807 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1808 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1809
1810 /*
1811 * select the correct result from the two polynomials
1812 * xmm3 = poly_mask;
1813 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1814 * y = _mm_andnot_ps(xmm3, y);
1815 * y = _mm_add_ps(y,y2);
1816 */
1817 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1818 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1819 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1820 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1821 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1822 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1823
1824 /*
1825 * update the sign
1826 * y = _mm_xor_ps(y, sign_bit);
1827 */
1828 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1829 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1830 return y_result;
1831 }
1832
1833
1834 /**
1835 * Generate pow(x, y)
1836 */
1837 LLVMValueRef
1838 lp_build_pow(struct lp_build_context *bld,
1839 LLVMValueRef x,
1840 LLVMValueRef y)
1841 {
1842 /* TODO: optimize the constant case */
1843 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1844 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1845 __FUNCTION__);
1846
1847 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1848 }
1849
1850
1851 /**
1852 * Generate exp(x)
1853 */
1854 LLVMValueRef
1855 lp_build_exp(struct lp_build_context *bld,
1856 LLVMValueRef x)
1857 {
1858 /* log2(e) = 1/log(2) */
1859 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1860
1861 assert(lp_check_value(bld->type, x));
1862
1863 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1864 }
1865
1866
1867 /**
1868 * Generate log(x)
1869 */
1870 LLVMValueRef
1871 lp_build_log(struct lp_build_context *bld,
1872 LLVMValueRef x)
1873 {
1874 /* log(2) */
1875 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1876
1877 assert(lp_check_value(bld->type, x));
1878
1879 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1880 }
1881
1882
1883 /**
1884 * Generate polynomial.
1885 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1886 */
1887 static LLVMValueRef
1888 lp_build_polynomial(struct lp_build_context *bld,
1889 LLVMValueRef x,
1890 const double *coeffs,
1891 unsigned num_coeffs)
1892 {
1893 const struct lp_type type = bld->type;
1894 LLVMValueRef res = NULL;
1895 unsigned i;
1896
1897 assert(lp_check_value(bld->type, x));
1898
1899 /* TODO: optimize the constant case */
1900 if(LLVMIsConstant(x))
1901 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1902 __FUNCTION__);
1903
1904 for (i = num_coeffs; i--; ) {
1905 LLVMValueRef coeff;
1906
1907 coeff = lp_build_const_vec(type, coeffs[i]);
1908
1909 if(res)
1910 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1911 else
1912 res = coeff;
1913 }
1914
1915 if(res)
1916 return res;
1917 else
1918 return bld->undef;
1919 }
1920
1921
1922 /**
1923 * Minimax polynomial fit of 2**x, in range [0, 1[
1924 */
1925 const double lp_build_exp2_polynomial[] = {
1926 #if EXP_POLY_DEGREE == 5
1927 0.999999999690134838155,
1928 0.583974334321735217258,
1929 0.164553105719676828492,
1930 0.0292811063701710962255,
1931 0.00354944426657875141846,
1932 0.000296253726543423377365
1933 #elif EXP_POLY_DEGREE == 4
1934 1.00000001502262084505,
1935 0.563586057338685991394,
1936 0.150436017652442413623,
1937 0.0243220604213317927308,
1938 0.0025359088446580436489
1939 #elif EXP_POLY_DEGREE == 3
1940 0.999925218562710312959,
1941 0.695833540494823811697,
1942 0.226067155427249155588,
1943 0.0780245226406372992967
1944 #elif EXP_POLY_DEGREE == 2
1945 1.00172476321474503578,
1946 0.657636275736077639316,
1947 0.33718943461968720704
1948 #else
1949 #error
1950 #endif
1951 };
1952
1953
1954 void
1955 lp_build_exp2_approx(struct lp_build_context *bld,
1956 LLVMValueRef x,
1957 LLVMValueRef *p_exp2_int_part,
1958 LLVMValueRef *p_frac_part,
1959 LLVMValueRef *p_exp2)
1960 {
1961 const struct lp_type type = bld->type;
1962 LLVMTypeRef vec_type = lp_build_vec_type(type);
1963 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1964 LLVMValueRef ipart = NULL;
1965 LLVMValueRef fpart = NULL;
1966 LLVMValueRef expipart = NULL;
1967 LLVMValueRef expfpart = NULL;
1968 LLVMValueRef res = NULL;
1969
1970 assert(lp_check_value(bld->type, x));
1971
1972 if(p_exp2_int_part || p_frac_part || p_exp2) {
1973 /* TODO: optimize the constant case */
1974 if(LLVMIsConstant(x))
1975 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1976 __FUNCTION__);
1977
1978 assert(type.floating && type.width == 32);
1979
1980 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1981 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1982
1983 /* ipart = floor(x) */
1984 ipart = lp_build_floor(bld, x);
1985
1986 /* fpart = x - ipart */
1987 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
1988 }
1989
1990 if(p_exp2_int_part || p_exp2) {
1991 /* expipart = (float) (1 << ipart) */
1992 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1993 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1994 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1995 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1996 }
1997
1998 if(p_exp2) {
1999 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2000 Elements(lp_build_exp2_polynomial));
2001
2002 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2003 }
2004
2005 if(p_exp2_int_part)
2006 *p_exp2_int_part = expipart;
2007
2008 if(p_frac_part)
2009 *p_frac_part = fpart;
2010
2011 if(p_exp2)
2012 *p_exp2 = res;
2013 }
2014
2015
2016 LLVMValueRef
2017 lp_build_exp2(struct lp_build_context *bld,
2018 LLVMValueRef x)
2019 {
2020 LLVMValueRef res;
2021 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2022 return res;
2023 }
2024
2025
2026 /**
2027 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2028 * These coefficients can be generated with
2029 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2030 */
2031 const double lp_build_log2_polynomial[] = {
2032 #if LOG_POLY_DEGREE == 6
2033 3.11578814719469302614,
2034 -3.32419399085241980044,
2035 2.59883907202499966007,
2036 -1.23152682416275988241,
2037 0.318212422185251071475,
2038 -0.0344359067839062357313
2039 #elif LOG_POLY_DEGREE == 5
2040 2.8882704548164776201,
2041 -2.52074962577807006663,
2042 1.48116647521213171641,
2043 -0.465725644288844778798,
2044 0.0596515482674574969533
2045 #elif LOG_POLY_DEGREE == 4
2046 2.61761038894603480148,
2047 -1.75647175389045657003,
2048 0.688243882994381274313,
2049 -0.107254423828329604454
2050 #elif LOG_POLY_DEGREE == 3
2051 2.28330284476918490682,
2052 -1.04913055217340124191,
2053 0.204446009836232697516
2054 #else
2055 #error
2056 #endif
2057 };
2058
2059
2060 /**
2061 * See http://www.devmaster.net/forums/showthread.php?p=43580
2062 */
2063 void
2064 lp_build_log2_approx(struct lp_build_context *bld,
2065 LLVMValueRef x,
2066 LLVMValueRef *p_exp,
2067 LLVMValueRef *p_floor_log2,
2068 LLVMValueRef *p_log2)
2069 {
2070 const struct lp_type type = bld->type;
2071 LLVMTypeRef vec_type = lp_build_vec_type(type);
2072 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2073
2074 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2075 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2076 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2077
2078 LLVMValueRef i = NULL;
2079 LLVMValueRef exp = NULL;
2080 LLVMValueRef mant = NULL;
2081 LLVMValueRef logexp = NULL;
2082 LLVMValueRef logmant = NULL;
2083 LLVMValueRef res = NULL;
2084
2085 assert(lp_check_value(bld->type, x));
2086
2087 if(p_exp || p_floor_log2 || p_log2) {
2088 /* TODO: optimize the constant case */
2089 if(LLVMIsConstant(x))
2090 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2091 __FUNCTION__);
2092
2093 assert(type.floating && type.width == 32);
2094
2095 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2096
2097 /* exp = (float) exponent(x) */
2098 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2099 }
2100
2101 if(p_floor_log2 || p_log2) {
2102 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2103 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2104 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2105 }
2106
2107 if(p_log2) {
2108 /* mant = (float) mantissa(x) */
2109 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2110 mant = LLVMBuildOr(bld->builder, mant, one, "");
2111 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2112
2113 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2114 Elements(lp_build_log2_polynomial));
2115
2116 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2117 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2118
2119 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2120 }
2121
2122 if(p_exp) {
2123 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2124 *p_exp = exp;
2125 }
2126
2127 if(p_floor_log2)
2128 *p_floor_log2 = logexp;
2129
2130 if(p_log2)
2131 *p_log2 = res;
2132 }
2133
2134
2135 LLVMValueRef
2136 lp_build_log2(struct lp_build_context *bld,
2137 LLVMValueRef x)
2138 {
2139 LLVMValueRef res;
2140 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2141 return res;
2142 }