Merge remote branch 'origin/nv50-compiler'
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_debug.h"
60 #include "lp_bld_arit.h"
61
62
63 #define EXP_POLY_DEGREE 3
64
65 #define LOG_POLY_DEGREE 5
66
67
68 /**
69 * Generate min(a, b)
70 * No checks for special case values of a or b = 1 or 0 are done.
71 */
72 static LLVMValueRef
73 lp_build_min_simple(struct lp_build_context *bld,
74 LLVMValueRef a,
75 LLVMValueRef b)
76 {
77 const struct lp_type type = bld->type;
78 const char *intrinsic = NULL;
79 LLVMValueRef cond;
80
81 assert(lp_check_value(type, a));
82 assert(lp_check_value(type, b));
83
84 /* TODO: optimize the constant case */
85
86 if(type.width * type.length == 128) {
87 if(type.floating) {
88 if(type.width == 32 && util_cpu_caps.has_sse)
89 intrinsic = "llvm.x86.sse.min.ps";
90 if(type.width == 64 && util_cpu_caps.has_sse2)
91 intrinsic = "llvm.x86.sse2.min.pd";
92 }
93 else {
94 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
95 intrinsic = "llvm.x86.sse2.pminu.b";
96 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
97 intrinsic = "llvm.x86.sse41.pminsb";
98 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
99 intrinsic = "llvm.x86.sse41.pminuw";
100 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
101 intrinsic = "llvm.x86.sse2.pmins.w";
102 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
103 intrinsic = "llvm.x86.sse41.pminud";
104 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
105 intrinsic = "llvm.x86.sse41.pminsd";
106 }
107 }
108
109 if(intrinsic)
110 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
111
112 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
113 return lp_build_select(bld, cond, a, b);
114 }
115
116
117 /**
118 * Generate max(a, b)
119 * No checks for special case values of a or b = 1 or 0 are done.
120 */
121 static LLVMValueRef
122 lp_build_max_simple(struct lp_build_context *bld,
123 LLVMValueRef a,
124 LLVMValueRef b)
125 {
126 const struct lp_type type = bld->type;
127 const char *intrinsic = NULL;
128 LLVMValueRef cond;
129
130 assert(lp_check_value(type, a));
131 assert(lp_check_value(type, b));
132
133 /* TODO: optimize the constant case */
134
135 if(type.width * type.length == 128) {
136 if(type.floating) {
137 if(type.width == 32 && util_cpu_caps.has_sse)
138 intrinsic = "llvm.x86.sse.max.ps";
139 if(type.width == 64 && util_cpu_caps.has_sse2)
140 intrinsic = "llvm.x86.sse2.max.pd";
141 }
142 else {
143 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
144 intrinsic = "llvm.x86.sse2.pmaxu.b";
145 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
146 intrinsic = "llvm.x86.sse41.pmaxsb";
147 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
148 intrinsic = "llvm.x86.sse41.pmaxuw";
149 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
150 intrinsic = "llvm.x86.sse2.pmaxs.w";
151 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
152 intrinsic = "llvm.x86.sse41.pmaxud";
153 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
154 intrinsic = "llvm.x86.sse41.pmaxsd";
155 }
156 }
157
158 if(intrinsic)
159 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
160
161 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
162 return lp_build_select(bld, cond, a, b);
163 }
164
165
166 /**
167 * Generate 1 - a, or ~a depending on bld->type.
168 */
169 LLVMValueRef
170 lp_build_comp(struct lp_build_context *bld,
171 LLVMValueRef a)
172 {
173 const struct lp_type type = bld->type;
174
175 assert(lp_check_value(type, a));
176
177 if(a == bld->one)
178 return bld->zero;
179 if(a == bld->zero)
180 return bld->one;
181
182 if(type.norm && !type.floating && !type.fixed && !type.sign) {
183 if(LLVMIsConstant(a))
184 return LLVMConstNot(a);
185 else
186 return LLVMBuildNot(bld->builder, a, "");
187 }
188
189 if(LLVMIsConstant(a))
190 if (type.floating)
191 return LLVMConstFSub(bld->one, a);
192 else
193 return LLVMConstSub(bld->one, a);
194 else
195 if (type.floating)
196 return LLVMBuildFSub(bld->builder, bld->one, a, "");
197 else
198 return LLVMBuildSub(bld->builder, bld->one, a, "");
199 }
200
201
202 /**
203 * Generate a + b
204 */
205 LLVMValueRef
206 lp_build_add(struct lp_build_context *bld,
207 LLVMValueRef a,
208 LLVMValueRef b)
209 {
210 const struct lp_type type = bld->type;
211 LLVMValueRef res;
212
213 assert(lp_check_value(type, a));
214 assert(lp_check_value(type, b));
215
216 if(a == bld->zero)
217 return b;
218 if(b == bld->zero)
219 return a;
220 if(a == bld->undef || b == bld->undef)
221 return bld->undef;
222
223 if(bld->type.norm) {
224 const char *intrinsic = NULL;
225
226 if(a == bld->one || b == bld->one)
227 return bld->one;
228
229 if(util_cpu_caps.has_sse2 &&
230 type.width * type.length == 128 &&
231 !type.floating && !type.fixed) {
232 if(type.width == 8)
233 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
234 if(type.width == 16)
235 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
236 }
237
238 if(intrinsic)
239 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
240 }
241
242 if(LLVMIsConstant(a) && LLVMIsConstant(b))
243 if (type.floating)
244 res = LLVMConstFAdd(a, b);
245 else
246 res = LLVMConstAdd(a, b);
247 else
248 if (type.floating)
249 res = LLVMBuildFAdd(bld->builder, a, b, "");
250 else
251 res = LLVMBuildAdd(bld->builder, a, b, "");
252
253 /* clamp to ceiling of 1.0 */
254 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
255 res = lp_build_min_simple(bld, res, bld->one);
256
257 /* XXX clamp to floor of -1 or 0??? */
258
259 return res;
260 }
261
262
263 /** Return the scalar sum of the elements of a */
264 LLVMValueRef
265 lp_build_sum_vector(struct lp_build_context *bld,
266 LLVMValueRef a)
267 {
268 const struct lp_type type = bld->type;
269 LLVMValueRef index, res;
270 unsigned i;
271
272 assert(lp_check_value(type, a));
273
274 if (type.length == 1) {
275 return a;
276 }
277
278 assert(!bld->type.norm);
279
280 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
281 res = LLVMBuildExtractElement(bld->builder, a, index, "");
282
283 for (i = 1; i < type.length; i++) {
284 index = LLVMConstInt(LLVMInt32Type(), i, 0);
285 if (type.floating)
286 res = LLVMBuildFAdd(bld->builder, res,
287 LLVMBuildExtractElement(bld->builder,
288 a, index, ""),
289 "");
290 else
291 res = LLVMBuildAdd(bld->builder, res,
292 LLVMBuildExtractElement(bld->builder,
293 a, index, ""),
294 "");
295 }
296
297 return res;
298 }
299
300
301 /**
302 * Generate a - b
303 */
304 LLVMValueRef
305 lp_build_sub(struct lp_build_context *bld,
306 LLVMValueRef a,
307 LLVMValueRef b)
308 {
309 const struct lp_type type = bld->type;
310 LLVMValueRef res;
311
312 assert(lp_check_value(type, a));
313 assert(lp_check_value(type, b));
314
315 if(b == bld->zero)
316 return a;
317 if(a == bld->undef || b == bld->undef)
318 return bld->undef;
319 if(a == b)
320 return bld->zero;
321
322 if(bld->type.norm) {
323 const char *intrinsic = NULL;
324
325 if(b == bld->one)
326 return bld->zero;
327
328 if(util_cpu_caps.has_sse2 &&
329 type.width * type.length == 128 &&
330 !type.floating && !type.fixed) {
331 if(type.width == 8)
332 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
333 if(type.width == 16)
334 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
335 }
336
337 if(intrinsic)
338 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
339 }
340
341 if(LLVMIsConstant(a) && LLVMIsConstant(b))
342 if (type.floating)
343 res = LLVMConstFSub(a, b);
344 else
345 res = LLVMConstSub(a, b);
346 else
347 if (type.floating)
348 res = LLVMBuildFSub(bld->builder, a, b, "");
349 else
350 res = LLVMBuildSub(bld->builder, a, b, "");
351
352 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
353 res = lp_build_max_simple(bld, res, bld->zero);
354
355 return res;
356 }
357
358
359 /**
360 * Normalized 8bit multiplication.
361 *
362 * - alpha plus one
363 *
364 * makes the following approximation to the division (Sree)
365 *
366 * a*b/255 ~= (a*(b + 1)) >> 256
367 *
368 * which is the fastest method that satisfies the following OpenGL criteria
369 *
370 * 0*0 = 0 and 255*255 = 255
371 *
372 * - geometric series
373 *
374 * takes the geometric series approximation to the division
375 *
376 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
377 *
378 * in this case just the first two terms to fit in 16bit arithmetic
379 *
380 * t/255 ~= (t + (t >> 8)) >> 8
381 *
382 * note that just by itself it doesn't satisfies the OpenGL criteria, as
383 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
384 * must be used
385 *
386 * - geometric series plus rounding
387 *
388 * when using a geometric series division instead of truncating the result
389 * use roundoff in the approximation (Jim Blinn)
390 *
391 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
392 *
393 * achieving the exact results
394 *
395 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
396 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
397 * @sa Michael Herf, The "double blend trick", May 2000,
398 * http://www.stereopsis.com/doubleblend.html
399 */
400 static LLVMValueRef
401 lp_build_mul_u8n(LLVMBuilderRef builder,
402 struct lp_type i16_type,
403 LLVMValueRef a, LLVMValueRef b)
404 {
405 LLVMValueRef c8;
406 LLVMValueRef ab;
407
408 assert(!i16_type.floating);
409 assert(lp_check_value(i16_type, a));
410 assert(lp_check_value(i16_type, b));
411
412 c8 = lp_build_const_int_vec(i16_type, 8);
413
414 #if 0
415
416 /* a*b/255 ~= (a*(b + 1)) >> 256 */
417 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
418 ab = LLVMBuildMul(builder, a, b, "");
419
420 #else
421
422 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
423 ab = LLVMBuildMul(builder, a, b, "");
424 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
425 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
426
427 #endif
428
429 ab = LLVMBuildLShr(builder, ab, c8, "");
430
431 return ab;
432 }
433
434
435 /**
436 * Generate a * b
437 */
438 LLVMValueRef
439 lp_build_mul(struct lp_build_context *bld,
440 LLVMValueRef a,
441 LLVMValueRef b)
442 {
443 const struct lp_type type = bld->type;
444 LLVMValueRef shift;
445 LLVMValueRef res;
446
447 assert(lp_check_value(type, a));
448 assert(lp_check_value(type, b));
449
450 if(a == bld->zero)
451 return bld->zero;
452 if(a == bld->one)
453 return b;
454 if(b == bld->zero)
455 return bld->zero;
456 if(b == bld->one)
457 return a;
458 if(a == bld->undef || b == bld->undef)
459 return bld->undef;
460
461 if(!type.floating && !type.fixed && type.norm) {
462 if(type.width == 8) {
463 struct lp_type i16_type = lp_wider_type(type);
464 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
465
466 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
467 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
468
469 /* PMULLW, PSRLW, PADDW */
470 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
471 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
472
473 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
474
475 return ab;
476 }
477
478 /* FIXME */
479 assert(0);
480 }
481
482 if(type.fixed)
483 shift = lp_build_const_int_vec(type, type.width/2);
484 else
485 shift = NULL;
486
487 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
488 if (type.floating)
489 res = LLVMConstFMul(a, b);
490 else
491 res = LLVMConstMul(a, b);
492 if(shift) {
493 if(type.sign)
494 res = LLVMConstAShr(res, shift);
495 else
496 res = LLVMConstLShr(res, shift);
497 }
498 }
499 else {
500 if (type.floating)
501 res = LLVMBuildFMul(bld->builder, a, b, "");
502 else
503 res = LLVMBuildMul(bld->builder, a, b, "");
504 if(shift) {
505 if(type.sign)
506 res = LLVMBuildAShr(bld->builder, res, shift, "");
507 else
508 res = LLVMBuildLShr(bld->builder, res, shift, "");
509 }
510 }
511
512 return res;
513 }
514
515
516 /**
517 * Small vector x scale multiplication optimization.
518 */
519 LLVMValueRef
520 lp_build_mul_imm(struct lp_build_context *bld,
521 LLVMValueRef a,
522 int b)
523 {
524 LLVMValueRef factor;
525
526 assert(lp_check_value(bld->type, a));
527
528 if(b == 0)
529 return bld->zero;
530
531 if(b == 1)
532 return a;
533
534 if(b == -1)
535 return lp_build_negate(bld, a);
536
537 if(b == 2 && bld->type.floating)
538 return lp_build_add(bld, a, a);
539
540 if(util_is_power_of_two(b)) {
541 unsigned shift = ffs(b) - 1;
542
543 if(bld->type.floating) {
544 #if 0
545 /*
546 * Power of two multiplication by directly manipulating the mantissa.
547 *
548 * XXX: This might not be always faster, it will introduce a small error
549 * for multiplication by zero, and it will produce wrong results
550 * for Inf and NaN.
551 */
552 unsigned mantissa = lp_mantissa(bld->type);
553 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
554 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
555 a = LLVMBuildAdd(bld->builder, a, factor, "");
556 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
557 return a;
558 #endif
559 }
560 else {
561 factor = lp_build_const_vec(bld->type, shift);
562 return LLVMBuildShl(bld->builder, a, factor, "");
563 }
564 }
565
566 factor = lp_build_const_vec(bld->type, (double)b);
567 return lp_build_mul(bld, a, factor);
568 }
569
570
571 /**
572 * Generate a / b
573 */
574 LLVMValueRef
575 lp_build_div(struct lp_build_context *bld,
576 LLVMValueRef a,
577 LLVMValueRef b)
578 {
579 const struct lp_type type = bld->type;
580
581 assert(lp_check_value(type, a));
582 assert(lp_check_value(type, b));
583
584 if(a == bld->zero)
585 return bld->zero;
586 if(a == bld->one)
587 return lp_build_rcp(bld, b);
588 if(b == bld->zero)
589 return bld->undef;
590 if(b == bld->one)
591 return a;
592 if(a == bld->undef || b == bld->undef)
593 return bld->undef;
594
595 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
596 if (type.floating)
597 return LLVMConstFDiv(a, b);
598 else if (type.sign)
599 return LLVMConstSDiv(a, b);
600 else
601 return LLVMConstUDiv(a, b);
602 }
603
604 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
605 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
606
607 if (type.floating)
608 return LLVMBuildFDiv(bld->builder, a, b, "");
609 else if (type.sign)
610 return LLVMBuildSDiv(bld->builder, a, b, "");
611 else
612 return LLVMBuildUDiv(bld->builder, a, b, "");
613 }
614
615
616 /**
617 * Linear interpolation.
618 *
619 * This also works for integer values with a few caveats.
620 *
621 * @sa http://www.stereopsis.com/doubleblend.html
622 */
623 LLVMValueRef
624 lp_build_lerp(struct lp_build_context *bld,
625 LLVMValueRef x,
626 LLVMValueRef v0,
627 LLVMValueRef v1)
628 {
629 LLVMValueRef delta;
630 LLVMValueRef res;
631
632 assert(lp_check_value(bld->type, x));
633 assert(lp_check_value(bld->type, v0));
634 assert(lp_check_value(bld->type, v1));
635
636 delta = lp_build_sub(bld, v1, v0);
637
638 res = lp_build_mul(bld, x, delta);
639
640 res = lp_build_add(bld, v0, res);
641
642 if(bld->type.fixed)
643 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
644 * but it will be wrong for other uses. Basically we need a more
645 * powerful lp_type, capable of further distinguishing the values
646 * interpretation from the value storage. */
647 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
648
649 return res;
650 }
651
652
653 LLVMValueRef
654 lp_build_lerp_2d(struct lp_build_context *bld,
655 LLVMValueRef x,
656 LLVMValueRef y,
657 LLVMValueRef v00,
658 LLVMValueRef v01,
659 LLVMValueRef v10,
660 LLVMValueRef v11)
661 {
662 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
663 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
664 return lp_build_lerp(bld, y, v0, v1);
665 }
666
667
668 /**
669 * Generate min(a, b)
670 * Do checks for special cases.
671 */
672 LLVMValueRef
673 lp_build_min(struct lp_build_context *bld,
674 LLVMValueRef a,
675 LLVMValueRef b)
676 {
677 assert(lp_check_value(bld->type, a));
678 assert(lp_check_value(bld->type, b));
679
680 if(a == bld->undef || b == bld->undef)
681 return bld->undef;
682
683 if(a == b)
684 return a;
685
686 if(bld->type.norm) {
687 if(a == bld->zero || b == bld->zero)
688 return bld->zero;
689 if(a == bld->one)
690 return b;
691 if(b == bld->one)
692 return a;
693 }
694
695 return lp_build_min_simple(bld, a, b);
696 }
697
698
699 /**
700 * Generate max(a, b)
701 * Do checks for special cases.
702 */
703 LLVMValueRef
704 lp_build_max(struct lp_build_context *bld,
705 LLVMValueRef a,
706 LLVMValueRef b)
707 {
708 assert(lp_check_value(bld->type, a));
709 assert(lp_check_value(bld->type, b));
710
711 if(a == bld->undef || b == bld->undef)
712 return bld->undef;
713
714 if(a == b)
715 return a;
716
717 if(bld->type.norm) {
718 if(a == bld->one || b == bld->one)
719 return bld->one;
720 if(a == bld->zero)
721 return b;
722 if(b == bld->zero)
723 return a;
724 }
725
726 return lp_build_max_simple(bld, a, b);
727 }
728
729
730 /**
731 * Generate clamp(a, min, max)
732 * Do checks for special cases.
733 */
734 LLVMValueRef
735 lp_build_clamp(struct lp_build_context *bld,
736 LLVMValueRef a,
737 LLVMValueRef min,
738 LLVMValueRef max)
739 {
740 assert(lp_check_value(bld->type, a));
741 assert(lp_check_value(bld->type, min));
742 assert(lp_check_value(bld->type, max));
743
744 a = lp_build_min(bld, a, max);
745 a = lp_build_max(bld, a, min);
746 return a;
747 }
748
749
750 /**
751 * Generate abs(a)
752 */
753 LLVMValueRef
754 lp_build_abs(struct lp_build_context *bld,
755 LLVMValueRef a)
756 {
757 const struct lp_type type = bld->type;
758 LLVMTypeRef vec_type = lp_build_vec_type(type);
759
760 assert(lp_check_value(type, a));
761
762 if(!type.sign)
763 return a;
764
765 if(type.floating) {
766 /* Mask out the sign bit */
767 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
768 unsigned long long absMask = ~(1ULL << (type.width - 1));
769 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
770 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
771 a = LLVMBuildAnd(bld->builder, a, mask, "");
772 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
773 return a;
774 }
775
776 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
777 switch(type.width) {
778 case 8:
779 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
780 case 16:
781 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
782 case 32:
783 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
784 }
785 }
786
787 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
788 }
789
790
791 LLVMValueRef
792 lp_build_negate(struct lp_build_context *bld,
793 LLVMValueRef a)
794 {
795 assert(lp_check_value(bld->type, a));
796
797 #if HAVE_LLVM >= 0x0207
798 if (bld->type.floating)
799 a = LLVMBuildFNeg(bld->builder, a, "");
800 else
801 #endif
802 a = LLVMBuildNeg(bld->builder, a, "");
803
804 return a;
805 }
806
807
808 /** Return -1, 0 or +1 depending on the sign of a */
809 LLVMValueRef
810 lp_build_sgn(struct lp_build_context *bld,
811 LLVMValueRef a)
812 {
813 const struct lp_type type = bld->type;
814 LLVMValueRef cond;
815 LLVMValueRef res;
816
817 assert(lp_check_value(type, a));
818
819 /* Handle non-zero case */
820 if(!type.sign) {
821 /* if not zero then sign must be positive */
822 res = bld->one;
823 }
824 else if(type.floating) {
825 LLVMTypeRef vec_type;
826 LLVMTypeRef int_type;
827 LLVMValueRef mask;
828 LLVMValueRef sign;
829 LLVMValueRef one;
830 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
831
832 int_type = lp_build_int_vec_type(type);
833 vec_type = lp_build_vec_type(type);
834 mask = lp_build_const_int_vec(type, maskBit);
835
836 /* Take the sign bit and add it to 1 constant */
837 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
838 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
839 one = LLVMConstBitCast(bld->one, int_type);
840 res = LLVMBuildOr(bld->builder, sign, one, "");
841 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
842 }
843 else
844 {
845 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
846 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
847 res = lp_build_select(bld, cond, bld->one, minus_one);
848 }
849
850 /* Handle zero */
851 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
852 res = lp_build_select(bld, cond, bld->zero, res);
853
854 return res;
855 }
856
857
858 /**
859 * Set the sign of float vector 'a' according to 'sign'.
860 * If sign==0, return abs(a).
861 * If sign==1, return -abs(a);
862 * Other values for sign produce undefined results.
863 */
864 LLVMValueRef
865 lp_build_set_sign(struct lp_build_context *bld,
866 LLVMValueRef a, LLVMValueRef sign)
867 {
868 const struct lp_type type = bld->type;
869 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
870 LLVMTypeRef vec_type = lp_build_vec_type(type);
871 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
872 LLVMValueRef mask = lp_build_const_int_vec(type,
873 ~((unsigned long long) 1 << (type.width - 1)));
874 LLVMValueRef val, res;
875
876 assert(type.floating);
877 assert(lp_check_value(type, a));
878
879 /* val = reinterpret_cast<int>(a) */
880 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
881 /* val = val & mask */
882 val = LLVMBuildAnd(bld->builder, val, mask, "");
883 /* sign = sign << shift */
884 sign = LLVMBuildShl(bld->builder, sign, shift, "");
885 /* res = val | sign */
886 res = LLVMBuildOr(bld->builder, val, sign, "");
887 /* res = reinterpret_cast<float>(res) */
888 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
889
890 return res;
891 }
892
893
894 /**
895 * Convert vector of (or scalar) int to vector of (or scalar) float.
896 */
897 LLVMValueRef
898 lp_build_int_to_float(struct lp_build_context *bld,
899 LLVMValueRef a)
900 {
901 const struct lp_type type = bld->type;
902 LLVMTypeRef vec_type = lp_build_vec_type(type);
903
904 assert(type.floating);
905
906 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
907 }
908
909
910
/**
 * Rounding modes accepted by lp_build_round_sse41().
 *
 * The numeric value is passed unchanged as the second operand of the
 * llvm.x86.sse41.round.ps/pd intrinsics, so the values must not change.
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST  = 0x0,
   LP_BUILD_ROUND_SSE41_FLOOR    = 0x1,
   LP_BUILD_ROUND_SSE41_CEIL     = 0x2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 0x3
};
918
919
920 static INLINE LLVMValueRef
921 lp_build_round_sse41(struct lp_build_context *bld,
922 LLVMValueRef a,
923 enum lp_build_round_sse41_mode mode)
924 {
925 const struct lp_type type = bld->type;
926 LLVMTypeRef vec_type = lp_build_vec_type(type);
927 const char *intrinsic;
928
929 assert(type.floating);
930 assert(type.width*type.length == 128);
931 assert(lp_check_value(type, a));
932 assert(util_cpu_caps.has_sse4_1);
933
934 switch(type.width) {
935 case 32:
936 intrinsic = "llvm.x86.sse41.round.ps";
937 break;
938 case 64:
939 intrinsic = "llvm.x86.sse41.round.pd";
940 break;
941 default:
942 assert(0);
943 return bld->undef;
944 }
945
946 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
947 LLVMConstInt(LLVMInt32Type(), mode, 0));
948 }
949
950
951 /**
952 * Return the integer part of a float (vector) value. The returned value is
953 * a float (vector).
954 * Ex: trunc(-1.5) = 1.0
955 */
956 LLVMValueRef
957 lp_build_trunc(struct lp_build_context *bld,
958 LLVMValueRef a)
959 {
960 const struct lp_type type = bld->type;
961
962 assert(type.floating);
963 assert(lp_check_value(type, a));
964
965 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
966 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
967 else {
968 LLVMTypeRef vec_type = lp_build_vec_type(type);
969 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
970 LLVMValueRef res;
971 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
972 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
973 return res;
974 }
975 }
976
977
978 /**
979 * Return float (vector) rounded to nearest integer (vector). The returned
980 * value is a float (vector).
981 * Ex: round(0.9) = 1.0
982 * Ex: round(-1.5) = -2.0
983 */
984 LLVMValueRef
985 lp_build_round(struct lp_build_context *bld,
986 LLVMValueRef a)
987 {
988 const struct lp_type type = bld->type;
989
990 assert(type.floating);
991 assert(lp_check_value(type, a));
992
993 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
994 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
995 else {
996 LLVMTypeRef vec_type = lp_build_vec_type(type);
997 LLVMValueRef res;
998 res = lp_build_iround(bld, a);
999 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1000 return res;
1001 }
1002 }
1003
1004
1005 /**
1006 * Return floor of float (vector), result is a float (vector)
1007 * Ex: floor(1.1) = 1.0
1008 * Ex: floor(-1.1) = -2.0
1009 */
1010 LLVMValueRef
1011 lp_build_floor(struct lp_build_context *bld,
1012 LLVMValueRef a)
1013 {
1014 const struct lp_type type = bld->type;
1015
1016 assert(type.floating);
1017 assert(lp_check_value(type, a));
1018
1019 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1020 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1021 else {
1022 LLVMTypeRef vec_type = lp_build_vec_type(type);
1023 LLVMValueRef res;
1024 res = lp_build_ifloor(bld, a);
1025 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1026 return res;
1027 }
1028 }
1029
1030
1031 /**
1032 * Return ceiling of float (vector), returning float (vector).
1033 * Ex: ceil( 1.1) = 2.0
1034 * Ex: ceil(-1.1) = -1.0
1035 */
1036 LLVMValueRef
1037 lp_build_ceil(struct lp_build_context *bld,
1038 LLVMValueRef a)
1039 {
1040 const struct lp_type type = bld->type;
1041
1042 assert(type.floating);
1043 assert(lp_check_value(type, a));
1044
1045 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1046 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1047 else {
1048 LLVMTypeRef vec_type = lp_build_vec_type(type);
1049 LLVMValueRef res;
1050 res = lp_build_iceil(bld, a);
1051 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1052 return res;
1053 }
1054 }
1055
1056
1057 /**
1058 * Return fractional part of 'a' computed as a - floor(a)
1059 * Typically used in texture coord arithmetic.
1060 */
1061 LLVMValueRef
1062 lp_build_fract(struct lp_build_context *bld,
1063 LLVMValueRef a)
1064 {
1065 assert(bld->type.floating);
1066 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1067 }
1068
1069
1070 /**
1071 * Return the integer part of a float (vector) value. The returned value is
1072 * an integer (vector).
1073 * Ex: itrunc(-1.5) = 1
1074 */
1075 LLVMValueRef
1076 lp_build_itrunc(struct lp_build_context *bld,
1077 LLVMValueRef a)
1078 {
1079 const struct lp_type type = bld->type;
1080 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1081
1082 assert(type.floating);
1083 assert(lp_check_value(type, a));
1084
1085 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1086 }
1087
1088
1089 /**
1090 * Return float (vector) rounded to nearest integer (vector). The returned
1091 * value is an integer (vector).
1092 * Ex: iround(0.9) = 1
1093 * Ex: iround(-1.5) = -2
1094 */
1095 LLVMValueRef
1096 lp_build_iround(struct lp_build_context *bld,
1097 LLVMValueRef a)
1098 {
1099 const struct lp_type type = bld->type;
1100 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1101 LLVMValueRef res;
1102
1103 assert(type.floating);
1104
1105 assert(lp_check_value(type, a));
1106
1107 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1108 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1109 }
1110 else {
1111 LLVMTypeRef vec_type = lp_build_vec_type(type);
1112 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1113 LLVMValueRef sign;
1114 LLVMValueRef half;
1115
1116 /* get sign bit */
1117 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1118 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1119
1120 /* sign * 0.5 */
1121 half = lp_build_const_vec(type, 0.5);
1122 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1123 half = LLVMBuildOr(bld->builder, sign, half, "");
1124 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1125
1126 res = LLVMBuildFAdd(bld->builder, a, half, "");
1127 }
1128
1129 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1130
1131 return res;
1132 }
1133
1134
1135 /**
1136 * Return floor of float (vector), result is an int (vector)
1137 * Ex: ifloor(1.1) = 1.0
1138 * Ex: ifloor(-1.1) = -2.0
1139 */
1140 LLVMValueRef
1141 lp_build_ifloor(struct lp_build_context *bld,
1142 LLVMValueRef a)
1143 {
1144 const struct lp_type type = bld->type;
1145 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1146 LLVMValueRef res;
1147
1148 assert(type.floating);
1149 assert(lp_check_value(type, a));
1150
1151 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1152 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1153 }
1154 else {
1155 /* Take the sign bit and add it to 1 constant */
1156 LLVMTypeRef vec_type = lp_build_vec_type(type);
1157 unsigned mantissa = lp_mantissa(type);
1158 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1159 LLVMValueRef sign;
1160 LLVMValueRef offset;
1161
1162 /* sign = a < 0 ? ~0 : 0 */
1163 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1164 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1165 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1166
1167 /* offset = -0.99999(9)f */
1168 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1169 offset = LLVMConstBitCast(offset, int_vec_type);
1170
1171 /* offset = a < 0 ? offset : 0.0f */
1172 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1173 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1174
1175 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1176 }
1177
1178 /* round to nearest (toward zero) */
1179 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1180
1181 return res;
1182 }
1183
1184
1185 /**
1186 * Return ceiling of float (vector), returning int (vector).
1187 * Ex: iceil( 1.1) = 2
1188 * Ex: iceil(-1.1) = -1
1189 */
1190 LLVMValueRef
1191 lp_build_iceil(struct lp_build_context *bld,
1192 LLVMValueRef a)
1193 {
1194 const struct lp_type type = bld->type;
1195 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1196 LLVMValueRef res;
1197
1198 assert(type.floating);
1199 assert(lp_check_value(type, a));
1200
1201 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1202 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1203 }
1204 else {
1205 LLVMTypeRef vec_type = lp_build_vec_type(type);
1206 unsigned mantissa = lp_mantissa(type);
1207 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1208 LLVMValueRef sign;
1209 LLVMValueRef offset;
1210
1211 /* sign = a < 0 ? 0 : ~0 */
1212 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1213 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1214 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1215 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1216
1217 /* offset = 0.99999(9)f */
1218 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1219 offset = LLVMConstBitCast(offset, int_vec_type);
1220
1221 /* offset = a < 0 ? 0.0 : offset */
1222 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1223 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1224
1225 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1226 }
1227
1228 /* round to nearest (toward zero) */
1229 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1230
1231 return res;
1232 }
1233
1234
1235 LLVMValueRef
1236 lp_build_sqrt(struct lp_build_context *bld,
1237 LLVMValueRef a)
1238 {
1239 const struct lp_type type = bld->type;
1240 LLVMTypeRef vec_type = lp_build_vec_type(type);
1241 char intrinsic[32];
1242
1243 assert(lp_check_value(type, a));
1244
1245 /* TODO: optimize the constant case */
1246 /* TODO: optimize the constant case */
1247
1248 assert(type.floating);
1249 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1250
1251 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1252 }
1253
1254
1255 /**
1256 * Do one Newton-Raphson step to improve reciprocate precision:
1257 *
1258 * x_{i+1} = x_i * (2 - a * x_i)
1259 *
1260 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1261 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1262 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's
1263 * halo. It would be necessary to clamp the argument to prevent this.
1264 *
1265 * See also:
1266 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1267 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1268 */
1269 static INLINE LLVMValueRef
1270 lp_build_rcp_refine(struct lp_build_context *bld,
1271 LLVMValueRef a,
1272 LLVMValueRef rcp_a)
1273 {
1274 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1275 LLVMValueRef res;
1276
1277 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1278 res = LLVMBuildFSub(bld->builder, two, res, "");
1279 res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
1280
1281 return res;
1282 }
1283
1284
1285 LLVMValueRef
1286 lp_build_rcp(struct lp_build_context *bld,
1287 LLVMValueRef a)
1288 {
1289 const struct lp_type type = bld->type;
1290
1291 assert(lp_check_value(type, a));
1292
1293 if(a == bld->zero)
1294 return bld->undef;
1295 if(a == bld->one)
1296 return bld->one;
1297 if(a == bld->undef)
1298 return bld->undef;
1299
1300 assert(type.floating);
1301
1302 if(LLVMIsConstant(a))
1303 return LLVMConstFDiv(bld->one, a);
1304
1305 /*
1306 * We don't use RCPPS because:
1307 * - it only has 10bits of precision
1308 * - it doesn't even get the reciprocate of 1.0 exactly
1309 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf
1310 * - for recent processors the benefit over DIVPS is marginal, a case
1311 * depedent
1312 *
1313 * We could still use it on certain processors if benchmarks show that the
1314 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for
1315 * particular uses that require less workarounds.
1316 */
1317
1318 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1319 const unsigned num_iterations = 0;
1320 LLVMValueRef res;
1321 unsigned i;
1322
1323 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1324
1325 for (i = 0; i < num_iterations; ++i) {
1326 res = lp_build_rcp_refine(bld, a, res);
1327 }
1328
1329 return res;
1330 }
1331
1332 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1333 }
1334
1335
1336 /**
1337 * Do one Newton-Raphson step to improve rsqrt precision:
1338 *
1339 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1340 *
1341 * See also:
1342 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1343 */
1344 static INLINE LLVMValueRef
1345 lp_build_rsqrt_refine(struct lp_build_context *bld,
1346 LLVMValueRef a,
1347 LLVMValueRef rsqrt_a)
1348 {
1349 LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
1350 LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
1351 LLVMValueRef res;
1352
1353 res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
1354 res = LLVMBuildFMul(bld->builder, a, res, "");
1355 res = LLVMBuildFSub(bld->builder, three, res, "");
1356 res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
1357 res = LLVMBuildFMul(bld->builder, half, res, "");
1358
1359 return res;
1360 }
1361
1362
1363 /**
1364 * Generate 1/sqrt(a)
1365 */
1366 LLVMValueRef
1367 lp_build_rsqrt(struct lp_build_context *bld,
1368 LLVMValueRef a)
1369 {
1370 const struct lp_type type = bld->type;
1371
1372 assert(lp_check_value(type, a));
1373
1374 assert(type.floating);
1375
1376 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1377 const unsigned num_iterations = 0;
1378 LLVMValueRef res;
1379 unsigned i;
1380
1381 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1382
1383 for (i = 0; i < num_iterations; ++i) {
1384 res = lp_build_rsqrt_refine(bld, a, res);
1385 }
1386
1387 return res;
1388 }
1389
1390 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1391 }
1392
1393
1394 static inline LLVMValueRef
1395 lp_build_const_v4si(unsigned long value)
1396 {
1397 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1398 LLVMValueRef elements[4] = { element, element, element, element };
1399 return LLVMConstVector(elements, 4);
1400 }
1401
1402 static inline LLVMValueRef
1403 lp_build_const_v4sf(float value)
1404 {
1405 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1406 LLVMValueRef elements[4] = { element, element, element, element };
1407 return LLVMConstVector(elements, 4);
1408 }
1409
1410
1411 /**
1412 * Generate sin(a) using SSE2
1413 */
1414 LLVMValueRef
1415 lp_build_sin(struct lp_build_context *bld,
1416 LLVMValueRef a)
1417 {
1418 struct lp_type int_type = lp_int_type(bld->type);
1419 LLVMBuilderRef b = bld->builder;
1420 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1421 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1422
1423 /*
1424 * take the absolute value,
1425 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1426 */
1427
1428 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1429 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1430
1431 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1432 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1433
1434 /*
1435 * extract the sign bit (upper one)
1436 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1437 */
1438 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1439 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1440
1441 /*
1442 * scale by 4/Pi
1443 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1444 */
1445
1446 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1447 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1448
1449 /*
1450 * store the integer part of y in mm0
1451 * emm2 = _mm_cvttps_epi32(y);
1452 */
1453
1454 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1455
1456 /*
1457 * j=(j+1) & (~1) (see the cephes sources)
1458 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1459 */
1460
1461 LLVMValueRef all_one = lp_build_const_v4si(1);
1462 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1463 /*
1464 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1465 */
1466 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1467 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1468
1469 /*
1470 * y = _mm_cvtepi32_ps(emm2);
1471 */
1472 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1473
1474 /* get the swap sign flag
1475 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1476 */
1477 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1478 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1479
1480 /*
1481 * emm2 = _mm_slli_epi32(emm0, 29);
1482 */
1483 LLVMValueRef const_29 = lp_build_const_v4si(29);
1484 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1485
1486 /*
1487 * get the polynom selection mask
1488 * there is one polynom for 0 <= x <= Pi/4
1489 * and another one for Pi/4<x<=Pi/2
1490 * Both branches will be computed.
1491 *
1492 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1493 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1494 */
1495
1496 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1497 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1498 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1499 emm2_3, lp_build_const_v4si(0));
1500 /*
1501 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1502 */
1503 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1504
1505 /*
1506 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1507 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1508 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1509 */
1510 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1511 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1512 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1513
1514 /*
1515 * The magic pass: "Extended precision modular arithmetic"
1516 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1517 * xmm1 = _mm_mul_ps(y, xmm1);
1518 * xmm2 = _mm_mul_ps(y, xmm2);
1519 * xmm3 = _mm_mul_ps(y, xmm3);
1520 */
1521 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1522 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1523 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1524
1525 /*
1526 * x = _mm_add_ps(x, xmm1);
1527 * x = _mm_add_ps(x, xmm2);
1528 * x = _mm_add_ps(x, xmm3);
1529 */
1530
1531 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1532 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1533 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1534
1535 /*
1536 * Evaluate the first polynom (0 <= x <= Pi/4)
1537 *
1538 * z = _mm_mul_ps(x,x);
1539 */
1540 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1541
1542 /*
1543 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1544 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1545 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1546 */
1547 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1548 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1549 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1550
1551 /*
1552 * y = *(v4sf*)_ps_coscof_p0;
1553 * y = _mm_mul_ps(y, z);
1554 */
1555 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1556 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1557 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1558 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1559 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1560 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1561
1562
1563 /*
1564 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1565 * y = _mm_sub_ps(y, tmp);
1566 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1567 */
1568 LLVMValueRef half = lp_build_const_v4sf(0.5);
1569 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1570 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1571 LLVMValueRef one = lp_build_const_v4sf(1.0);
1572 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1573
1574 /*
1575 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1576 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1577 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1578 */
1579 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1580 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1581 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1582
1583 /*
1584 * Evaluate the second polynom (Pi/4 <= x <= 0)
1585 *
1586 * y2 = *(v4sf*)_ps_sincof_p0;
1587 * y2 = _mm_mul_ps(y2, z);
1588 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1589 * y2 = _mm_mul_ps(y2, z);
1590 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1591 * y2 = _mm_mul_ps(y2, z);
1592 * y2 = _mm_mul_ps(y2, x);
1593 * y2 = _mm_add_ps(y2, x);
1594 */
1595
1596 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1597 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1598 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1599 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1600 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1601 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1602 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1603
1604 /*
1605 * select the correct result from the two polynoms
1606 * xmm3 = poly_mask;
1607 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1608 * y = _mm_andnot_ps(xmm3, y);
1609 * y = _mm_add_ps(y,y2);
1610 */
1611 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1612 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1613 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1614 LLVMValueRef inv = lp_build_const_v4si(~0);
1615 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1616 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1617 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1618
1619 /*
1620 * update the sign
1621 * y = _mm_xor_ps(y, sign_bit);
1622 */
1623 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1624 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1625 return y_result;
1626 }
1627
1628
1629 /**
1630 * Generate cos(a) using SSE2
1631 */
1632 LLVMValueRef
1633 lp_build_cos(struct lp_build_context *bld,
1634 LLVMValueRef a)
1635 {
1636 struct lp_type int_type = lp_int_type(bld->type);
1637 LLVMBuilderRef b = bld->builder;
1638 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1639 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1640
1641 /*
1642 * take the absolute value,
1643 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1644 */
1645
1646 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1647 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1648
1649 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1650 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1651
1652 /*
1653 * scale by 4/Pi
1654 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1655 */
1656
1657 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1658 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1659
1660 /*
1661 * store the integer part of y in mm0
1662 * emm2 = _mm_cvttps_epi32(y);
1663 */
1664
1665 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1666
1667 /*
1668 * j=(j+1) & (~1) (see the cephes sources)
1669 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1670 */
1671
1672 LLVMValueRef all_one = lp_build_const_v4si(1);
1673 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1674 /*
1675 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1676 */
1677 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1678 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1679
1680 /*
1681 * y = _mm_cvtepi32_ps(emm2);
1682 */
1683 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1684
1685
1686 /*
1687 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1688 */
1689 LLVMValueRef const_2 = lp_build_const_v4si(2);
1690 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1691
1692
1693 /* get the swap sign flag
1694 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1695 */
1696 LLVMValueRef inv = lp_build_const_v4si(~0);
1697 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1698 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1699 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1700
1701 /*
1702 * emm2 = _mm_slli_epi32(emm0, 29);
1703 */
1704 LLVMValueRef const_29 = lp_build_const_v4si(29);
1705 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1706
1707 /*
1708 * get the polynom selection mask
1709 * there is one polynom for 0 <= x <= Pi/4
1710 * and another one for Pi/4<x<=Pi/2
1711 * Both branches will be computed.
1712 *
1713 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1714 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1715 */
1716
1717 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1718 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1719 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1720 emm2_3, lp_build_const_v4si(0));
1721
1722 /*
1723 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1724 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1725 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1726 */
1727 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1728 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1729 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1730
1731 /*
1732 * The magic pass: "Extended precision modular arithmetic"
1733 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1734 * xmm1 = _mm_mul_ps(y, xmm1);
1735 * xmm2 = _mm_mul_ps(y, xmm2);
1736 * xmm3 = _mm_mul_ps(y, xmm3);
1737 */
1738 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1739 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1740 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1741
1742 /*
1743 * x = _mm_add_ps(x, xmm1);
1744 * x = _mm_add_ps(x, xmm2);
1745 * x = _mm_add_ps(x, xmm3);
1746 */
1747
1748 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1749 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1750 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1751
1752 /*
1753 * Evaluate the first polynom (0 <= x <= Pi/4)
1754 *
1755 * z = _mm_mul_ps(x,x);
1756 */
1757 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1758
1759 /*
1760 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1761 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1762 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1763 */
1764 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1765 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1766 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1767
1768 /*
1769 * y = *(v4sf*)_ps_coscof_p0;
1770 * y = _mm_mul_ps(y, z);
1771 */
1772 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1773 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1774 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1775 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1776 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1777 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1778
1779
1780 /*
1781 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1782 * y = _mm_sub_ps(y, tmp);
1783 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1784 */
1785 LLVMValueRef half = lp_build_const_v4sf(0.5);
1786 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1787 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1788 LLVMValueRef one = lp_build_const_v4sf(1.0);
1789 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1790
1791 /*
1792 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1793 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1794 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1795 */
1796 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1797 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1798 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1799
1800 /*
1801 * Evaluate the second polynom (Pi/4 <= x <= 0)
1802 *
1803 * y2 = *(v4sf*)_ps_sincof_p0;
1804 * y2 = _mm_mul_ps(y2, z);
1805 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1806 * y2 = _mm_mul_ps(y2, z);
1807 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1808 * y2 = _mm_mul_ps(y2, z);
1809 * y2 = _mm_mul_ps(y2, x);
1810 * y2 = _mm_add_ps(y2, x);
1811 */
1812
1813 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1814 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1815 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1816 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1817 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1818 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1819 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1820
1821 /*
1822 * select the correct result from the two polynoms
1823 * xmm3 = poly_mask;
1824 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1825 * y = _mm_andnot_ps(xmm3, y);
1826 * y = _mm_add_ps(y,y2);
1827 */
1828 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1829 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1830 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1831 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1832 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1833 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1834
1835 /*
1836 * update the sign
1837 * y = _mm_xor_ps(y, sign_bit);
1838 */
1839 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1840 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1841 return y_result;
1842 }
1843
1844
1845 /**
1846 * Generate pow(x, y)
1847 */
1848 LLVMValueRef
1849 lp_build_pow(struct lp_build_context *bld,
1850 LLVMValueRef x,
1851 LLVMValueRef y)
1852 {
1853 /* TODO: optimize the constant case */
1854 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
1855 LLVMIsConstant(x) && LLVMIsConstant(y)) {
1856 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1857 __FUNCTION__);
1858 }
1859
1860 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1861 }
1862
1863
1864 /**
1865 * Generate exp(x)
1866 */
1867 LLVMValueRef
1868 lp_build_exp(struct lp_build_context *bld,
1869 LLVMValueRef x)
1870 {
1871 /* log2(e) = 1/log(2) */
1872 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1873
1874 assert(lp_check_value(bld->type, x));
1875
1876 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1877 }
1878
1879
1880 /**
1881 * Generate log(x)
1882 */
1883 LLVMValueRef
1884 lp_build_log(struct lp_build_context *bld,
1885 LLVMValueRef x)
1886 {
1887 /* log(2) */
1888 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1889
1890 assert(lp_check_value(bld->type, x));
1891
1892 return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1893 }
1894
1895
1896 /**
1897 * Generate polynomial.
1898 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1899 */
1900 static LLVMValueRef
1901 lp_build_polynomial(struct lp_build_context *bld,
1902 LLVMValueRef x,
1903 const double *coeffs,
1904 unsigned num_coeffs)
1905 {
1906 const struct lp_type type = bld->type;
1907 LLVMValueRef res = NULL;
1908 unsigned i;
1909
1910 assert(lp_check_value(bld->type, x));
1911
1912 /* TODO: optimize the constant case */
1913 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
1914 LLVMIsConstant(x)) {
1915 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1916 __FUNCTION__);
1917 }
1918
1919 for (i = num_coeffs; i--; ) {
1920 LLVMValueRef coeff;
1921
1922 coeff = lp_build_const_vec(type, coeffs[i]);
1923
1924 if(res)
1925 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1926 else
1927 res = coeff;
1928 }
1929
1930 if(res)
1931 return res;
1932 else
1933 return bld->undef;
1934 }
1935
1936
1937 /**
1938 * Minimax polynomial fit of 2**x, in range [0, 1[
1939 */
1940 const double lp_build_exp2_polynomial[] = {
1941 #if EXP_POLY_DEGREE == 5
1942 0.999999999690134838155,
1943 0.583974334321735217258,
1944 0.164553105719676828492,
1945 0.0292811063701710962255,
1946 0.00354944426657875141846,
1947 0.000296253726543423377365
1948 #elif EXP_POLY_DEGREE == 4
1949 1.00000001502262084505,
1950 0.563586057338685991394,
1951 0.150436017652442413623,
1952 0.0243220604213317927308,
1953 0.0025359088446580436489
1954 #elif EXP_POLY_DEGREE == 3
1955 0.999925218562710312959,
1956 0.695833540494823811697,
1957 0.226067155427249155588,
1958 0.0780245226406372992967
1959 #elif EXP_POLY_DEGREE == 2
1960 1.00172476321474503578,
1961 0.657636275736077639316,
1962 0.33718943461968720704
1963 #else
1964 #error
1965 #endif
1966 };
1967
1968
1969 void
1970 lp_build_exp2_approx(struct lp_build_context *bld,
1971 LLVMValueRef x,
1972 LLVMValueRef *p_exp2_int_part,
1973 LLVMValueRef *p_frac_part,
1974 LLVMValueRef *p_exp2)
1975 {
1976 const struct lp_type type = bld->type;
1977 LLVMTypeRef vec_type = lp_build_vec_type(type);
1978 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1979 LLVMValueRef ipart = NULL;
1980 LLVMValueRef fpart = NULL;
1981 LLVMValueRef expipart = NULL;
1982 LLVMValueRef expfpart = NULL;
1983 LLVMValueRef res = NULL;
1984
1985 assert(lp_check_value(bld->type, x));
1986
1987 if(p_exp2_int_part || p_frac_part || p_exp2) {
1988 /* TODO: optimize the constant case */
1989 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
1990 LLVMIsConstant(x)) {
1991 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1992 __FUNCTION__);
1993 }
1994
1995 assert(type.floating && type.width == 32);
1996
1997 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1998 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1999
2000 /* ipart = floor(x) */
2001 ipart = lp_build_floor(bld, x);
2002
2003 /* fpart = x - ipart */
2004 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
2005 }
2006
2007 if(p_exp2_int_part || p_exp2) {
2008 /* expipart = (float) (1 << ipart) */
2009 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
2010 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
2011 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
2012 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
2013 }
2014
2015 if(p_exp2) {
2016 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2017 Elements(lp_build_exp2_polynomial));
2018
2019 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2020 }
2021
2022 if(p_exp2_int_part)
2023 *p_exp2_int_part = expipart;
2024
2025 if(p_frac_part)
2026 *p_frac_part = fpart;
2027
2028 if(p_exp2)
2029 *p_exp2 = res;
2030 }
2031
2032
2033 LLVMValueRef
2034 lp_build_exp2(struct lp_build_context *bld,
2035 LLVMValueRef x)
2036 {
2037 LLVMValueRef res;
2038 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2039 return res;
2040 }
2041
2042
2043 /**
2044 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2045 * These coefficients can be generate with
2046 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2047 */
2048 const double lp_build_log2_polynomial[] = {
2049 #if LOG_POLY_DEGREE == 6
2050 3.11578814719469302614,
2051 -3.32419399085241980044,
2052 2.59883907202499966007,
2053 -1.23152682416275988241,
2054 0.318212422185251071475,
2055 -0.0344359067839062357313
2056 #elif LOG_POLY_DEGREE == 5
2057 2.8882704548164776201,
2058 -2.52074962577807006663,
2059 1.48116647521213171641,
2060 -0.465725644288844778798,
2061 0.0596515482674574969533
2062 #elif LOG_POLY_DEGREE == 4
2063 2.61761038894603480148,
2064 -1.75647175389045657003,
2065 0.688243882994381274313,
2066 -0.107254423828329604454
2067 #elif LOG_POLY_DEGREE == 3
2068 2.28330284476918490682,
2069 -1.04913055217340124191,
2070 0.204446009836232697516
2071 #else
2072 #error
2073 #endif
2074 };
2075
2076
2077 /**
2078 * See http://www.devmaster.net/forums/showthread.php?p=43580
2079 */
2080 void
2081 lp_build_log2_approx(struct lp_build_context *bld,
2082 LLVMValueRef x,
2083 LLVMValueRef *p_exp,
2084 LLVMValueRef *p_floor_log2,
2085 LLVMValueRef *p_log2)
2086 {
2087 const struct lp_type type = bld->type;
2088 LLVMTypeRef vec_type = lp_build_vec_type(type);
2089 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2090
2091 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2092 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2093 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2094
2095 LLVMValueRef i = NULL;
2096 LLVMValueRef exp = NULL;
2097 LLVMValueRef mant = NULL;
2098 LLVMValueRef logexp = NULL;
2099 LLVMValueRef logmant = NULL;
2100 LLVMValueRef res = NULL;
2101
2102 assert(lp_check_value(bld->type, x));
2103
2104 if(p_exp || p_floor_log2 || p_log2) {
2105 /* TODO: optimize the constant case */
2106 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2107 LLVMIsConstant(x)) {
2108 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2109 __FUNCTION__);
2110 }
2111
2112 assert(type.floating && type.width == 32);
2113
2114 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2115
2116 /* exp = (float) exponent(x) */
2117 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2118 }
2119
2120 if(p_floor_log2 || p_log2) {
2121 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2122 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2123 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2124 }
2125
2126 if(p_log2) {
2127 /* mant = (float) mantissa(x) */
2128 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2129 mant = LLVMBuildOr(bld->builder, mant, one, "");
2130 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2131
2132 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2133 Elements(lp_build_log2_polynomial));
2134
2135 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
2136 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2137
2138 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2139 }
2140
2141 if(p_exp) {
2142 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2143 *p_exp = exp;
2144 }
2145
2146 if(p_floor_log2)
2147 *p_floor_log2 = logexp;
2148
2149 if(p_log2)
2150 *p_log2 = res;
2151 }
2152
2153
2154 LLVMValueRef
2155 lp_build_log2(struct lp_build_context *bld,
2156 LLVMValueRef x)
2157 {
2158 LLVMValueRef res;
2159 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2160 return res;
2161 }