1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 #define EXP_POLY_DEGREE 3
63
64 #define LOG_POLY_DEGREE 5
65
66
67 /**
68 * Generate min(a, b)
69 * No checks for special-case values (a or b equal to 0 or 1) are done.
70 */
71 static LLVMValueRef
72 lp_build_min_simple(struct lp_build_context *bld,
73 LLVMValueRef a,
74 LLVMValueRef b)
75 {
76 const struct lp_type type = bld->type;
77 const char *intrinsic = NULL;
78 LLVMValueRef cond;
79
80 assert(lp_check_value(type, a));
81 assert(lp_check_value(type, b));
82
83 /* TODO: optimize the constant case */
84
85 if(type.width * type.length == 128) {
86 if(type.floating) {
87 if(type.width == 32 && util_cpu_caps.has_sse)
88 intrinsic = "llvm.x86.sse.min.ps";
89 if(type.width == 64 && util_cpu_caps.has_sse2)
90 intrinsic = "llvm.x86.sse2.min.pd";
91 }
92 else {
93 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
94 intrinsic = "llvm.x86.sse2.pminu.b";
95 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminsb";
97 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
98 intrinsic = "llvm.x86.sse41.pminuw";
99 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
100 intrinsic = "llvm.x86.sse2.pmins.w";
101 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
102 intrinsic = "llvm.x86.sse41.pminud";
103 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
104 intrinsic = "llvm.x86.sse41.pminsd";
105 }
106 }
107
108 if(intrinsic)
109 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
110
111 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
112 return lp_build_select(bld, cond, a, b);
113 }
114
115
116 /**
117 * Generate max(a, b)
118 * No checks for special-case values (a or b equal to 0 or 1) are done.
119 */
120 static LLVMValueRef
121 lp_build_max_simple(struct lp_build_context *bld,
122 LLVMValueRef a,
123 LLVMValueRef b)
124 {
125 const struct lp_type type = bld->type;
126 const char *intrinsic = NULL;
127 LLVMValueRef cond;
128
129 assert(lp_check_value(type, a));
130 assert(lp_check_value(type, b));
131
132 /* TODO: optimize the constant case */
133
134 if(type.width * type.length == 128) {
135 if(type.floating) {
136 if(type.width == 32 && util_cpu_caps.has_sse)
137 intrinsic = "llvm.x86.sse.max.ps";
138 if(type.width == 64 && util_cpu_caps.has_sse2)
139 intrinsic = "llvm.x86.sse2.max.pd";
140 }
141 else {
142 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
143 intrinsic = "llvm.x86.sse2.pmaxu.b";
144 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
145 intrinsic = "llvm.x86.sse41.pmaxsb";
146 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
147 intrinsic = "llvm.x86.sse41.pmaxuw";
148 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
149 intrinsic = "llvm.x86.sse2.pmaxs.w";
150 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
151 intrinsic = "llvm.x86.sse41.pmaxud";
152 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
153 intrinsic = "llvm.x86.sse41.pmaxsd";
154 }
155 }
156
157 if(intrinsic)
158 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
159
160 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
161 return lp_build_select(bld, cond, a, b);
162 }
163
164
165 /**
166 * Generate 1 - a, or ~a depending on bld->type.
167 */
168 LLVMValueRef
169 lp_build_comp(struct lp_build_context *bld,
170 LLVMValueRef a)
171 {
172 const struct lp_type type = bld->type;
173
174 assert(lp_check_value(type, a));
175
176 if(a == bld->one)
177 return bld->zero;
178 if(a == bld->zero)
179 return bld->one;
180
181 if(type.norm && !type.floating && !type.fixed && !type.sign) {
182 if(LLVMIsConstant(a))
183 return LLVMConstNot(a);
184 else
185 return LLVMBuildNot(bld->builder, a, "");
186 }
187
188 if(LLVMIsConstant(a))
189 if (type.floating)
190 return LLVMConstFSub(bld->one, a);
191 else
192 return LLVMConstSub(bld->one, a);
193 else
194 if (type.floating)
195 return LLVMBuildFSub(bld->builder, bld->one, a, "");
196 else
197 return LLVMBuildSub(bld->builder, bld->one, a, "");
198 }
199
200
201 /**
202 * Generate a + b
203 */
204 LLVMValueRef
205 lp_build_add(struct lp_build_context *bld,
206 LLVMValueRef a,
207 LLVMValueRef b)
208 {
209 const struct lp_type type = bld->type;
210 LLVMValueRef res;
211
212 assert(lp_check_value(type, a));
213 assert(lp_check_value(type, b));
214
215 if(a == bld->zero)
216 return b;
217 if(b == bld->zero)
218 return a;
219 if(a == bld->undef || b == bld->undef)
220 return bld->undef;
221
222 if(bld->type.norm) {
223 const char *intrinsic = NULL;
224
225 if(a == bld->one || b == bld->one)
226 return bld->one;
227
228 if(util_cpu_caps.has_sse2 &&
229 type.width * type.length == 128 &&
230 !type.floating && !type.fixed) {
231 if(type.width == 8)
232 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
233 if(type.width == 16)
234 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
235 }
236
237 if(intrinsic)
238 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
239 }
240
241 if(LLVMIsConstant(a) && LLVMIsConstant(b))
242 if (type.floating)
243 res = LLVMConstFAdd(a, b);
244 else
245 res = LLVMConstAdd(a, b);
246 else
247 if (type.floating)
248 res = LLVMBuildFAdd(bld->builder, a, b, "");
249 else
250 res = LLVMBuildAdd(bld->builder, a, b, "");
251
252 /* clamp to ceiling of 1.0 */
253 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
254 res = lp_build_min_simple(bld, res, bld->one);
255
256 /* XXX clamp to floor of -1 or 0??? */
257
258 return res;
259 }
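
/*
 * Illustrative scalar sketch, not part of the original file: what the
 * saturated-add path above computes per element for unsigned normalized
 * 8-bit values (what PADDUSB does). The helper name is hypothetical and
 * assumes <stdint.h>.
 */
static inline uint8_t
add_u8_sat_ref(uint8_t a, uint8_t b)
{
   unsigned sum = (unsigned)a + b;
   return sum > 255 ? 255 : (uint8_t)sum; /* clamp to the 1.0 ceiling */
}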
260
261
262 /** Return the scalar sum of the elements of a */
263 LLVMValueRef
264 lp_build_sum_vector(struct lp_build_context *bld,
265 LLVMValueRef a)
266 {
267 const struct lp_type type = bld->type;
268 LLVMValueRef index, res;
269 unsigned i;
270
271 assert(lp_check_value(type, a));
272
273 if (type.length == 1) {
274 return a;
275 }
276
277 assert(!bld->type.norm);
278
279 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
280 res = LLVMBuildExtractElement(bld->builder, a, index, "");
281
282 for (i = 1; i < type.length; i++) {
283 index = LLVMConstInt(LLVMInt32Type(), i, 0);
284 if (type.floating)
285 res = LLVMBuildFAdd(bld->builder, res,
286 LLVMBuildExtractElement(bld->builder,
287 a, index, ""),
288 "");
289 else
290 res = LLVMBuildAdd(bld->builder, res,
291 LLVMBuildExtractElement(bld->builder,
292 a, index, ""),
293 "");
294 }
295
296 return res;
297 }
298
299
300 /**
301 * Generate a - b
302 */
303 LLVMValueRef
304 lp_build_sub(struct lp_build_context *bld,
305 LLVMValueRef a,
306 LLVMValueRef b)
307 {
308 const struct lp_type type = bld->type;
309 LLVMValueRef res;
310
311 assert(lp_check_value(type, a));
312 assert(lp_check_value(type, b));
313
314 if(b == bld->zero)
315 return a;
316 if(a == bld->undef || b == bld->undef)
317 return bld->undef;
318 if(a == b)
319 return bld->zero;
320
321 if(bld->type.norm) {
322 const char *intrinsic = NULL;
323
324 if(b == bld->one)
325 return bld->zero;
326
327 if(util_cpu_caps.has_sse2 &&
328 type.width * type.length == 128 &&
329 !type.floating && !type.fixed) {
330 if(type.width == 8)
331 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
332 if(type.width == 16)
333 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
334 }
335
336 if(intrinsic)
337 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
338 }
339
340 if(LLVMIsConstant(a) && LLVMIsConstant(b))
341 if (type.floating)
342 res = LLVMConstFSub(a, b);
343 else
344 res = LLVMConstSub(a, b);
345 else
346 if (type.floating)
347 res = LLVMBuildFSub(bld->builder, a, b, "");
348 else
349 res = LLVMBuildSub(bld->builder, a, b, "");
350
351 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
352 res = lp_build_max_simple(bld, res, bld->zero);
353
354 return res;
355 }
356
357
358 /**
359 * Normalized 8bit multiplication.
360 *
361 * - alpha plus one
362 *
363 * makes the following approximation to the division (Sree)
364 *
365 * a*b/255 ~= (a*(b + 1)) >> 8
366 *
367 * which is the fastest method that satisfies the following OpenGL criteria
368 *
369 * 0*0 = 0 and 255*255 = 255
370 *
371 * - geometric series
372 *
373 * takes the geometric series approximation to the division
374 *
375 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
376 *
377 * in this case just the first two terms to fit in 16bit arithmetic
378 *
379 * t/255 ~= (t + (t >> 8)) >> 8
380 *
381 * note that by itself it doesn't satisfy the OpenGL criteria, as it
382 * yields 255*255 = 254, so the special case b = 255 must be accounted for,
383 * or rounding must be used
384 *
385 * - geometric series plus rounding
386 *
387 * when using a geometric series division instead of truncating the result
388 * use roundoff in the approximation (Jim Blinn)
389 *
390 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
391 *
392 * achieving the exact results
393 *
394 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
395 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
396 * @sa Michael Herf, The "double blend trick", May 2000,
397 * http://www.stereopsis.com/doubleblend.html
398 */
399 static LLVMValueRef
400 lp_build_mul_u8n(LLVMBuilderRef builder,
401 struct lp_type i16_type,
402 LLVMValueRef a, LLVMValueRef b)
403 {
404 LLVMValueRef c8;
405 LLVMValueRef ab;
406
407 assert(!i16_type.floating);
408 assert(lp_check_value(i16_type, a));
409 assert(lp_check_value(i16_type, b));
410
411 c8 = lp_build_const_int_vec(i16_type, 8);
412
413 #if 0
414
415 /* a*b/255 ~= (a*(b + 1)) >> 8 */
416 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
417 ab = LLVMBuildMul(builder, a, b, "");
418
419 #else
420
421 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
422 ab = LLVMBuildMul(builder, a, b, "");
423 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
424 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
425
426 #endif
427
428 ab = LLVMBuildLShr(builder, ab, c8, "");
429
430 return ab;
431 }
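
/*
 * Illustrative scalar sketch, not part of the original file: the rounded
 * geometric-series division used above, written for a single pair of 8-bit
 * values. It hits the end points exactly (0*0 -> 0, 255*255 -> 255). The
 * helper name is hypothetical and assumes <stdint.h>.
 */
static inline uint8_t
mul_u8_norm_ref(uint8_t a, uint8_t b)
{
   unsigned t = (unsigned)a * b;  /* 16-bit intermediate product */
   t = t + (t >> 8) + 0x80;       /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
   return (uint8_t)(t >> 8);
}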
432
433
434 /**
435 * Generate a * b
436 */
437 LLVMValueRef
438 lp_build_mul(struct lp_build_context *bld,
439 LLVMValueRef a,
440 LLVMValueRef b)
441 {
442 const struct lp_type type = bld->type;
443 LLVMValueRef shift;
444 LLVMValueRef res;
445
446 assert(lp_check_value(type, a));
447 assert(lp_check_value(type, b));
448
449 if(a == bld->zero)
450 return bld->zero;
451 if(a == bld->one)
452 return b;
453 if(b == bld->zero)
454 return bld->zero;
455 if(b == bld->one)
456 return a;
457 if(a == bld->undef || b == bld->undef)
458 return bld->undef;
459
460 if(!type.floating && !type.fixed && type.norm) {
461 if(type.width == 8) {
462 struct lp_type i16_type = lp_wider_type(type);
463 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
464
465 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
466 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
467
468 /* PMULLW, PSRLW, PADDW */
469 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
470 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
471
472 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
473
474 return ab;
475 }
476
477 /* FIXME */
478 assert(0);
479 }
480
481 if(type.fixed)
482 shift = lp_build_const_int_vec(type, type.width/2);
483 else
484 shift = NULL;
485
486 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
487 if (type.floating)
488 res = LLVMConstFMul(a, b);
489 else
490 res = LLVMConstMul(a, b);
491 if(shift) {
492 if(type.sign)
493 res = LLVMConstAShr(res, shift);
494 else
495 res = LLVMConstLShr(res, shift);
496 }
497 }
498 else {
499 if (type.floating)
500 res = LLVMBuildFMul(bld->builder, a, b, "");
501 else
502 res = LLVMBuildMul(bld->builder, a, b, "");
503 if(shift) {
504 if(type.sign)
505 res = LLVMBuildAShr(bld->builder, res, shift, "");
506 else
507 res = LLVMBuildLShr(bld->builder, res, shift, "");
508 }
509 }
510
511 return res;
512 }
513
514
515 /**
516 * Small vector x scale multiplication optimization.
517 */
518 LLVMValueRef
519 lp_build_mul_imm(struct lp_build_context *bld,
520 LLVMValueRef a,
521 int b)
522 {
523 LLVMValueRef factor;
524
525 assert(lp_check_value(bld->type, a));
526
527 if(b == 0)
528 return bld->zero;
529
530 if(b == 1)
531 return a;
532
533 if(b == -1)
534 return lp_build_negate(bld, a);
535
536 if(b == 2 && bld->type.floating)
537 return lp_build_add(bld, a, a);
538
539 if(util_is_power_of_two(b)) {
540 unsigned shift = ffs(b) - 1;
541
542 if(bld->type.floating) {
543 #if 0
544 /*
545 * Power of two multiplication by directly manipulating the mantissa.
546 *
547 * XXX: This might not be always faster, it will introduce a small error
548 * for multiplication by zero, and it will produce wrong results
549 * for Inf and NaN.
550 */
551 unsigned mantissa = lp_mantissa(bld->type);
552 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
553 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
554 a = LLVMBuildAdd(bld->builder, a, factor, "");
555 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
556 return a;
557 #endif
558 }
559 else {
560 factor = lp_build_const_vec(bld->type, shift);
561 return LLVMBuildShl(bld->builder, a, factor, "");
562 }
563 }
564
565 factor = lp_build_const_vec(bld->type, (double)b);
566 return lp_build_mul(bld, a, factor);
567 }
568
569
570 /**
571 * Generate a / b
572 */
573 LLVMValueRef
574 lp_build_div(struct lp_build_context *bld,
575 LLVMValueRef a,
576 LLVMValueRef b)
577 {
578 const struct lp_type type = bld->type;
579
580 assert(lp_check_value(type, a));
581 assert(lp_check_value(type, b));
582
583 if(a == bld->zero)
584 return bld->zero;
585 if(a == bld->one)
586 return lp_build_rcp(bld, b);
587 if(b == bld->zero)
588 return bld->undef;
589 if(b == bld->one)
590 return a;
591 if(a == bld->undef || b == bld->undef)
592 return bld->undef;
593
594 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
595 if (type.floating)
596 return LLVMConstFDiv(a, b);
597 else if (type.sign)
598 return LLVMConstSDiv(a, b);
599 else
600 return LLVMConstUDiv(a, b);
601 }
602
603 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
604 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
605
606 if (type.floating)
607 return LLVMBuildFDiv(bld->builder, a, b, "");
608 else if (type.sign)
609 return LLVMBuildSDiv(bld->builder, a, b, "");
610 else
611 return LLVMBuildUDiv(bld->builder, a, b, "");
612 }
613
614
615 /**
616 * Linear interpolation.
617 *
618 * This also works for integer values with a few caveats.
619 *
620 * @sa http://www.stereopsis.com/doubleblend.html
621 */
622 LLVMValueRef
623 lp_build_lerp(struct lp_build_context *bld,
624 LLVMValueRef x,
625 LLVMValueRef v0,
626 LLVMValueRef v1)
627 {
628 LLVMValueRef delta;
629 LLVMValueRef res;
630
631 assert(lp_check_value(bld->type, x));
632 assert(lp_check_value(bld->type, v0));
633 assert(lp_check_value(bld->type, v1));
634
635 delta = lp_build_sub(bld, v1, v0);
636
637 res = lp_build_mul(bld, x, delta);
638
639 res = lp_build_add(bld, v0, res);
640
641 if(bld->type.fixed)
642 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
643 * but it will be wrong for other uses. Basically we need a more
644 * powerful lp_type, capable of further distinguishing the values
645 * interpretation from the value storage. */
646 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
647
648 return res;
649 }
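
/*
 * Illustrative scalar sketch, not part of the original file: the same
 * interpolation for plain floats, v0 + x*(v1 - v0), which the builder above
 * emits per element. The helper name is hypothetical.
 */
static inline float
lerp_ref(float x, float v0, float v1)
{
   return v0 + x * (v1 - v0);
}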
650
651
652 LLVMValueRef
653 lp_build_lerp_2d(struct lp_build_context *bld,
654 LLVMValueRef x,
655 LLVMValueRef y,
656 LLVMValueRef v00,
657 LLVMValueRef v01,
658 LLVMValueRef v10,
659 LLVMValueRef v11)
660 {
661 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
662 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
663 return lp_build_lerp(bld, y, v0, v1);
664 }
665
666
667 /**
668 * Generate min(a, b)
669 * Do checks for special cases.
670 */
671 LLVMValueRef
672 lp_build_min(struct lp_build_context *bld,
673 LLVMValueRef a,
674 LLVMValueRef b)
675 {
676 assert(lp_check_value(bld->type, a));
677 assert(lp_check_value(bld->type, b));
678
679 if(a == bld->undef || b == bld->undef)
680 return bld->undef;
681
682 if(a == b)
683 return a;
684
685 if(bld->type.norm) {
686 if(a == bld->zero || b == bld->zero)
687 return bld->zero;
688 if(a == bld->one)
689 return b;
690 if(b == bld->one)
691 return a;
692 }
693
694 return lp_build_min_simple(bld, a, b);
695 }
696
697
698 /**
699 * Generate max(a, b)
700 * Do checks for special cases.
701 */
702 LLVMValueRef
703 lp_build_max(struct lp_build_context *bld,
704 LLVMValueRef a,
705 LLVMValueRef b)
706 {
707 assert(lp_check_value(bld->type, a));
708 assert(lp_check_value(bld->type, b));
709
710 if(a == bld->undef || b == bld->undef)
711 return bld->undef;
712
713 if(a == b)
714 return a;
715
716 if(bld->type.norm) {
717 if(a == bld->one || b == bld->one)
718 return bld->one;
719 if(a == bld->zero)
720 return b;
721 if(b == bld->zero)
722 return a;
723 }
724
725 return lp_build_max_simple(bld, a, b);
726 }
727
728
729 /**
730 * Generate clamp(a, min, max)
731 * Do checks for special cases.
732 */
733 LLVMValueRef
734 lp_build_clamp(struct lp_build_context *bld,
735 LLVMValueRef a,
736 LLVMValueRef min,
737 LLVMValueRef max)
738 {
739 assert(lp_check_value(bld->type, a));
740 assert(lp_check_value(bld->type, min));
741 assert(lp_check_value(bld->type, max));
742
743 a = lp_build_min(bld, a, max);
744 a = lp_build_max(bld, a, min);
745 return a;
746 }
747
748
749 /**
750 * Generate abs(a)
751 */
752 LLVMValueRef
753 lp_build_abs(struct lp_build_context *bld,
754 LLVMValueRef a)
755 {
756 const struct lp_type type = bld->type;
757 LLVMTypeRef vec_type = lp_build_vec_type(type);
758
759 assert(lp_check_value(type, a));
760
761 if(!type.sign)
762 return a;
763
764 if(type.floating) {
765 /* Mask out the sign bit */
766 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
767 unsigned long long absMask = ~(1ULL << (type.width - 1));
768 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
769 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
770 a = LLVMBuildAnd(bld->builder, a, mask, "");
771 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
772 return a;
773 }
774
775 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
776 switch(type.width) {
777 case 8:
778 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
779 case 16:
780 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
781 case 32:
782 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
783 }
784 }
785
786 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
787 }
788
789
790 LLVMValueRef
791 lp_build_negate(struct lp_build_context *bld,
792 LLVMValueRef a)
793 {
794 assert(lp_check_value(bld->type, a));
795
796 #if HAVE_LLVM >= 0x0207
797 if (bld->type.floating)
798 a = LLVMBuildFNeg(bld->builder, a, "");
799 else
800 #endif
801 a = LLVMBuildNeg(bld->builder, a, "");
802
803 return a;
804 }
805
806
807 /** Return -1, 0 or +1 depending on the sign of a */
808 LLVMValueRef
809 lp_build_sgn(struct lp_build_context *bld,
810 LLVMValueRef a)
811 {
812 const struct lp_type type = bld->type;
813 LLVMValueRef cond;
814 LLVMValueRef res;
815
816 assert(lp_check_value(type, a));
817
818 /* Handle non-zero case */
819 if(!type.sign) {
820 /* if not zero then sign must be positive */
821 res = bld->one;
822 }
823 else if(type.floating) {
824 LLVMTypeRef vec_type;
825 LLVMTypeRef int_type;
826 LLVMValueRef mask;
827 LLVMValueRef sign;
828 LLVMValueRef one;
829 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
830
831 int_type = lp_build_int_vec_type(type);
832 vec_type = lp_build_vec_type(type);
833 mask = lp_build_const_int_vec(type, maskBit);
834
835 /* Take the sign bit and add it to 1 constant */
836 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
837 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
838 one = LLVMConstBitCast(bld->one, int_type);
839 res = LLVMBuildOr(bld->builder, sign, one, "");
840 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
841 }
842 else
843 {
844 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
845 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
846 res = lp_build_select(bld, cond, bld->one, minus_one);
847 }
848
849 /* Handle zero */
850 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
851 res = lp_build_select(bld, cond, bld->zero, res);
852
853 return res;
854 }
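
/*
 * Illustrative scalar sketch, not part of the original file: the floating
 * point path above, which ORs the sign bit of 'a' into 1.0 and then forces
 * the a == 0 case to zero. The helper name is hypothetical and assumes
 * <stdint.h> and <string.h>.
 */
static inline float
sgn_ref(float a)
{
   uint32_t ai, bits;
   float one = 1.0f, res;
   memcpy(&ai, &a, sizeof ai);
   memcpy(&bits, &one, sizeof bits);
   bits |= ai & 0x80000000u;      /* +/-1.0 carrying the sign of a */
   memcpy(&res, &bits, sizeof res);
   return a == 0.0f ? 0.0f : res;
}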
855
856
857 /**
858 * Set the sign of float vector 'a' according to 'sign'.
859 * If sign==0, return abs(a).
860 * If sign==1, return -abs(a).
861 * Other values for sign produce undefined results.
862 */
863 LLVMValueRef
864 lp_build_set_sign(struct lp_build_context *bld,
865 LLVMValueRef a, LLVMValueRef sign)
866 {
867 const struct lp_type type = bld->type;
868 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
869 LLVMTypeRef vec_type = lp_build_vec_type(type);
870 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
871 LLVMValueRef mask = lp_build_const_int_vec(type,
872 ~((unsigned long long) 1 << (type.width - 1)));
873 LLVMValueRef val, res;
874
875 assert(type.floating);
876 assert(lp_check_value(type, a));
877
878 /* val = reinterpret_cast<int>(a) */
879 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
880 /* val = val & mask */
881 val = LLVMBuildAnd(bld->builder, val, mask, "");
882 /* sign = sign << shift */
883 sign = LLVMBuildShl(bld->builder, sign, shift, "");
884 /* res = val | sign */
885 res = LLVMBuildOr(bld->builder, val, sign, "");
886 /* res = reinterpret_cast<float>(res) */
887 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
888
889 return res;
890 }
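
/*
 * Illustrative scalar sketch, not part of the original file: the same bit
 * manipulation for one 32-bit float; 'sign' must be 0 or 1, as documented
 * above. The helper name is hypothetical and assumes <stdint.h>/<string.h>.
 */
static inline float
set_sign_ref(float a, uint32_t sign)
{
   uint32_t val;
   memcpy(&val, &a, sizeof val);             /* reinterpret as int */
   val = (val & 0x7fffffffu) | (sign << 31); /* clear sign, OR new one in */
   memcpy(&a, &val, sizeof a);
   return a;
}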
891
892
893 /**
894 * Convert vector of (or scalar) int to vector of (or scalar) float.
895 */
896 LLVMValueRef
897 lp_build_int_to_float(struct lp_build_context *bld,
898 LLVMValueRef a)
899 {
900 const struct lp_type type = bld->type;
901 LLVMTypeRef vec_type = lp_build_vec_type(type);
902
903 assert(type.floating);
904
905 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
906 }
907
908
909
910 enum lp_build_round_sse41_mode
911 {
912 LP_BUILD_ROUND_SSE41_NEAREST = 0,
913 LP_BUILD_ROUND_SSE41_FLOOR = 1,
914 LP_BUILD_ROUND_SSE41_CEIL = 2,
915 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
916 };
917
918
919 static INLINE LLVMValueRef
920 lp_build_round_sse41(struct lp_build_context *bld,
921 LLVMValueRef a,
922 enum lp_build_round_sse41_mode mode)
923 {
924 const struct lp_type type = bld->type;
925 LLVMTypeRef vec_type = lp_build_vec_type(type);
926 const char *intrinsic;
927
928 assert(type.floating);
929 assert(type.width*type.length == 128);
930 assert(lp_check_value(type, a));
931 assert(util_cpu_caps.has_sse4_1);
932
933 switch(type.width) {
934 case 32:
935 intrinsic = "llvm.x86.sse41.round.ps";
936 break;
937 case 64:
938 intrinsic = "llvm.x86.sse41.round.pd";
939 break;
940 default:
941 assert(0);
942 return bld->undef;
943 }
944
945 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
946 LLVMConstInt(LLVMInt32Type(), mode, 0));
947 }
948
949
950 /**
951 * Return the integer part of a float (vector) value. The returned value is
952 * a float (vector).
953 * Ex: trunc(-1.5) = -1.0
954 */
955 LLVMValueRef
956 lp_build_trunc(struct lp_build_context *bld,
957 LLVMValueRef a)
958 {
959 const struct lp_type type = bld->type;
960
961 assert(type.floating);
962 assert(lp_check_value(type, a));
963
964 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
965 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
966 else {
967 LLVMTypeRef vec_type = lp_build_vec_type(type);
968 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
969 LLVMValueRef res;
970 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
971 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
972 return res;
973 }
974 }
975
976
977 /**
978 * Return float (vector) rounded to nearest integer (vector). The returned
979 * value is a float (vector).
980 * Ex: round(0.9) = 1.0
981 * Ex: round(-1.5) = -2.0
982 */
983 LLVMValueRef
984 lp_build_round(struct lp_build_context *bld,
985 LLVMValueRef a)
986 {
987 const struct lp_type type = bld->type;
988
989 assert(type.floating);
990 assert(lp_check_value(type, a));
991
992 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
993 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
994 else {
995 LLVMTypeRef vec_type = lp_build_vec_type(type);
996 LLVMValueRef res;
997 res = lp_build_iround(bld, a);
998 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
999 return res;
1000 }
1001 }
1002
1003
1004 /**
1005 * Return floor of float (vector), result is a float (vector)
1006 * Ex: floor(1.1) = 1.0
1007 * Ex: floor(-1.1) = -2.0
1008 */
1009 LLVMValueRef
1010 lp_build_floor(struct lp_build_context *bld,
1011 LLVMValueRef a)
1012 {
1013 const struct lp_type type = bld->type;
1014
1015 assert(type.floating);
1016 assert(lp_check_value(type, a));
1017
1018 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1019 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1020 else {
1021 LLVMTypeRef vec_type = lp_build_vec_type(type);
1022 LLVMValueRef res;
1023 res = lp_build_ifloor(bld, a);
1024 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1025 return res;
1026 }
1027 }
1028
1029
1030 /**
1031 * Return ceiling of float (vector), returning float (vector).
1032 * Ex: ceil( 1.1) = 2.0
1033 * Ex: ceil(-1.1) = -1.0
1034 */
1035 LLVMValueRef
1036 lp_build_ceil(struct lp_build_context *bld,
1037 LLVMValueRef a)
1038 {
1039 const struct lp_type type = bld->type;
1040
1041 assert(type.floating);
1042 assert(lp_check_value(type, a));
1043
1044 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1045 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1046 else {
1047 LLVMTypeRef vec_type = lp_build_vec_type(type);
1048 LLVMValueRef res;
1049 res = lp_build_iceil(bld, a);
1050 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1051 return res;
1052 }
1053 }
1054
1055
1056 /**
1057 * Return fractional part of 'a' computed as a - floor(a)
1058 * Typically used in texture coord arithmetic.
1059 */
1060 LLVMValueRef
1061 lp_build_fract(struct lp_build_context *bld,
1062 LLVMValueRef a)
1063 {
1064 assert(bld->type.floating);
1065 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1066 }
1067
1068
1069 /**
1070 * Return the integer part of a float (vector) value. The returned value is
1071 * an integer (vector).
1072 * Ex: itrunc(-1.5) = -1
1073 */
1074 LLVMValueRef
1075 lp_build_itrunc(struct lp_build_context *bld,
1076 LLVMValueRef a)
1077 {
1078 const struct lp_type type = bld->type;
1079 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1080
1081 assert(type.floating);
1082 assert(lp_check_value(type, a));
1083
1084 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1085 }
1086
1087
1088 /**
1089 * Return float (vector) rounded to nearest integer (vector). The returned
1090 * value is an integer (vector).
1091 * Ex: iround(0.9) = 1
1092 * Ex: iround(-1.5) = -2
1093 */
1094 LLVMValueRef
1095 lp_build_iround(struct lp_build_context *bld,
1096 LLVMValueRef a)
1097 {
1098 const struct lp_type type = bld->type;
1099 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1100 LLVMValueRef res;
1101
1102 assert(type.floating);
1103
1104 assert(lp_check_value(type, a));
1105
1106 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1107 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1108 }
1109 else {
1110 LLVMTypeRef vec_type = lp_build_vec_type(type);
1111 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1112 LLVMValueRef sign;
1113 LLVMValueRef half;
1114
1115 /* get sign bit */
1116 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1117 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1118
1119 /* sign * 0.5 */
1120 half = lp_build_const_vec(type, 0.5);
1121 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1122 half = LLVMBuildOr(bld->builder, sign, half, "");
1123 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1124
1125 res = LLVMBuildFAdd(bld->builder, a, half, "");
1126 }
1127
1128 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1129
1130 return res;
1131 }
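
/*
 * Illustrative scalar sketch, not part of the original file: the non-SSE4.1
 * path above, which copies the sign of 'a' onto 0.5, adds, and truncates so
 * that halves round away from zero (iround(-1.5) = -2). The helper name is
 * hypothetical and assumes <stdint.h>/<string.h>.
 */
static inline int32_t
iround_ref(float a)
{
   float half = 0.5f;
   uint32_t ai, hi;
   memcpy(&ai, &a, sizeof ai);
   memcpy(&hi, &half, sizeof hi);
   hi |= ai & 0x80000000u;        /* half = +/-0.5 with the sign of a */
   memcpy(&half, &hi, sizeof half);
   return (int32_t)(a + half);    /* FPToSI truncates toward zero */
}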
1132
1133
1134 /**
1135 * Return floor of float (vector), result is an int (vector)
1136 * Ex: ifloor(1.1) = 1
1137 * Ex: ifloor(-1.1) = -2
1138 */
1139 LLVMValueRef
1140 lp_build_ifloor(struct lp_build_context *bld,
1141 LLVMValueRef a)
1142 {
1143 const struct lp_type type = bld->type;
1144 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1145 LLVMValueRef res;
1146
1147 assert(type.floating);
1148 assert(lp_check_value(type, a));
1149
1150 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1151 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1152 }
1153 else {
1154 /* Take the sign bit and add it to 1 constant */
1155 LLVMTypeRef vec_type = lp_build_vec_type(type);
1156 unsigned mantissa = lp_mantissa(type);
1157 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1158 LLVMValueRef sign;
1159 LLVMValueRef offset;
1160
1161 /* sign = a < 0 ? ~0 : 0 */
1162 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1163 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1164 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1165
1166 /* offset = -0.99999(9)f */
1167 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1168 offset = LLVMConstBitCast(offset, int_vec_type);
1169
1170 /* offset = a < 0 ? offset : 0.0f */
1171 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1172 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1173
1174 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1175 }
1176
1177 /* round to nearest (toward zero) */
1178 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1179
1180 return res;
1181 }
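
/*
 * Illustrative scalar sketch, not part of the original file: the offset
 * trick above, which nudges negative values down by just under 1 before
 * truncating toward zero, e.g. (int)(-1.1 - 0.99999988) = -2. Like the
 * vector code, it is an approximation and can be off for values extremely
 * close to an integer. The helper name is hypothetical.
 */
static inline int32_t
ifloor_ref(float a)
{
   const float offset = -0.99999988f; /* ~ -(2^23 - 10)/2^23 */
   return (int32_t)(a < 0.0f ? a + offset : a);
}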
1182
1183
1184 /**
1185 * Return ceiling of float (vector), returning int (vector).
1186 * Ex: iceil( 1.1) = 2
1187 * Ex: iceil(-1.1) = -1
1188 */
1189 LLVMValueRef
1190 lp_build_iceil(struct lp_build_context *bld,
1191 LLVMValueRef a)
1192 {
1193 const struct lp_type type = bld->type;
1194 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1195 LLVMValueRef res;
1196
1197 assert(type.floating);
1198 assert(lp_check_value(type, a));
1199
1200 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1201 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1202 }
1203 else {
1204 LLVMTypeRef vec_type = lp_build_vec_type(type);
1205 unsigned mantissa = lp_mantissa(type);
1206 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1207 LLVMValueRef sign;
1208 LLVMValueRef offset;
1209
1210 /* sign = a < 0 ? 0 : ~0 */
1211 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1212 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1213 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1214 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1215
1216 /* offset = 0.99999(9)f */
1217 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1218 offset = LLVMConstBitCast(offset, int_vec_type);
1219
1220 /* offset = a < 0 ? 0.0 : offset */
1221 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1222 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1223
1224 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1225 }
1226
1227 /* round to nearest (toward zero) */
1228 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1229
1230 return res;
1231 }
1232
1233
1234 LLVMValueRef
1235 lp_build_sqrt(struct lp_build_context *bld,
1236 LLVMValueRef a)
1237 {
1238 const struct lp_type type = bld->type;
1239 LLVMTypeRef vec_type = lp_build_vec_type(type);
1240 char intrinsic[32];
1241
1242 assert(lp_check_value(type, a));
1243
1244 /* TODO: optimize the constant case */
1245
1246
1247 assert(type.floating);
1248 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1249
1250 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1251 }
1252
1253
1254 /**
1255 * Do one Newton-Raphson step to improve reciprocal precision:
1256 *
1257 * x_{i+1} = x_i * (2 - a * x_i)
1258 *
1259 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1260 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1261 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
1262 * halo. It would be necessary to clamp the argument to prevent this.
1263 *
1264 * See also:
1265 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1266 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1267 */
1268 static INLINE LLVMValueRef
1269 lp_build_rcp_refine(struct lp_build_context *bld,
1270 LLVMValueRef a,
1271 LLVMValueRef rcp_a)
1272 {
1273 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1274 LLVMValueRef res;
1275
1276 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1277 res = LLVMBuildFSub(bld->builder, two, res, "");
1278 res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
1279
1280 return res;
1281 }
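
/*
 * Illustrative scalar sketch, not part of the original file: one
 * Newton-Raphson refinement of a reciprocal seed (e.g. from RCPPS), which
 * roughly doubles the number of correct bits per step. The helper name is
 * hypothetical.
 */
static inline float
rcp_refine_ref(float a, float x)
{
   return x * (2.0f - a * x);     /* x_{i+1} = x_i * (2 - a * x_i) */
}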
1282
1283
1284 LLVMValueRef
1285 lp_build_rcp(struct lp_build_context *bld,
1286 LLVMValueRef a)
1287 {
1288 const struct lp_type type = bld->type;
1289
1290 assert(lp_check_value(type, a));
1291
1292 if(a == bld->zero)
1293 return bld->undef;
1294 if(a == bld->one)
1295 return bld->one;
1296 if(a == bld->undef)
1297 return bld->undef;
1298
1299 assert(type.floating);
1300
1301 if(LLVMIsConstant(a))
1302 return LLVMConstFDiv(bld->one, a);
1303
1304 /*
1305 * We don't use RCPPS because:
1306 * - it only has 10 bits of precision
1307 * - it doesn't even get the reciprocal of 1.0 exactly
1308 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1309 * - for recent processors the benefit over DIVPS is marginal, and case
1310 * dependent
1311 *
1312 * We could still use it on certain processors if benchmarks show that the
1313 * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
1314 * particular uses that require fewer workarounds.
1315 */
1316
1317 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1318 const unsigned num_iterations = 0;
1319 LLVMValueRef res;
1320 unsigned i;
1321
1322 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1323
1324 for (i = 0; i < num_iterations; ++i) {
1325 res = lp_build_rcp_refine(bld, a, res);
1326 }
1327
1328 return res;
1329 }
1330
1331 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1332 }
1333
1334
1335 /**
1336 * Do one Newton-Raphson step to improve rsqrt precision:
1337 *
1338 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1339 *
1340 * See also:
1341 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1342 */
1343 static INLINE LLVMValueRef
1344 lp_build_rsqrt_refine(struct lp_build_context *bld,
1345 LLVMValueRef a,
1346 LLVMValueRef rsqrt_a)
1347 {
1348 LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
1349 LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
1350 LLVMValueRef res;
1351
1352 res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
1353 res = LLVMBuildFMul(bld->builder, a, res, "");
1354 res = LLVMBuildFSub(bld->builder, three, res, "");
1355 res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
1356 res = LLVMBuildFMul(bld->builder, half, res, "");
1357
1358 return res;
1359 }
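
/*
 * Illustrative scalar sketch, not part of the original file: the same
 * refinement for a reciprocal square root seed (e.g. from RSQRTPS). The
 * helper name is hypothetical.
 */
static inline float
rsqrt_refine_ref(float a, float x)
{
   return 0.5f * x * (3.0f - a * x * x); /* x_{i+1} = 0.5 x_i (3 - a x_i^2) */
}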
1360
1361
1362 /**
1363 * Generate 1/sqrt(a)
1364 */
1365 LLVMValueRef
1366 lp_build_rsqrt(struct lp_build_context *bld,
1367 LLVMValueRef a)
1368 {
1369 const struct lp_type type = bld->type;
1370
1371 assert(lp_check_value(type, a));
1372
1373 assert(type.floating);
1374
1375 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1376 const unsigned num_iterations = 0;
1377 LLVMValueRef res;
1378 unsigned i;
1379
1380 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1381
1382 for (i = 0; i < num_iterations; ++i) {
1383 res = lp_build_rsqrt_refine(bld, a, res);
1384 }
1385
1386 return res;
1387 }
1388
1389 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1390 }
1391
1392
1393 static inline LLVMValueRef
1394 lp_build_const_v4si(unsigned long value)
1395 {
1396 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1397 LLVMValueRef elements[4] = { element, element, element, element };
1398 return LLVMConstVector(elements, 4);
1399 }
1400
1401 static inline LLVMValueRef
1402 lp_build_const_v4sf(float value)
1403 {
1404 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1405 LLVMValueRef elements[4] = { element, element, element, element };
1406 return LLVMConstVector(elements, 4);
1407 }
1408
1409
1410 /**
1411 * Generate sin(a) using SSE2
1412 */
1413 LLVMValueRef
1414 lp_build_sin(struct lp_build_context *bld,
1415 LLVMValueRef a)
1416 {
1417 struct lp_type int_type = lp_int_type(bld->type);
1418 LLVMBuilderRef b = bld->builder;
1419 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1420 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1421
1422 /*
1423 * take the absolute value,
1424 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1425 */
1426
1427 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1428 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1429
1430 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1431 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1432
1433 /*
1434 * extract the sign bit (upper one)
1435 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1436 */
1437 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1438 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1439
1440 /*
1441 * scale by 4/Pi
1442 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1443 */
1444
1445 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1446 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1447
1448 /*
1449 * store the integer part of y in mm0
1450 * emm2 = _mm_cvttps_epi32(y);
1451 */
1452
1453 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1454
1455 /*
1456 * j=(j+1) & (~1) (see the cephes sources)
1457 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1458 */
1459
1460 LLVMValueRef all_one = lp_build_const_v4si(1);
1461 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1462 /*
1463 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1464 */
1465 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1466 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1467
1468 /*
1469 * y = _mm_cvtepi32_ps(emm2);
1470 */
1471 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1472
1473 /* get the swap sign flag
1474 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1475 */
1476 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1477 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1478
1479 /*
1480 * emm2 = _mm_slli_epi32(emm0, 29);
1481 */
1482 LLVMValueRef const_29 = lp_build_const_v4si(29);
1483 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1484
1485 /*
1486 * get the polynomial selection mask
1487 * there is one polynomial for 0 <= x <= Pi/4
1488 * and another one for Pi/4 < x <= Pi/2
1489 * Both branches will be computed.
1490 *
1491 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1492 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1493 */
1494
1495 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1496 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1497 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1498 emm2_3, lp_build_const_v4si(0));
1499 /*
1500 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1501 */
1502 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1503
1504 /*
1505 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1506 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1507 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1508 */
1509 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1510 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1511 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1512
1513 /*
1514 * The magic pass: "Extended precision modular arithmetic"
1515 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1516 * xmm1 = _mm_mul_ps(y, xmm1);
1517 * xmm2 = _mm_mul_ps(y, xmm2);
1518 * xmm3 = _mm_mul_ps(y, xmm3);
1519 */
1520 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1521 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1522 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1523
1524 /*
1525 * x = _mm_add_ps(x, xmm1);
1526 * x = _mm_add_ps(x, xmm2);
1527 * x = _mm_add_ps(x, xmm3);
1528 */
1529
1530 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1531 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1532 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1533
1534 /*
1535 * Evaluate the first polynomial (0 <= x <= Pi/4)
1536 *
1537 * z = _mm_mul_ps(x,x);
1538 */
1539 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1540
1541 /*
1542 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1543 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1544 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1545 */
1546 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1547 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1548 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1549
1550 /*
1551 * y = *(v4sf*)_ps_coscof_p0;
1552 * y = _mm_mul_ps(y, z);
1553 */
1554 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1555 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1556 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1557 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1558 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1559 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1560
1561
1562 /*
1563 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1564 * y = _mm_sub_ps(y, tmp);
1565 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1566 */
1567 LLVMValueRef half = lp_build_const_v4sf(0.5);
1568 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1569 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1570 LLVMValueRef one = lp_build_const_v4sf(1.0);
1571 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1572
1573 /*
1574 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1575 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1576 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1577 */
1578 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1579 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1580 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1581
1582 /*
1583 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1584 *
1585 * y2 = *(v4sf*)_ps_sincof_p0;
1586 * y2 = _mm_mul_ps(y2, z);
1587 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1588 * y2 = _mm_mul_ps(y2, z);
1589 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1590 * y2 = _mm_mul_ps(y2, z);
1591 * y2 = _mm_mul_ps(y2, x);
1592 * y2 = _mm_add_ps(y2, x);
1593 */
1594
1595 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1596 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1597 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1598 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1599 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1600 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1601 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1602
1603 /*
1604 * select the correct result from the two polynomials
1605 * xmm3 = poly_mask;
1606 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1607 * y = _mm_andnot_ps(xmm3, y);
1608 * y = _mm_add_ps(y,y2);
1609 */
1610 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1611 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1612 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1613 LLVMValueRef inv = lp_build_const_v4si(~0);
1614 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1615 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1616 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1617
1618 /*
1619 * update the sign
1620 * y = _mm_xor_ps(y, sign_bit);
1621 */
1622 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1623 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1624 return y_result;
1625 }
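
/*
 * Illustrative scalar sketch, not part of the original file: the
 * cephes-style scheme built above, for one float and moderate |x| --
 * extract the sign, reduce to [0, Pi/4] while tracking the octant, pick
 * the sine or cosine polynomial, and restore the sign. The helper name is
 * hypothetical; the constants are the ones embedded above.
 */
static inline float
sin_ref(float x)
{
   float sign = x < 0.0f ? -1.0f : 1.0f;
   x = x < 0.0f ? -x : x;
   int j = ((int)(x * 1.27323954473516f) + 1) & ~1;   /* scale by 4/Pi */
   float y = (float)j;
   float z, r;
   if (j & 4)
      sign = -sign;                                   /* swap sign flag */
   /* extended precision modular arithmetic */
   x = ((x + y * -0.78515625f) + y * -2.4187564849853515625e-4f)
       + y * -3.77489497744594108e-8f;
   z = x * x;
   if (j & 2)   /* cosine polynomial */
      r = ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
           + 4.166664568298827e-2f) * z * z - 0.5f * z + 1.0f;
   else         /* sine polynomial */
      r = ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z
           - 1.6666654611e-1f) * z * x + x;
   return sign * r;
}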
1626
1627
1628 /**
1629 * Generate cos(a) using SSE2
1630 */
1631 LLVMValueRef
1632 lp_build_cos(struct lp_build_context *bld,
1633 LLVMValueRef a)
1634 {
1635 struct lp_type int_type = lp_int_type(bld->type);
1636 LLVMBuilderRef b = bld->builder;
1637 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1638 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1639
1640 /*
1641 * take the absolute value,
1642 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1643 */
1644
1645 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1646 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1647
1648 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1649 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1650
1651 /*
1652 * scale by 4/Pi
1653 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1654 */
1655
1656 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1657 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1658
1659 /*
1660 * store the integer part of y in mm0
1661 * emm2 = _mm_cvttps_epi32(y);
1662 */
1663
1664 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1665
1666 /*
1667 * j=(j+1) & (~1) (see the cephes sources)
1668 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1669 */
1670
1671 LLVMValueRef all_one = lp_build_const_v4si(1);
1672 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1673 /*
1674 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1675 */
1676 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1677 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1678
1679 /*
1680 * y = _mm_cvtepi32_ps(emm2);
1681 */
1682 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1683
1684
1685 /*
1686 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1687 */
1688 LLVMValueRef const_2 = lp_build_const_v4si(2);
1689 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1690
1691
1692 /* get the swap sign flag
1693 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1694 */
1695 LLVMValueRef inv = lp_build_const_v4si(~0);
1696 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1697 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1698 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1699
1700 /*
1701 * emm2 = _mm_slli_epi32(emm0, 29);
1702 */
1703 LLVMValueRef const_29 = lp_build_const_v4si(29);
1704 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1705
1706 /*
1707 * get the polynomial selection mask
1708 * there is one polynomial for 0 <= x <= Pi/4
1709 * and another one for Pi/4 < x <= Pi/2
1710 * Both branches will be computed.
1711 *
1712 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1713 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1714 */
1715
1716 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1717 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1718 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1719 emm2_3, lp_build_const_v4si(0));
1720
1721 /*
1722 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1723 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1724 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1725 */
1726 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1727 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1728 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1729
1730 /*
1731 * The magic pass: "Extended precision modular arithmetic"
1732 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1733 * xmm1 = _mm_mul_ps(y, xmm1);
1734 * xmm2 = _mm_mul_ps(y, xmm2);
1735 * xmm3 = _mm_mul_ps(y, xmm3);
1736 */
1737 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1738 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1739 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1740
1741 /*
1742 * x = _mm_add_ps(x, xmm1);
1743 * x = _mm_add_ps(x, xmm2);
1744 * x = _mm_add_ps(x, xmm3);
1745 */
1746
1747 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1748 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1749 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1750
1751 /*
1752 * Evaluate the first polynomial (0 <= x <= Pi/4)
1753 *
1754 * z = _mm_mul_ps(x,x);
1755 */
1756 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1757
1758 /*
1759 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1760 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1761 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1762 */
1763 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1764 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1765 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1766
1767 /*
1768 * y = *(v4sf*)_ps_coscof_p0;
1769 * y = _mm_mul_ps(y, z);
1770 */
1771 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1772 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1773 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1774 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1775 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1776 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1777
1778
1779 /*
1780 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1781 * y = _mm_sub_ps(y, tmp);
1782 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1783 */
1784 LLVMValueRef half = lp_build_const_v4sf(0.5);
1785 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1786 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1787 LLVMValueRef one = lp_build_const_v4sf(1.0);
1788 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1789
1790 /*
1791 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1792 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1793 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1794 */
1795 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1796 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1797 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1798
1799 /*
1800 * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1801 *
1802 * y2 = *(v4sf*)_ps_sincof_p0;
1803 * y2 = _mm_mul_ps(y2, z);
1804 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1805 * y2 = _mm_mul_ps(y2, z);
1806 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1807 * y2 = _mm_mul_ps(y2, z);
1808 * y2 = _mm_mul_ps(y2, x);
1809 * y2 = _mm_add_ps(y2, x);
1810 */
1811
1812 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1813 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1814 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1815 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1816 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1817 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1818 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1819
1820 /*
1821 * select the correct result from the two polynomials
1822 * xmm3 = poly_mask;
1823 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1824 * y = _mm_andnot_ps(xmm3, y);
1825 * y = _mm_add_ps(y,y2);
1826 */
1827 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1828 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1829 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1830 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1831 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1832 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1833
1834 /*
1835 * update the sign
1836 * y = _mm_xor_ps(y, sign_bit);
1837 */
1838 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1839 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1840 return y_result;
1841 }
1842
1843
1844 /**
1845 * Generate pow(x, y)
1846 */
1847 LLVMValueRef
1848 lp_build_pow(struct lp_build_context *bld,
1849 LLVMValueRef x,
1850 LLVMValueRef y)
1851 {
1852 /* TODO: optimize the constant case */
1853 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1854 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1855 __FUNCTION__);
1856
1857 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1858 }
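
/*
 * Illustrative scalar sketch, not part of the original file: the identity
 * used above, pow(x, y) = exp2(y * log2(x)), valid for x > 0. Assumes the
 * C99 <math.h> functions; the helper name is hypothetical.
 */
static inline float
pow_ref(float x, float y)
{
   return exp2f(y * log2f(x));
}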
1859
1860
1861 /**
1862 * Generate exp(x)
1863 */
1864 LLVMValueRef
1865 lp_build_exp(struct lp_build_context *bld,
1866 LLVMValueRef x)
1867 {
1868 /* log2(e) = 1/log(2) */
1869 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1870
1871 assert(lp_check_value(bld->type, x));
1872
1873 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1874 }
1875
1876
1877 /**
1878 * Generate log(x)
1879 */
1880 LLVMValueRef
1881 lp_build_log(struct lp_build_context *bld,
1882 LLVMValueRef x)
1883 {
1884 /* log(2) */
1885 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1886
1887 assert(lp_check_value(bld->type, x));
1888
1889 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1890 }
1891
1892
1893 /**
1894 * Generate polynomial.
1895 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1896 */
1897 static LLVMValueRef
1898 lp_build_polynomial(struct lp_build_context *bld,
1899 LLVMValueRef x,
1900 const double *coeffs,
1901 unsigned num_coeffs)
1902 {
1903 const struct lp_type type = bld->type;
1904 LLVMValueRef res = NULL;
1905 unsigned i;
1906
1907 assert(lp_check_value(bld->type, x));
1908
1909 /* TODO: optimize the constant case */
1910 if(LLVMIsConstant(x))
1911 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1912 __FUNCTION__);
1913
1914 for (i = num_coeffs; i--; ) {
1915 LLVMValueRef coeff;
1916
1917 coeff = lp_build_const_vec(type, coeffs[i]);
1918
1919 if(res)
1920 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1921 else
1922 res = coeff;
1923 }
1924
1925 if(res)
1926 return res;
1927 else
1928 return bld->undef;
1929 }
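

/*
 * The loop above is Horner's scheme, run from the highest coefficient
 * down: coeffs[0] + x*(coeffs[1] + x*(coeffs[2] + ...)).  Scalar sketch
 * (kept out of the build; helper name is hypothetical):
 */
#if 0
static double
polynomial_horner(double x, const double *coeffs, unsigned num_coeffs)
{
   double res = 0.0;
   unsigned i;
   for (i = num_coeffs; i--; )
      res = coeffs[i] + x * res;
   return res;
}
#endif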
1930
1931
1932 /**
1933 * Minimax polynomial fit of 2**x, in range [0, 1[
1934 */
1935 const double lp_build_exp2_polynomial[] = {
1936 #if EXP_POLY_DEGREE == 5
1937 0.999999999690134838155,
1938 0.583974334321735217258,
1939 0.164553105719676828492,
1940 0.0292811063701710962255,
1941 0.00354944426657875141846,
1942 0.000296253726543423377365
1943 #elif EXP_POLY_DEGREE == 4
1944 1.00000001502262084505,
1945 0.563586057338685991394,
1946 0.150436017652442413623,
1947 0.0243220604213317927308,
1948 0.0025359088446580436489
1949 #elif EXP_POLY_DEGREE == 3
1950 0.999925218562710312959,
1951 0.695833540494823811697,
1952 0.226067155427249155588,
1953 0.0780245226406372992967
1954 #elif EXP_POLY_DEGREE == 2
1955 1.00172476321474503578,
1956 0.657636275736077639316,
1957 0.33718943461968720704
1958 #else
1959 #error
1960 #endif
1961 };
1962
1963
1964 void
1965 lp_build_exp2_approx(struct lp_build_context *bld,
1966 LLVMValueRef x,
1967 LLVMValueRef *p_exp2_int_part,
1968 LLVMValueRef *p_frac_part,
1969 LLVMValueRef *p_exp2)
1970 {
1971 const struct lp_type type = bld->type;
1972 LLVMTypeRef vec_type = lp_build_vec_type(type);
1973 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1974 LLVMValueRef ipart = NULL;
1975 LLVMValueRef fpart = NULL;
1976 LLVMValueRef expipart = NULL;
1977 LLVMValueRef expfpart = NULL;
1978 LLVMValueRef res = NULL;
1979
1980 assert(lp_check_value(bld->type, x));
1981
1982 if(p_exp2_int_part || p_frac_part || p_exp2) {
1983 /* TODO: optimize the constant case */
1984 if(LLVMIsConstant(x))
1985 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1986 __FUNCTION__);
1987
1988 assert(type.floating && type.width == 32);
1989
1990 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1991 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1992
1993 /* ipart = floor(x) */
1994 ipart = lp_build_floor(bld, x);
1995
1996 /* fpart = x - ipart */
1997 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
1998 }
1999
2000 if(p_exp2_int_part || p_exp2) {
2001 /* expipart = (float) (1 << ipart) */
2002 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
2003 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
2004 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
2005 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
2006 }
2007
2008 if(p_exp2) {
2009 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2010 Elements(lp_build_exp2_polynomial));
2011
2012 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2013 }
2014
2015 if(p_exp2_int_part)
2016 *p_exp2_int_part = expipart;
2017
2018 if(p_frac_part)
2019 *p_frac_part = fpart;
2020
2021 if(p_exp2)
2022 *p_exp2 = res;
2023 }
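

/*
 * Scalar sketch (kept out of the build; names are hypothetical) of the
 * bit trick that turns the integer part into a power of two: for
 * -126 <= i <= 127, the float 2^i has the bit pattern (i + 127) << 23,
 * i.e. the biased exponent field alone.
 */
#if 0
union f32_bits { float f; int i; };   /* local punning union for the sketch */

static float
exp2_int_part(int ipart)
{
   union f32_bits u;
   /* exp2_int_part(3) == 8.0f */
   u.i = (ipart + 127) << 23;
   return u.f;
}
#endif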
2024
2025
2026 LLVMValueRef
2027 lp_build_exp2(struct lp_build_context *bld,
2028 LLVMValueRef x)
2029 {
2030 LLVMValueRef res;
2031 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2032 return res;
2033 }
2034
2035
2036 /**
2037 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2038 * These coefficients can be generated with
2039 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2040 */
2041 const double lp_build_log2_polynomial[] = {
2042 #if LOG_POLY_DEGREE == 6
2043 3.11578814719469302614,
2044 -3.32419399085241980044,
2045 2.59883907202499966007,
2046 -1.23152682416275988241,
2047 0.318212422185251071475,
2048 -0.0344359067839062357313
2049 #elif LOG_POLY_DEGREE == 5
2050 2.8882704548164776201,
2051 -2.52074962577807006663,
2052 1.48116647521213171641,
2053 -0.465725644288844778798,
2054 0.0596515482674574969533
2055 #elif LOG_POLY_DEGREE == 4
2056 2.61761038894603480148,
2057 -1.75647175389045657003,
2058 0.688243882994381274313,
2059 -0.107254423828329604454
2060 #elif LOG_POLY_DEGREE == 3
2061 2.28330284476918490682,
2062 -1.04913055217340124191,
2063 0.204446009836232697516
2064 #else
2065 #error
2066 #endif
2067 };
2068
2069
2070 /**
2071 * See http://www.devmaster.net/forums/showthread.php?p=43580
2072 */
2073 void
2074 lp_build_log2_approx(struct lp_build_context *bld,
2075 LLVMValueRef x,
2076 LLVMValueRef *p_exp,
2077 LLVMValueRef *p_floor_log2,
2078 LLVMValueRef *p_log2)
2079 {
2080 const struct lp_type type = bld->type;
2081 LLVMTypeRef vec_type = lp_build_vec_type(type);
2082 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2083
2084 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2085 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2086 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2087
2088 LLVMValueRef i = NULL;
2089 LLVMValueRef exp = NULL;
2090 LLVMValueRef mant = NULL;
2091 LLVMValueRef logexp = NULL;
2092 LLVMValueRef logmant = NULL;
2093 LLVMValueRef res = NULL;
2094
2095 assert(lp_check_value(bld->type, x));
2096
2097 if(p_exp || p_floor_log2 || p_log2) {
2098 /* TODO: optimize the constant case */
2099 if(LLVMIsConstant(x))
2100 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2101 __FUNCTION__);
2102
2103 assert(type.floating && type.width == 32);
2104
2105 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2106
2107 /* exp = (float) exponent(x) */
2108 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2109 }
2110
2111 if(p_floor_log2 || p_log2) {
2112 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2113 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2114 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2115 }
2116
2117 if(p_log2) {
2118 /* mant = (float) mantissa(x) */
2119 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2120 mant = LLVMBuildOr(bld->builder, mant, one, "");
2121 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2122
2123 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2124 Elements(lp_build_log2_polynomial));
2125
2126 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2127 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2128
2129 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2130 }
2131
2132 if(p_exp) {
2133 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2134 *p_exp = exp;
2135 }
2136
2137 if(p_floor_log2)
2138 *p_floor_log2 = logexp;
2139
2140 if(p_log2)
2141 *p_log2 = res;
2142 }
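

/*
 * Scalar sketch (kept out of the build; names are hypothetical) of the
 * split above: floor(log2(x)) is read straight from the biased exponent
 * field, and the mantissa is renormalized into [1, 2[ by overwriting
 * the exponent bits with 127 (0x3f800000 is the bit pattern of 1.0f).
 */
#if 0
union f32_bits2 { float f; int i; };   /* local punning union for the sketch */

static void
log2_split(float x, int *exp, float *mant)
{
   union f32_bits2 u;
   u.f = x;
   *exp = ((u.i & 0x7f800000) >> 23) - 127;
   u.i = (u.i & 0x007fffff) | 0x3f800000;
   *mant = u.f;
}
#endif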
2143
2144
2145 LLVMValueRef
2146 lp_build_log2(struct lp_build_context *bld,
2147 LLVMValueRef x)
2148 {
2149 LLVMValueRef res;
2150 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2151 return res;
2152 }