gallivm: Add unorm support to lp_build_lerp()
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all the basic arithmetic operations we care about
34 * (most notably min/max and saturated operations), so it is often necessary
35 * to resort to machine-specific intrinsics directly. The functions here hide
36 * all these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here, because:
39 * - it is very easy, given we have all the necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - we often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_debug.h"
60 #include "lp_bld_arit.h"
61
62
63 #define EXP_POLY_DEGREE 3
64
65 #define LOG_POLY_DEGREE 5
66
67
68 /**
69 * Generate min(a, b)
70 * No checks for special-case values (a or b being 0 or 1) are done.
71 */
72 static LLVMValueRef
73 lp_build_min_simple(struct lp_build_context *bld,
74 LLVMValueRef a,
75 LLVMValueRef b)
76 {
77 const struct lp_type type = bld->type;
78 const char *intrinsic = NULL;
79 LLVMValueRef cond;
80
81 assert(lp_check_value(type, a));
82 assert(lp_check_value(type, b));
83
84 /* TODO: optimize the constant case */
85
86 if(type.width * type.length == 128) {
87 if(type.floating) {
88 if(type.width == 32 && util_cpu_caps.has_sse)
89 intrinsic = "llvm.x86.sse.min.ps";
90 if(type.width == 64 && util_cpu_caps.has_sse2)
91 intrinsic = "llvm.x86.sse2.min.pd";
92 }
93 else {
94 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
95 intrinsic = "llvm.x86.sse2.pminu.b";
96 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
97 intrinsic = "llvm.x86.sse41.pminsb";
98 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
99 intrinsic = "llvm.x86.sse41.pminuw";
100 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
101 intrinsic = "llvm.x86.sse2.pmins.w";
102 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
103 intrinsic = "llvm.x86.sse41.pminud";
104 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
105 intrinsic = "llvm.x86.sse41.pminsd";
106 }
107 }
108
109 if(intrinsic)
110 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
111
112 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
113 return lp_build_select(bld, cond, a, b);
114 }
115
116
117 /**
118 * Generate max(a, b)
119 * No checks for special-case values (a or b being 0 or 1) are done.
120 */
121 static LLVMValueRef
122 lp_build_max_simple(struct lp_build_context *bld,
123 LLVMValueRef a,
124 LLVMValueRef b)
125 {
126 const struct lp_type type = bld->type;
127 const char *intrinsic = NULL;
128 LLVMValueRef cond;
129
130 assert(lp_check_value(type, a));
131 assert(lp_check_value(type, b));
132
133 /* TODO: optimize the constant case */
134
135 if(type.width * type.length == 128) {
136 if(type.floating) {
137 if(type.width == 32 && util_cpu_caps.has_sse)
138 intrinsic = "llvm.x86.sse.max.ps";
139 if(type.width == 64 && util_cpu_caps.has_sse2)
140 intrinsic = "llvm.x86.sse2.max.pd";
141 }
142 else {
143 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
144 intrinsic = "llvm.x86.sse2.pmaxu.b";
145 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
146 intrinsic = "llvm.x86.sse41.pmaxsb";
147 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
148 intrinsic = "llvm.x86.sse41.pmaxuw";
149 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
150 intrinsic = "llvm.x86.sse2.pmaxs.w";
151 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
152 intrinsic = "llvm.x86.sse41.pmaxud";
153 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
154 intrinsic = "llvm.x86.sse41.pmaxsd";
155 }
156 }
157
158 if(intrinsic)
159 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
160
161 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
162 return lp_build_select(bld, cond, a, b);
163 }
164
165
166 /**
167 * Generate 1 - a, or ~a depending on bld->type.
168 */
169 LLVMValueRef
170 lp_build_comp(struct lp_build_context *bld,
171 LLVMValueRef a)
172 {
173 const struct lp_type type = bld->type;
174
175 assert(lp_check_value(type, a));
176
177 if(a == bld->one)
178 return bld->zero;
179 if(a == bld->zero)
180 return bld->one;
181
182 if(type.norm && !type.floating && !type.fixed && !type.sign) {
183 if(LLVMIsConstant(a))
184 return LLVMConstNot(a);
185 else
186 return LLVMBuildNot(bld->builder, a, "");
187 }
188
189 if(LLVMIsConstant(a))
190 if (type.floating)
191 return LLVMConstFSub(bld->one, a);
192 else
193 return LLVMConstSub(bld->one, a);
194 else
195 if (type.floating)
196 return LLVMBuildFSub(bld->builder, bld->one, a, "");
197 else
198 return LLVMBuildSub(bld->builder, bld->one, a, "");
199 }
200
201
202 /**
203 * Generate a + b
204 */
205 LLVMValueRef
206 lp_build_add(struct lp_build_context *bld,
207 LLVMValueRef a,
208 LLVMValueRef b)
209 {
210 const struct lp_type type = bld->type;
211 LLVMValueRef res;
212
213 assert(lp_check_value(type, a));
214 assert(lp_check_value(type, b));
215
216 if(a == bld->zero)
217 return b;
218 if(b == bld->zero)
219 return a;
220 if(a == bld->undef || b == bld->undef)
221 return bld->undef;
222
223 if(bld->type.norm) {
224 const char *intrinsic = NULL;
225
226 if(a == bld->one || b == bld->one)
227 return bld->one;
228
229 if(util_cpu_caps.has_sse2 &&
230 type.width * type.length == 128 &&
231 !type.floating && !type.fixed) {
232 if(type.width == 8)
233 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
234 if(type.width == 16)
235 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
236 }
237
238 if(intrinsic)
239 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
240 }
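
/*
 * Note on the saturated path above: for 8-bit unorm, PADDUSB computes
 * min(a + b, 255) per element, e.g. 200 + 100 saturates to 255 rather than
 * wrapping to 44 as a plain 8-bit add would.
 */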
241
242 if(LLVMIsConstant(a) && LLVMIsConstant(b))
243 if (type.floating)
244 res = LLVMConstFAdd(a, b);
245 else
246 res = LLVMConstAdd(a, b);
247 else
248 if (type.floating)
249 res = LLVMBuildFAdd(bld->builder, a, b, "");
250 else
251 res = LLVMBuildAdd(bld->builder, a, b, "");
252
253 /* clamp to ceiling of 1.0 */
254 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
255 res = lp_build_min_simple(bld, res, bld->one);
256
257 /* XXX clamp to floor of -1 or 0??? */
258
259 return res;
260 }
261
262
263 /** Return the scalar sum of the elements of a */
264 LLVMValueRef
265 lp_build_sum_vector(struct lp_build_context *bld,
266 LLVMValueRef a)
267 {
268 const struct lp_type type = bld->type;
269 LLVMValueRef index, res;
270 unsigned i;
271
272 assert(lp_check_value(type, a));
273
274 if (type.length == 1) {
275 return a;
276 }
277
278 assert(!bld->type.norm);
279
280 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
281 res = LLVMBuildExtractElement(bld->builder, a, index, "");
282
283 for (i = 1; i < type.length; i++) {
284 index = LLVMConstInt(LLVMInt32Type(), i, 0);
285 if (type.floating)
286 res = LLVMBuildFAdd(bld->builder, res,
287 LLVMBuildExtractElement(bld->builder,
288 a, index, ""),
289 "");
290 else
291 res = LLVMBuildAdd(bld->builder, res,
292 LLVMBuildExtractElement(bld->builder,
293 a, index, ""),
294 "");
295 }
296
297 return res;
298 }
299
300
301 /**
302 * Generate a - b
303 */
304 LLVMValueRef
305 lp_build_sub(struct lp_build_context *bld,
306 LLVMValueRef a,
307 LLVMValueRef b)
308 {
309 const struct lp_type type = bld->type;
310 LLVMValueRef res;
311
312 assert(lp_check_value(type, a));
313 assert(lp_check_value(type, b));
314
315 if(b == bld->zero)
316 return a;
317 if(a == bld->undef || b == bld->undef)
318 return bld->undef;
319 if(a == b)
320 return bld->zero;
321
322 if(bld->type.norm) {
323 const char *intrinsic = NULL;
324
325 if(b == bld->one)
326 return bld->zero;
327
328 if(util_cpu_caps.has_sse2 &&
329 type.width * type.length == 128 &&
330 !type.floating && !type.fixed) {
331 if(type.width == 8)
332 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
333 if(type.width == 16)
334 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
335 }
336
337 if(intrinsic)
338 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
339 }
340
341 if(LLVMIsConstant(a) && LLVMIsConstant(b))
342 if (type.floating)
343 res = LLVMConstFSub(a, b);
344 else
345 res = LLVMConstSub(a, b);
346 else
347 if (type.floating)
348 res = LLVMBuildFSub(bld->builder, a, b, "");
349 else
350 res = LLVMBuildSub(bld->builder, a, b, "");
351
352 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
353 res = lp_build_max_simple(bld, res, bld->zero);
354
355 return res;
356 }
357
358
359 /**
360 * Normalized 8bit multiplication.
361 *
362 * - alpha plus one
363 *
364 * makes the following approximation to the division (Sree)
365 *
366 * a*b/255 ~= (a*(b + 1)) >> 8
367 *
368 * which is the fastest method that satisfies the following OpenGL criteria
369 *
370 * 0*0 = 0 and 255*255 = 255
371 *
372 * - geometric series
373 *
374 * takes the geometric series approximation to the division
375 *
376 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
377 *
378 * in this case just the first two terms to fit in 16bit arithmetic
379 *
380 * t/255 ~= (t + (t >> 8)) >> 8
381 *
382 * note that just by itself it doesn't satisfy the OpenGL criteria, as
383 * 255*255 would give 254, so either the special case b = 255 must be
384 * accounted for, or rounding must be used
385 *
386 * - geometric series plus rounding
387 *
388 * when using the geometric series division, instead of truncating the
389 * result, use rounding in the approximation (Jim Blinn)
390 *
391 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
392 *
393 * achieving exact results
394 *
395 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
396 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
397 * @sa Michael Herf, The "double blend trick", May 2000,
398 * http://www.stereopsis.com/doubleblend.html
399 */
400 static LLVMValueRef
401 lp_build_mul_u8n(LLVMBuilderRef builder,
402 struct lp_type i16_type,
403 LLVMValueRef a, LLVMValueRef b)
404 {
405 LLVMValueRef c8;
406 LLVMValueRef ab;
407
408 assert(!i16_type.floating);
409 assert(lp_check_value(i16_type, a));
410 assert(lp_check_value(i16_type, b));
411
412 c8 = lp_build_const_int_vec(i16_type, 8);
413
414 #if 0
415
416 /* a*b/255 ~= (a*(b + 1)) >> 8 */
417 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
418 ab = LLVMBuildMul(builder, a, b, "");
419
420 #else
421
422 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
423 ab = LLVMBuildMul(builder, a, b, "");
424 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
425 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
426
427 #endif
428
429 ab = LLVMBuildLShr(builder, ab, c8, "");
430
431 return ab;
432 }
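
/*
 * Scalar sketch of the rounding variant used above (illustrative only; the
 * mul_unorm8() helper is hypothetical, not part of this file):
 *
 *    static uint8_t mul_unorm8(uint8_t a, uint8_t b)
 *    {
 *       unsigned t = a * b;
 *       return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
 *    }
 *
 * e.g. a = b = 255: t = 65025, t + (t >> 8) + 0x80 = 65025 + 254 + 128 =
 * 65407, and 65407 >> 8 = 255, so the 255*255 = 255 criterion is met.
 */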
433
434
435 /**
436 * Generate a * b
437 */
438 LLVMValueRef
439 lp_build_mul(struct lp_build_context *bld,
440 LLVMValueRef a,
441 LLVMValueRef b)
442 {
443 const struct lp_type type = bld->type;
444 LLVMValueRef shift;
445 LLVMValueRef res;
446
447 assert(lp_check_value(type, a));
448 assert(lp_check_value(type, b));
449
450 if(a == bld->zero)
451 return bld->zero;
452 if(a == bld->one)
453 return b;
454 if(b == bld->zero)
455 return bld->zero;
456 if(b == bld->one)
457 return a;
458 if(a == bld->undef || b == bld->undef)
459 return bld->undef;
460
461 if(!type.floating && !type.fixed && type.norm) {
462 if(type.width == 8) {
463 struct lp_type i16_type = lp_wider_type(type);
464 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
465
466 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
467 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
468
469 /* PMULLW, PSRLW, PADDW */
470 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
471 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
472
473 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
474
475 return ab;
476 }
477
478 /* FIXME */
479 assert(0);
480 }
481
482 if(type.fixed)
483 shift = lp_build_const_int_vec(type, type.width/2);
484 else
485 shift = NULL;
486
487 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
488 if (type.floating)
489 res = LLVMConstFMul(a, b);
490 else
491 res = LLVMConstMul(a, b);
492 if(shift) {
493 if(type.sign)
494 res = LLVMConstAShr(res, shift);
495 else
496 res = LLVMConstLShr(res, shift);
497 }
498 }
499 else {
500 if (type.floating)
501 res = LLVMBuildFMul(bld->builder, a, b, "");
502 else
503 res = LLVMBuildMul(bld->builder, a, b, "");
504 if(shift) {
505 if(type.sign)
506 res = LLVMBuildAShr(bld->builder, res, shift, "");
507 else
508 res = LLVMBuildLShr(bld->builder, res, shift, "");
509 }
510 }
511
512 return res;
513 }
514
515
516 /**
517 * Optimized multiplication of a vector by a small integer scale factor.
518 */
519 LLVMValueRef
520 lp_build_mul_imm(struct lp_build_context *bld,
521 LLVMValueRef a,
522 int b)
523 {
524 LLVMValueRef factor;
525
526 assert(lp_check_value(bld->type, a));
527
528 if(b == 0)
529 return bld->zero;
530
531 if(b == 1)
532 return a;
533
534 if(b == -1)
535 return lp_build_negate(bld, a);
536
537 if(b == 2 && bld->type.floating)
538 return lp_build_add(bld, a, a);
539
540 if(util_is_power_of_two(b)) {
541 unsigned shift = ffs(b) - 1;
542
543 if(bld->type.floating) {
544 #if 0
545 /*
546 * Power of two multiplication by directly manipulating the exponent field.
547 *
548 * XXX: This might not always be faster; it will introduce a small error
549 * for multiplication by zero, and it will produce wrong results
550 * for Inf and NaN.
551 */
552 unsigned mantissa = lp_mantissa(bld->type);
553 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
554 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
555 a = LLVMBuildAdd(bld->builder, a, factor, "");
556 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
557 return a;
558 #endif
559 }
560 else {
561 factor = lp_build_const_vec(bld->type, shift);
562 return LLVMBuildShl(bld->builder, a, factor, "");
563 }
564 }
565
566 factor = lp_build_const_vec(bld->type, (double)b);
567 return lp_build_mul(bld, a, factor);
568 }
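
/*
 * Usage sketch (hypothetical values): for an integer type, calling
 * lp_build_mul_imm() with b = 8 emits a left shift by ffs(8) - 1 = 3 bits,
 * while b = 5 falls through to an ordinary constant multiply.
 */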
569
570
571 /**
572 * Generate a / b
573 */
574 LLVMValueRef
575 lp_build_div(struct lp_build_context *bld,
576 LLVMValueRef a,
577 LLVMValueRef b)
578 {
579 const struct lp_type type = bld->type;
580
581 assert(lp_check_value(type, a));
582 assert(lp_check_value(type, b));
583
584 if(a == bld->zero)
585 return bld->zero;
586 if(a == bld->one)
587 return lp_build_rcp(bld, b);
588 if(b == bld->zero)
589 return bld->undef;
590 if(b == bld->one)
591 return a;
592 if(a == bld->undef || b == bld->undef)
593 return bld->undef;
594
595 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
596 if (type.floating)
597 return LLVMConstFDiv(a, b);
598 else if (type.sign)
599 return LLVMConstSDiv(a, b);
600 else
601 return LLVMConstUDiv(a, b);
602 }
603
604 if(type.floating && util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
605 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
606
607 if (type.floating)
608 return LLVMBuildFDiv(bld->builder, a, b, "");
609 else if (type.sign)
610 return LLVMBuildSDiv(bld->builder, a, b, "");
611 else
612 return LLVMBuildUDiv(bld->builder, a, b, "");
613 }
614
615
616 /**
617 * Linear interpolation -- without any checks.
618 *
619 * @sa http://www.stereopsis.com/doubleblend.html
620 */
621 static INLINE LLVMValueRef
622 lp_build_lerp_simple(struct lp_build_context *bld,
623 LLVMValueRef x,
624 LLVMValueRef v0,
625 LLVMValueRef v1)
626 {
627 LLVMValueRef delta;
628 LLVMValueRef res;
629
630 assert(lp_check_value(bld->type, x));
631 assert(lp_check_value(bld->type, v0));
632 assert(lp_check_value(bld->type, v1));
633
634 delta = lp_build_sub(bld, v1, v0);
635
636 res = lp_build_mul(bld, x, delta);
637
638 res = lp_build_add(bld, v0, res);
639
640 if (bld->type.fixed) {
641 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
642 * but it will be wrong for other uses. Basically we need a more
643 * powerful lp_type, capable of further distinguishing the values
644 * interpretation from the value storage. */
645 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
646 }
647
648 return res;
649 }
650
651
652 /**
653 * Linear interpolation.
654 */
655 LLVMValueRef
656 lp_build_lerp(struct lp_build_context *bld,
657 LLVMValueRef x,
658 LLVMValueRef v0,
659 LLVMValueRef v1)
660 {
661 const struct lp_type type = bld->type;
662 LLVMValueRef res;
663
664 assert(lp_check_value(type, x));
665 assert(lp_check_value(type, v0));
666 assert(lp_check_value(type, v1));
667
668 if (type.norm) {
669 struct lp_type wide_type;
670 struct lp_build_context wide_bld;
671 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
672 LLVMValueRef shift;
673
674 assert(type.length >= 2);
675 assert(!type.sign);
676
677 /*
678 * Create a wider type, enough to hold the intermediate result of the
679 * multiplication.
680 */
681 memset(&wide_type, 0, sizeof wide_type);
682 wide_type.fixed = TRUE;
683 wide_type.width = type.width*2;
684 wide_type.length = type.length/2;
685
686 lp_build_context_init(&wide_bld, bld->builder, wide_type);
687
688 lp_build_unpack2(bld->builder, type, wide_type, x, &xl, &xh);
689 lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h);
690 lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h);
691
692 /*
693 * Scale x from [0, 255] to [0, 256]
694 */
695
696 shift = lp_build_const_int_vec(wide_type, type.width - 1);
697
698 xl = lp_build_add(&wide_bld, xl,
699 LLVMBuildAShr(bld->builder, xl, shift, ""));
700 xh = lp_build_add(&wide_bld, xh,
701 LLVMBuildAShr(bld->builder, xh, shift, ""));
702
703 /*
704 * Lerp both halves.
705 */
706
707 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
708 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);
709
710 res = lp_build_pack2(bld->builder, wide_type, type, resl, resh);
711 } else {
712 res = lp_build_lerp_simple(bld, x, v0, v1);
713 }
714
715 return res;
716 }
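
/*
 * Illustrative check of the unorm scaling above: for 8-bit unorm,
 * x = 255 becomes 255 + (255 >> 7) = 256, an exact 1.0 weight in 8.8 fixed
 * point, so the lerp returns v1 unchanged; x = 0 stays 0 and returns v0.
 */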
717
718
719 LLVMValueRef
720 lp_build_lerp_2d(struct lp_build_context *bld,
721 LLVMValueRef x,
722 LLVMValueRef y,
723 LLVMValueRef v00,
724 LLVMValueRef v01,
725 LLVMValueRef v10,
726 LLVMValueRef v11)
727 {
728 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
729 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
730 return lp_build_lerp(bld, y, v0, v1);
731 }
732
733
734 /**
735 * Generate min(a, b)
736 * Do checks for special cases.
737 */
738 LLVMValueRef
739 lp_build_min(struct lp_build_context *bld,
740 LLVMValueRef a,
741 LLVMValueRef b)
742 {
743 assert(lp_check_value(bld->type, a));
744 assert(lp_check_value(bld->type, b));
745
746 if(a == bld->undef || b == bld->undef)
747 return bld->undef;
748
749 if(a == b)
750 return a;
751
752 if(bld->type.norm) {
753 if(a == bld->zero || b == bld->zero)
754 return bld->zero;
755 if(a == bld->one)
756 return b;
757 if(b == bld->one)
758 return a;
759 }
760
761 return lp_build_min_simple(bld, a, b);
762 }
763
764
765 /**
766 * Generate max(a, b)
767 * Do checks for special cases.
768 */
769 LLVMValueRef
770 lp_build_max(struct lp_build_context *bld,
771 LLVMValueRef a,
772 LLVMValueRef b)
773 {
774 assert(lp_check_value(bld->type, a));
775 assert(lp_check_value(bld->type, b));
776
777 if(a == bld->undef || b == bld->undef)
778 return bld->undef;
779
780 if(a == b)
781 return a;
782
783 if(bld->type.norm) {
784 if(a == bld->one || b == bld->one)
785 return bld->one;
786 if(a == bld->zero)
787 return b;
788 if(b == bld->zero)
789 return a;
790 }
791
792 return lp_build_max_simple(bld, a, b);
793 }
794
795
796 /**
797 * Generate clamp(a, min, max)
798 * Do checks for special cases.
799 */
800 LLVMValueRef
801 lp_build_clamp(struct lp_build_context *bld,
802 LLVMValueRef a,
803 LLVMValueRef min,
804 LLVMValueRef max)
805 {
806 assert(lp_check_value(bld->type, a));
807 assert(lp_check_value(bld->type, min));
808 assert(lp_check_value(bld->type, max));
809
810 a = lp_build_min(bld, a, max);
811 a = lp_build_max(bld, a, min);
812 return a;
813 }
814
815
816 /**
817 * Generate abs(a)
818 */
819 LLVMValueRef
820 lp_build_abs(struct lp_build_context *bld,
821 LLVMValueRef a)
822 {
823 const struct lp_type type = bld->type;
824 LLVMTypeRef vec_type = lp_build_vec_type(type);
825
826 assert(lp_check_value(type, a));
827
828 if(!type.sign)
829 return a;
830
831 if(type.floating) {
832 /* Mask out the sign bit */
833 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
834 unsigned long long absMask = ~(1ULL << (type.width - 1));
835 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
836 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
837 a = LLVMBuildAnd(bld->builder, a, mask, "");
838 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
839 return a;
840 }
841
842 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
843 switch(type.width) {
844 case 8:
845 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
846 case 16:
847 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
848 case 32:
849 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
850 }
851 }
852
853 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
854 }
855
856
857 LLVMValueRef
858 lp_build_negate(struct lp_build_context *bld,
859 LLVMValueRef a)
860 {
861 assert(lp_check_value(bld->type, a));
862
863 #if HAVE_LLVM >= 0x0207
864 if (bld->type.floating)
865 a = LLVMBuildFNeg(bld->builder, a, "");
866 else
867 #endif
868 a = LLVMBuildNeg(bld->builder, a, "");
869
870 return a;
871 }
872
873
874 /** Return -1, 0 or +1 depending on the sign of a */
875 LLVMValueRef
876 lp_build_sgn(struct lp_build_context *bld,
877 LLVMValueRef a)
878 {
879 const struct lp_type type = bld->type;
880 LLVMValueRef cond;
881 LLVMValueRef res;
882
883 assert(lp_check_value(type, a));
884
885 /* Handle non-zero case */
886 if(!type.sign) {
887 /* if not zero then sign must be positive */
888 res = bld->one;
889 }
890 else if(type.floating) {
891 LLVMTypeRef vec_type;
892 LLVMTypeRef int_type;
893 LLVMValueRef mask;
894 LLVMValueRef sign;
895 LLVMValueRef one;
896 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
897
898 int_type = lp_build_int_vec_type(type);
899 vec_type = lp_build_vec_type(type);
900 mask = lp_build_const_int_vec(type, maskBit);
901
902 /* Take the sign bit and add it to 1 constant */
903 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
904 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
905 one = LLVMConstBitCast(bld->one, int_type);
906 res = LLVMBuildOr(bld->builder, sign, one, "");
907 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
908 }
909 else
910 {
911 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
912 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
913 res = lp_build_select(bld, cond, bld->one, minus_one);
914 }
915
916 /* Handle zero */
917 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
918 res = lp_build_select(bld, cond, bld->zero, res);
919
920 return res;
921 }
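
/*
 * e.g. for floats, a = -3.5 has its sign bit set, so ORing that bit into
 * the constant 1.0 yields -1.0; the final select then maps a == 0 to 0.
 */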
922
923
924 /**
925 * Set the sign of float vector 'a' according to 'sign'.
926 * If sign==0, return abs(a).
927 * If sign==1, return -abs(a);
928 * Other values for sign produce undefined results.
929 */
930 LLVMValueRef
931 lp_build_set_sign(struct lp_build_context *bld,
932 LLVMValueRef a, LLVMValueRef sign)
933 {
934 const struct lp_type type = bld->type;
935 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
936 LLVMTypeRef vec_type = lp_build_vec_type(type);
937 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
938 LLVMValueRef mask = lp_build_const_int_vec(type,
939 ~((unsigned long long) 1 << (type.width - 1)));
940 LLVMValueRef val, res;
941
942 assert(type.floating);
943 assert(lp_check_value(type, a));
944
945 /* val = reinterpret_cast<int>(a) */
946 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
947 /* val = val & mask */
948 val = LLVMBuildAnd(bld->builder, val, mask, "");
949 /* sign = sign << shift */
950 sign = LLVMBuildShl(bld->builder, sign, shift, "");
951 /* res = val | sign */
952 res = LLVMBuildOr(bld->builder, val, sign, "");
953 /* res = reinterpret_cast<float>(res) */
954 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
955
956 return res;
957 }
958
959
960 /**
961 * Convert vector of (or scalar) int to vector of (or scalar) float.
962 */
963 LLVMValueRef
964 lp_build_int_to_float(struct lp_build_context *bld,
965 LLVMValueRef a)
966 {
967 const struct lp_type type = bld->type;
968 LLVMTypeRef vec_type = lp_build_vec_type(type);
969
970 assert(type.floating);
971
972 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
973 }
974
975
976
977 enum lp_build_round_sse41_mode
978 {
979 LP_BUILD_ROUND_SSE41_NEAREST = 0,
980 LP_BUILD_ROUND_SSE41_FLOOR = 1,
981 LP_BUILD_ROUND_SSE41_CEIL = 2,
982 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
983 };
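
/*
 * These values match the two low rounding-control bits of the SSE4.1
 * ROUNDPS/ROUNDPD immediate operand: 00 = nearest, 01 = down (floor),
 * 10 = up (ceil), 11 = toward zero (truncate).
 */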
984
985
986 static INLINE LLVMValueRef
987 lp_build_round_sse41(struct lp_build_context *bld,
988 LLVMValueRef a,
989 enum lp_build_round_sse41_mode mode)
990 {
991 const struct lp_type type = bld->type;
992 LLVMTypeRef vec_type = lp_build_vec_type(type);
993 const char *intrinsic;
994
995 assert(type.floating);
996 assert(type.width*type.length == 128);
997 assert(lp_check_value(type, a));
998 assert(util_cpu_caps.has_sse4_1);
999
1000 switch(type.width) {
1001 case 32:
1002 intrinsic = "llvm.x86.sse41.round.ps";
1003 break;
1004 case 64:
1005 intrinsic = "llvm.x86.sse41.round.pd";
1006 break;
1007 default:
1008 assert(0);
1009 return bld->undef;
1010 }
1011
1012 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
1013 LLVMConstInt(LLVMInt32Type(), mode, 0));
1014 }
1015
1016
1017 /**
1018 * Return the integer part of a float (vector) value. The returned value is
1019 * a float (vector).
1020 * Ex: trunc(-1.5) = -1.0
1021 */
1022 LLVMValueRef
1023 lp_build_trunc(struct lp_build_context *bld,
1024 LLVMValueRef a)
1025 {
1026 const struct lp_type type = bld->type;
1027
1028 assert(type.floating);
1029 assert(lp_check_value(type, a));
1030
1031 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1032 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
1033 else {
1034 LLVMTypeRef vec_type = lp_build_vec_type(type);
1035 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1036 LLVMValueRef res;
1037 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1038 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1039 return res;
1040 }
1041 }
1042
1043
1044 /**
1045 * Return float (vector) rounded to nearest integer (vector). The returned
1046 * value is a float (vector).
1047 * Ex: round(0.9) = 1.0
1048 * Ex: round(-1.5) = -2.0
1049 */
1050 LLVMValueRef
1051 lp_build_round(struct lp_build_context *bld,
1052 LLVMValueRef a)
1053 {
1054 const struct lp_type type = bld->type;
1055
1056 assert(type.floating);
1057 assert(lp_check_value(type, a));
1058
1059 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1060 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1061 else {
1062 LLVMTypeRef vec_type = lp_build_vec_type(type);
1063 LLVMValueRef res;
1064 res = lp_build_iround(bld, a);
1065 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1066 return res;
1067 }
1068 }
1069
1070
1071 /**
1072 * Return floor of float (vector), result is a float (vector)
1073 * Ex: floor(1.1) = 1.0
1074 * Ex: floor(-1.1) = -2.0
1075 */
1076 LLVMValueRef
1077 lp_build_floor(struct lp_build_context *bld,
1078 LLVMValueRef a)
1079 {
1080 const struct lp_type type = bld->type;
1081
1082 assert(type.floating);
1083 assert(lp_check_value(type, a));
1084
1085 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1086 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1087 else {
1088 LLVMTypeRef vec_type = lp_build_vec_type(type);
1089 LLVMValueRef res;
1090 res = lp_build_ifloor(bld, a);
1091 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1092 return res;
1093 }
1094 }
1095
1096
1097 /**
1098 * Return ceiling of float (vector), returning float (vector).
1099 * Ex: ceil( 1.1) = 2.0
1100 * Ex: ceil(-1.1) = -1.0
1101 */
1102 LLVMValueRef
1103 lp_build_ceil(struct lp_build_context *bld,
1104 LLVMValueRef a)
1105 {
1106 const struct lp_type type = bld->type;
1107
1108 assert(type.floating);
1109 assert(lp_check_value(type, a));
1110
1111 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1112 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1113 else {
1114 LLVMTypeRef vec_type = lp_build_vec_type(type);
1115 LLVMValueRef res;
1116 res = lp_build_iceil(bld, a);
1117 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1118 return res;
1119 }
1120 }
1121
1122
1123 /**
1124 * Return fractional part of 'a' computed as a - floor(a)
1125 * Typically used in texture coord arithmetic.
1126 */
1127 LLVMValueRef
1128 lp_build_fract(struct lp_build_context *bld,
1129 LLVMValueRef a)
1130 {
1131 assert(bld->type.floating);
1132 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1133 }
1134
1135
1136 /**
1137 * Return the integer part of a float (vector) value. The returned value is
1138 * an integer (vector).
1139 * Ex: itrunc(-1.5) = -1
1140 */
1141 LLVMValueRef
1142 lp_build_itrunc(struct lp_build_context *bld,
1143 LLVMValueRef a)
1144 {
1145 const struct lp_type type = bld->type;
1146 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1147
1148 assert(type.floating);
1149 assert(lp_check_value(type, a));
1150
1151 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1152 }
1153
1154
1155 /**
1156 * Return float (vector) rounded to nearest integer (vector). The returned
1157 * value is an integer (vector).
1158 * Ex: iround(0.9) = 1
1159 * Ex: iround(-1.5) = -2
1160 */
1161 LLVMValueRef
1162 lp_build_iround(struct lp_build_context *bld,
1163 LLVMValueRef a)
1164 {
1165 const struct lp_type type = bld->type;
1166 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1167 LLVMValueRef res;
1168
1169 assert(type.floating);
1170
1171 assert(lp_check_value(type, a));
1172
1173 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1174 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1175 }
1176 else {
1177 LLVMTypeRef vec_type = lp_build_vec_type(type);
1178 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1179 LLVMValueRef sign;
1180 LLVMValueRef half;
1181
1182 /* get sign bit */
1183 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1184 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1185
1186 /* sign * 0.5 */
1187 half = lp_build_const_vec(type, 0.5);
1188 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1189 half = LLVMBuildOr(bld->builder, sign, half, "");
1190 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1191
1192 res = LLVMBuildFAdd(bld->builder, a, half, "");
1193 }
1194
1195 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1196
1197 return res;
1198 }
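
/*
 * Worked example of the fallback path above: a = -1.5 gives half = -0.5
 * (0.5 with a's sign bit ORed in), a + half = -2.0, and the final FPToSI
 * truncation yields -2; a = 0.9 gives 0.9 + 0.5 = 1.4, which truncates
 * to 1.
 */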
1199
1200
1201 /**
1202 * Return floor of float (vector), result is an int (vector)
1203 * Ex: ifloor(1.1) = 1
1204 * Ex: ifloor(-1.1) = -2
1205 */
1206 LLVMValueRef
1207 lp_build_ifloor(struct lp_build_context *bld,
1208 LLVMValueRef a)
1209 {
1210 const struct lp_type type = bld->type;
1211 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1212 LLVMValueRef res;
1213
1214 assert(type.floating);
1215 assert(lp_check_value(type, a));
1216
1217 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1218 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1219 }
1220 else {
1221 /* Take the sign bit and add it to 1 constant */
1222 LLVMTypeRef vec_type = lp_build_vec_type(type);
1223 unsigned mantissa = lp_mantissa(type);
1224 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1225 LLVMValueRef sign;
1226 LLVMValueRef offset;
1227
1228 /* sign = a < 0 ? ~0 : 0 */
1229 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1230 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1231 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1232
1233 /* offset = -0.99999(9)f */
1234 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1235 offset = LLVMConstBitCast(offset, int_vec_type);
1236
1237 /* offset = a < 0 ? offset : 0.0f */
1238 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1239 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1240
1241 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1242 }
1243
1244 /* round to nearest (toward zero) */
1245 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1246
1247 return res;
1248 }
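
/*
 * Worked example of the offset trick above: a = -1.1 becomes
 * -1.1 + (-0.99999...) = -2.09999..., which truncates toward zero to -2.
 * Exact negative integers survive because the offset magnitude is strictly
 * less than one: -2.0 + (-0.99999...) = -2.99999... still truncates to -2.
 */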
1249
1250
1251 /**
1252 * Return ceiling of float (vector), returning int (vector).
1253 * Ex: iceil( 1.1) = 2
1254 * Ex: iceil(-1.1) = -1
1255 */
1256 LLVMValueRef
1257 lp_build_iceil(struct lp_build_context *bld,
1258 LLVMValueRef a)
1259 {
1260 const struct lp_type type = bld->type;
1261 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1262 LLVMValueRef res;
1263
1264 assert(type.floating);
1265 assert(lp_check_value(type, a));
1266
1267 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1268 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1269 }
1270 else {
1271 LLVMTypeRef vec_type = lp_build_vec_type(type);
1272 unsigned mantissa = lp_mantissa(type);
1273 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1274 LLVMValueRef sign;
1275 LLVMValueRef offset;
1276
1277 /* sign = a < 0 ? 0 : ~0 */
1278 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1279 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1280 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1281 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1282
1283 /* offset = 0.99999(9)f */
1284 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1285 offset = LLVMConstBitCast(offset, int_vec_type);
1286
1287 /* offset = a < 0 ? 0.0 : offset */
1288 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1289 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1290
1291 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1292 }
1293
1294 /* round to nearest (toward zero) */
1295 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1296
1297 return res;
1298 }
1299
1300
1301 LLVMValueRef
1302 lp_build_sqrt(struct lp_build_context *bld,
1303 LLVMValueRef a)
1304 {
1305 const struct lp_type type = bld->type;
1306 LLVMTypeRef vec_type = lp_build_vec_type(type);
1307 char intrinsic[32];
1308
1309 assert(lp_check_value(type, a));
1310
1311 /* TODO: optimize the constant case */
1313
1314 assert(type.floating);
1315 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1316
1317 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1318 }
1319
1320
1321 /**
1322 * Do one Newton-Raphson step to improve reciprocal precision:
1323 *
1324 * x_{i+1} = x_i * (2 - a * x_i)
1325 *
1326 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1327 * +/-Inf, giving NaN instead. Certain applications rely on the conformant
1328 * behavior, e.g. Google Earth does RCP(RSQRT(0.0)) when drawing the Earth's
1329 * halo. It would be necessary to clamp the argument to prevent this.
1330 *
1331 * See also:
1332 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1333 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1334 */
1335 static INLINE LLVMValueRef
1336 lp_build_rcp_refine(struct lp_build_context *bld,
1337 LLVMValueRef a,
1338 LLVMValueRef rcp_a)
1339 {
1340 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1341 LLVMValueRef res;
1342
1343 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1344 res = LLVMBuildFSub(bld->builder, two, res, "");
1345 res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
1346
1347 return res;
1348 }
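
/*
 * Numerical sketch of the convergence: for a = 3 with initial estimate
 * x0 = 0.3, one step gives 0.3 * (2 - 3*0.3) = 0.33 and a second gives
 * 0.33 * (2 - 3*0.33) = 0.3333; the relative error roughly squares with
 * each iteration.
 */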
1349
1350
1351 LLVMValueRef
1352 lp_build_rcp(struct lp_build_context *bld,
1353 LLVMValueRef a)
1354 {
1355 const struct lp_type type = bld->type;
1356
1357 assert(lp_check_value(type, a));
1358
1359 if(a == bld->zero)
1360 return bld->undef;
1361 if(a == bld->one)
1362 return bld->one;
1363 if(a == bld->undef)
1364 return bld->undef;
1365
1366 assert(type.floating);
1367
1368 if(LLVMIsConstant(a))
1369 return LLVMConstFDiv(bld->one, a);
1370
1371 /*
1372 * We don't use RCPPS because:
1373 * - it only has 10 bits of precision
1374 * - it doesn't even get the reciprocal of 1.0 exactly
1375 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1376 * - for recent processors the benefit over DIVPS is marginal, and case
1377 * dependent
1378 *
1379 * We could still use it on certain processors if benchmarks show that
1380 * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
1381 * particular uses that require fewer workarounds.
1382 */
1383
1384 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1385 const unsigned num_iterations = 0;
1386 LLVMValueRef res;
1387 unsigned i;
1388
1389 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1390
1391 for (i = 0; i < num_iterations; ++i) {
1392 res = lp_build_rcp_refine(bld, a, res);
1393 }
1394
1395 return res;
1396 }
1397
1398 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1399 }
1400
1401
1402 /**
1403 * Do one Newton-Raphson step to improve rsqrt precision:
1404 *
1405 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1406 *
1407 * See also:
1408 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1409 */
1410 static INLINE LLVMValueRef
1411 lp_build_rsqrt_refine(struct lp_build_context *bld,
1412 LLVMValueRef a,
1413 LLVMValueRef rsqrt_a)
1414 {
1415 LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
1416 LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
1417 LLVMValueRef res;
1418
1419 res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
1420 res = LLVMBuildFMul(bld->builder, a, res, "");
1421 res = LLVMBuildFSub(bld->builder, three, res, "");
1422 res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
1423 res = LLVMBuildFMul(bld->builder, half, res, "");
1424
1425 return res;
1426 }
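
/*
 * Numerical sketch: for a = 4 with initial estimate x0 = 0.6, one step
 * gives 0.5 * 0.6 * (3 - 4*0.36) = 0.468, and a second step gives about
 * 0.49699, converging quadratically towards the exact 1/sqrt(4) = 0.5.
 */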
1427
1428
1429 /**
1430 * Generate 1/sqrt(a)
1431 */
1432 LLVMValueRef
1433 lp_build_rsqrt(struct lp_build_context *bld,
1434 LLVMValueRef a)
1435 {
1436 const struct lp_type type = bld->type;
1437
1438 assert(lp_check_value(type, a));
1439
1440 assert(type.floating);
1441
1442 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1443 const unsigned num_iterations = 0;
1444 LLVMValueRef res;
1445 unsigned i;
1446
1447 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1448
1449 for (i = 0; i < num_iterations; ++i) {
1450 res = lp_build_rsqrt_refine(bld, a, res);
1451 }
1452
1453 return res;
1454 }
1455
1456 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1457 }
1458
1459
1460 static inline LLVMValueRef
1461 lp_build_const_v4si(unsigned long value)
1462 {
1463 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1464 LLVMValueRef elements[4] = { element, element, element, element };
1465 return LLVMConstVector(elements, 4);
1466 }
1467
1468 static inline LLVMValueRef
1469 lp_build_const_v4sf(float value)
1470 {
1471 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1472 LLVMValueRef elements[4] = { element, element, element, element };
1473 return LLVMConstVector(elements, 4);
1474 }
1475
1476
1477 /**
1478 * Generate sin(a) using SSE2
1479 */
1480 LLVMValueRef
1481 lp_build_sin(struct lp_build_context *bld,
1482 LLVMValueRef a)
1483 {
1484 struct lp_type int_type = lp_int_type(bld->type);
1485 LLVMBuilderRef b = bld->builder;
1486 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1487 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1488
1489 /*
1490 * take the absolute value,
1491 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1492 */
1493
1494 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1495 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1496
1497 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1498 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1499
1500 /*
1501 * extract the sign bit (upper one)
1502 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1503 */
1504 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1505 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1506
1507 /*
1508 * scale by 4/Pi
1509 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1510 */
1511
1512 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1513 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1514
1515 /*
1516 * store the integer part of y in mm0
1517 * emm2 = _mm_cvttps_epi32(y);
1518 */
1519
1520 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1521
1522 /*
1523 * j=(j+1) & (~1) (see the cephes sources)
1524 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1525 */
1526
1527 LLVMValueRef all_one = lp_build_const_v4si(1);
1528 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1529 /*
1530 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1531 */
1532 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1533 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1534
1535 /*
1536 * y = _mm_cvtepi32_ps(emm2);
1537 */
1538 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1539
1540 /* get the swap sign flag
1541 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1542 */
1543 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1544 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1545
1546 /*
1547 * emm2 = _mm_slli_epi32(emm0, 29);
1548 */
1549 LLVMValueRef const_29 = lp_build_const_v4si(29);
1550 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1551
1552 /*
1553 * get the polynomial selection mask
1554 * there is one polynomial for 0 <= x <= Pi/4
1555 * and another one for Pi/4 < x <= Pi/2
1556 * Both branches will be computed.
1557 *
1558 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1559 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1560 */
1561
1562 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1563 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1564 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1565 emm2_3, lp_build_const_v4si(0));
1566 /*
1567 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1568 */
1569 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1570
1571 /*
1572 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1573 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1574 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1575 */
1576 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1577 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1578 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1579
1580 /*
1581 * The magic pass: "Extended precision modular arithmetic"
1582 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1583 * xmm1 = _mm_mul_ps(y, xmm1);
1584 * xmm2 = _mm_mul_ps(y, xmm2);
1585 * xmm3 = _mm_mul_ps(y, xmm3);
1586 */
1587 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1588 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1589 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1590
1591 /*
1592 * x = _mm_add_ps(x, xmm1);
1593 * x = _mm_add_ps(x, xmm2);
1594 * x = _mm_add_ps(x, xmm3);
1595 */
1596
1597 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1598 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1599 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1600
1601 /*
1602 * Evaluate the first polynomial (0 <= x <= Pi/4)
1603 *
1604 * z = _mm_mul_ps(x,x);
1605 */
1606 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1607
1608 /*
1609 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1610 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1611 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1612 */
1613 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1614 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1615 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1616
1617 /*
1618 * y = *(v4sf*)_ps_coscof_p0;
1619 * y = _mm_mul_ps(y, z);
1620 */
1621 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1622 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1623 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1624 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1625 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1626 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1627
1628
1629 /*
1630 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1631 * y = _mm_sub_ps(y, tmp);
1632 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1633 */
1634 LLVMValueRef half = lp_build_const_v4sf(0.5);
1635 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1636 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1637 LLVMValueRef one = lp_build_const_v4sf(1.0);
1638 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1639
1640 /*
1641 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1642 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1643 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1644 */
1645 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1646 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1647 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1648
1649 /*
1650 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1651 *
1652 * y2 = *(v4sf*)_ps_sincof_p0;
1653 * y2 = _mm_mul_ps(y2, z);
1654 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1655 * y2 = _mm_mul_ps(y2, z);
1656 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1657 * y2 = _mm_mul_ps(y2, z);
1658 * y2 = _mm_mul_ps(y2, x);
1659 * y2 = _mm_add_ps(y2, x);
1660 */
1661
1662 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1663 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1664 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1665 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1666 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1667 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1668 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1669
1670 /*
1671 * select the correct result from the two polynomials
1672 * xmm3 = poly_mask;
1673 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1674 * y = _mm_andnot_ps(xmm3, y);
1675 * y = _mm_add_ps(y,y2);
1676 */
1677 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1678 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1679 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1680 LLVMValueRef inv = lp_build_const_v4si(~0);
1681 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1682 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1683 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1684
1685 /*
1686 * update the sign
1687 * y = _mm_xor_ps(y, sign_bit);
1688 */
1689 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1690 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1691 return y_result;
1692 }
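
/*
 * Worked example of the range reduction above (illustrative): a = 3*Pi/4
 * scales to |a|*4/Pi = 3, so j = (3 + 1) & ~1 = 4; j & 4 sets the
 * swap-sign flag, the extended-precision pass leaves x = |a| - 4*Pi/4
 * ~= -Pi/4, j & 2 == 0 selects the sine polynomial (~= -0.7071), and the
 * sign flip yields +0.7071 = sin(3*Pi/4).
 */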
1693
1694
1695 /**
1696 * Generate cos(a) using SSE2
1697 */
1698 LLVMValueRef
1699 lp_build_cos(struct lp_build_context *bld,
1700 LLVMValueRef a)
1701 {
1702 struct lp_type int_type = lp_int_type(bld->type);
1703 LLVMBuilderRef b = bld->builder;
1704 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1705 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1706
1707 /*
1708 * take the absolute value,
1709 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1710 */
1711
1712 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1713 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1714
1715 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1716 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1717
1718 /*
1719 * scale by 4/Pi
1720 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1721 */
1722
1723 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1724 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1725
1726 /*
1727 * store the integer part of y in mm0
1728 * emm2 = _mm_cvttps_epi32(y);
1729 */
1730
1731 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1732
1733 /*
1734 * j=(j+1) & (~1) (see the cephes sources)
1735 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1736 */
1737
1738 LLVMValueRef all_one = lp_build_const_v4si(1);
1739 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1740 /*
1741 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1742 */
1743 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1744 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1745
1746 /*
1747 * y = _mm_cvtepi32_ps(emm2);
1748 */
1749 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1750
1751
1752 /*
1753 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1754 */
1755 LLVMValueRef const_2 = lp_build_const_v4si(2);
1756 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1757
1758
1759 /* get the swap sign flag
1760 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1761 */
1762 LLVMValueRef inv = lp_build_const_v4si(~0);
1763 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1764 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1765 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1766
1767 /*
1768 * emm2 = _mm_slli_epi32(emm0, 29);
1769 */
1770 LLVMValueRef const_29 = lp_build_const_v4si(29);
1771 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1772
1773 /*
1774 * get the polynomial selection mask
1775 * there is one polynomial for 0 <= x <= Pi/4
1776 * and another one for Pi/4 < x <= Pi/2
1777 * Both branches will be computed.
1778 *
1779 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1780 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1781 */
1782
1783 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1784 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1785 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1786 emm2_3, lp_build_const_v4si(0));
1787
1788 /*
1789 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1790 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1791 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1792 */
1793 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1794 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1795 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1796
1797 /*
1798 * The magic pass: "Extended precision modular arithmetic"
1799 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1800 * xmm1 = _mm_mul_ps(y, xmm1);
1801 * xmm2 = _mm_mul_ps(y, xmm2);
1802 * xmm3 = _mm_mul_ps(y, xmm3);
1803 */
1804 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1805 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1806 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1807
1808 /*
1809 * x = _mm_add_ps(x, xmm1);
1810 * x = _mm_add_ps(x, xmm2);
1811 * x = _mm_add_ps(x, xmm3);
1812 */
1813
1814 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1815 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1816 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1817
1818 /*
1819 * Evaluate the first polynomial (0 <= x <= Pi/4)
1820 *
1821 * z = _mm_mul_ps(x,x);
1822 */
1823 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1824
1825 /*
1826 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1827 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1828 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1829 */
1830 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1831 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1832 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1833
1834 /*
1835 * y = *(v4sf*)_ps_coscof_p0;
1836 * y = _mm_mul_ps(y, z);
1837 */
1838 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1839 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1840 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1841 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1842 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1843 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1844
1845
1846 /*
1847 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1848 * y = _mm_sub_ps(y, tmp);
1849 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1850 */
1851 LLVMValueRef half = lp_build_const_v4sf(0.5);
1852 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1853 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1854 LLVMValueRef one = lp_build_const_v4sf(1.0);
1855 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1856
1857 /*
1858 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1859 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1860 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1861 */
1862 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1863 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1864 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1865
1866 /*
1867 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1868 *
1869 * y2 = *(v4sf*)_ps_sincof_p0;
1870 * y2 = _mm_mul_ps(y2, z);
1871 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1872 * y2 = _mm_mul_ps(y2, z);
1873 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1874 * y2 = _mm_mul_ps(y2, z);
1875 * y2 = _mm_mul_ps(y2, x);
1876 * y2 = _mm_add_ps(y2, x);
1877 */
1878
1879 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1880 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1881 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1882 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1883 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1884 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1885 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1886
1887 /*
1888 * select the correct result from the two polynomials
1889 * xmm3 = poly_mask;
1890 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1891 * y = _mm_andnot_ps(xmm3, y);
1892 * y = _mm_add_ps(y,y2);
1893 */
1894 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1895 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1896 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1897 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1898 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1899 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
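/*
* Since poly_mask is all-ones or all-zeroes in each lane, the
* and/andnot/add sequence above is just a branchless select; a scalar
* sketch of what each lane computes:
*
*   y = poly_mask ? y2 : y;
*/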
1900
1901 /*
1902 * update the sign
1903 * y = _mm_xor_ps(y, sign_bit);
1904 */
1905 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
1906 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1907 return y_result;
1908 }
1909
1910
1911 /**
1912 * Generate pow(x, y)
1913 */
1914 LLVMValueRef
1915 lp_build_pow(struct lp_build_context *bld,
1916 LLVMValueRef x,
1917 LLVMValueRef y)
1918 {
1919 /* TODO: optimize the constant case */
1920 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
1921 LLVMIsConstant(x) && LLVMIsConstant(y)) {
1922 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1923 __FUNCTION__);
1924 }
1925
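/*
* pow(x, y) = exp2(y * log2(x)), which is only defined for x > 0.
* For example pow(2, 10) = exp2(10 * log2(2)) = exp2(10) = 1024.
*/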
1926 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1927 }
1928
1929
1930 /**
1931 * Generate exp(x)
1932 */
1933 LLVMValueRef
1934 lp_build_exp(struct lp_build_context *bld,
1935 LLVMValueRef x)
1936 {
1937 /* log2(e) = 1/log(2) */
1938 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1939
1940 assert(lp_check_value(bld->type, x));
1941
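/*
* exp(x) = exp2(x * log2(e)); e.g. exp(1) = exp2(1.4426950...) ~= 2.7182818.
*/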
1942 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1943 }
1944
1945
1946 /**
1947 * Generate log(x)
1948 */
1949 LLVMValueRef
1950 lp_build_log(struct lp_build_context *bld,
1951 LLVMValueRef x)
1952 {
1953 /* log(2) */
1954 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1955
1956 assert(lp_check_value(bld->type, x));
1957
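/*
* log(x) = log2(x) * log(2); e.g. log(8) = 3 * 0.6931472 ~= 2.0794415.
*/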
1958 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1959 }
1960
1961
1962 /**
1963 * Generate polynomial.
1964 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1965 */
1966 static LLVMValueRef
1967 lp_build_polynomial(struct lp_build_context *bld,
1968 LLVMValueRef x,
1969 const double *coeffs,
1970 unsigned num_coeffs)
1971 {
1972 const struct lp_type type = bld->type;
1973 LLVMValueRef res = NULL;
1974 unsigned i;
1975
1976 assert(lp_check_value(bld->type, x));
1977
1978 /* TODO: optimize the constant case */
1979 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
1980 LLVMIsConstant(x)) {
1981 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1982 __FUNCTION__);
1983 }
1984
1985 for (i = num_coeffs; i--; ) {
1986 LLVMValueRef coeff;
1987
1988 coeff = lp_build_const_vec(type, coeffs[i]);
1989
1990 if(res)
1991 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1992 else
1993 res = coeff;
1994 }
1995
1996 if(res)
1997 return res;
1998 else
1999 return bld->undef;
2000 }
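/*
* The loop above builds the Horner form of the polynomial. For
* num_coeffs == 3 the result expands to (a worked example, not
* generated code):
*
*   res = coeffs[0] + x*(coeffs[1] + x*coeffs[2])
*       = coeffs[0] + x*coeffs[1] + x^2*coeffs[2]
*/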
2001
2002
2003 /**
2004 * Minimax polynomial fit of 2**x, in range [0, 1[
2005 */
2006 const double lp_build_exp2_polynomial[] = {
2007 #if EXP_POLY_DEGREE == 5
2008 0.999999999690134838155,
2009 0.583974334321735217258,
2010 0.164553105719676828492,
2011 0.0292811063701710962255,
2012 0.00354944426657875141846,
2013 0.000296253726543423377365
2014 #elif EXP_POLY_DEGREE == 4
2015 1.00000001502262084505,
2016 0.563586057338685991394,
2017 0.150436017652442413623,
2018 0.0243220604213317927308,
2019 0.0025359088446580436489
2020 #elif EXP_POLY_DEGREE == 3
2021 0.999925218562710312959,
2022 0.695833540494823811697,
2023 0.226067155427249155588,
2024 0.0780245226406372992967
2025 #elif EXP_POLY_DEGREE == 2
2026 1.00172476321474503578,
2027 0.657636275736077639316,
2028 0.33718943461968720704
2029 #else
2030 #error
2031 #endif
2032 };
2033
2034
2035 void
2036 lp_build_exp2_approx(struct lp_build_context *bld,
2037 LLVMValueRef x,
2038 LLVMValueRef *p_exp2_int_part,
2039 LLVMValueRef *p_frac_part,
2040 LLVMValueRef *p_exp2)
2041 {
2042 const struct lp_type type = bld->type;
2043 LLVMTypeRef vec_type = lp_build_vec_type(type);
2044 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2045 LLVMValueRef ipart = NULL;
2046 LLVMValueRef fpart = NULL;
2047 LLVMValueRef expipart = NULL;
2048 LLVMValueRef expfpart = NULL;
2049 LLVMValueRef res = NULL;
2050
2051 assert(lp_check_value(bld->type, x));
2052
2053 if(p_exp2_int_part || p_frac_part || p_exp2) {
2054 /* TODO: optimize the constant case */
2055 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2056 LLVMIsConstant(x)) {
2057 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2058 __FUNCTION__);
2059 }
2060
2061 assert(type.floating && type.width == 32);
2062
2063 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
2064 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
2065
2066 /* ipart = floor(x) */
2067 ipart = lp_build_floor(bld, x);
2068
2069 /* fpart = x - ipart */
2070 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
2071 }
2072
2073 if(p_exp2_int_part || p_exp2) {
2074 /* expipart = (float) (1 << ipart) */
2075 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
2076 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
2077 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
2078 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
2079 }
2080
2081 if(p_exp2) {
2082 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2083 Elements(lp_build_exp2_polynomial));
2084
2085 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2086 }
2087
2088 if(p_exp2_int_part)
2089 *p_exp2_int_part = expipart;
2090
2091 if(p_frac_part)
2092 *p_frac_part = fpart;
2093
2094 if(p_exp2)
2095 *p_exp2 = res;
2096 }
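/*
* Scalar sketch of the approximation above (illustration only; poly()
* stands for lp_build_polynomial over lp_build_exp2_polynomial, and
* bits_to_float() for the bitcast):
*
*   x = CLAMP(x, -126.99999, 129.0);
*   ipart = floor(x);
*   fpart = x - ipart;                                  /+ in [0, 1[ +/
*   expipart = bits_to_float(((int)ipart + 127) << 23); /+ == 2^ipart +/
*   exp2(x) ~= expipart * poly(fpart);                  /+ 2^ipart * 2^fpart +/
*/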
2097
2098
2099 LLVMValueRef
2100 lp_build_exp2(struct lp_build_context *bld,
2101 LLVMValueRef x)
2102 {
2103 LLVMValueRef res;
2104 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2105 return res;
2106 }
2107
2108
2109 /**
2110 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2111 * These coefficients can be generated with
2112 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2113 */
2114 const double lp_build_log2_polynomial[] = {
2115 #if LOG_POLY_DEGREE == 6
2116 3.11578814719469302614,
2117 -3.32419399085241980044,
2118 2.59883907202499966007,
2119 -1.23152682416275988241,
2120 0.318212422185251071475,
2121 -0.0344359067839062357313
2122 #elif LOG_POLY_DEGREE == 5
2123 2.8882704548164776201,
2124 -2.52074962577807006663,
2125 1.48116647521213171641,
2126 -0.465725644288844778798,
2127 0.0596515482674574969533
2128 #elif LOG_POLY_DEGREE == 4
2129 2.61761038894603480148,
2130 -1.75647175389045657003,
2131 0.688243882994381274313,
2132 -0.107254423828329604454
2133 #elif LOG_POLY_DEGREE == 3
2134 2.28330284476918490682,
2135 -1.04913055217340124191,
2136 0.204446009836232697516
2137 #else
2138 #error
2139 #endif
2140 };
2141
2142
2143 /**
2144 * See http://www.devmaster.net/forums/showthread.php?p=43580
2145 */
2146 void
2147 lp_build_log2_approx(struct lp_build_context *bld,
2148 LLVMValueRef x,
2149 LLVMValueRef *p_exp,
2150 LLVMValueRef *p_floor_log2,
2151 LLVMValueRef *p_log2)
2152 {
2153 const struct lp_type type = bld->type;
2154 LLVMTypeRef vec_type = lp_build_vec_type(type);
2155 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2156
2157 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2158 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2159 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2160
2161 LLVMValueRef i = NULL;
2162 LLVMValueRef exp = NULL;
2163 LLVMValueRef mant = NULL;
2164 LLVMValueRef logexp = NULL;
2165 LLVMValueRef logmant = NULL;
2166 LLVMValueRef res = NULL;
2167
2168 assert(lp_check_value(bld->type, x));
2169
2170 if(p_exp || p_floor_log2 || p_log2) {
2171 /* TODO: optimize the constant case */
2172 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2173 LLVMIsConstant(x)) {
2174 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2175 __FUNCTION__);
2176 }
2177
2178 assert(type.floating && type.width == 32);
2179
2180 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2181
2182 /* extract the exponent field of x */
2183 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2184 }
2185
2186 if(p_floor_log2 || p_log2) {
2187 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2188 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2189 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2190 }
2191
2192 if(p_log2) {
2193 /* mant = (float) mantissa(x) */
2194 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2195 mant = LLVMBuildOr(bld->builder, mant, one, "");
2196 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2197
2198 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2199 Elements(lp_build_log2_polynomial));
2200
2201 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2202 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2203
2204 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2205 }
2206
2207 if(p_exp) {
2208 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2209 *p_exp = exp;
2210 }
2211
2212 if(p_floor_log2)
2213 *p_floor_log2 = logexp;
2214
2215 if(p_log2)
2216 *p_log2 = res;
2217 }
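/*
* Scalar sketch of the above (illustration only; poly() stands for
* lp_build_polynomial over lp_build_log2_polynomial, float_to_bits()
* and bits_to_float() for the bitcasts):
*
*   i       = float_to_bits(x);
*   logexp  = (float)(((i & 0x7f800000) >> 23) - 127);  /+ floor(log2(x)) +/
*   mant    = bits_to_float((i & 0x007fffff) | float_to_bits(1.0));  /+ in [1, 2[ +/
*   log2(x) ~= logexp + poly(mant) * (mant - 1.0);
*/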
2218
2219
2220 LLVMValueRef
2221 lp_build_log2(struct lp_build_context *bld,
2222 LLVMValueRef x)
2223 {
2224 LLVMValueRef res;
2225 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2226 return res;
2227 }