gallivm: Use a faster (and less accurate) log2 in lod computation.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_debug.h"
60 #include "lp_bld_arit.h"
61
62
63 #define EXP_POLY_DEGREE 3
64
65 #define LOG_POLY_DEGREE 5
66
67
68 /**
69 * Generate min(a, b)
70 * No checks for special case values of a or b = 1 or 0 are done.
71 */
72 static LLVMValueRef
73 lp_build_min_simple(struct lp_build_context *bld,
74 LLVMValueRef a,
75 LLVMValueRef b)
76 {
77 const struct lp_type type = bld->type;
78 const char *intrinsic = NULL;
79 LLVMValueRef cond;
80
81 assert(lp_check_value(type, a));
82 assert(lp_check_value(type, b));
83
84 /* TODO: optimize the constant case */
85
86 if(type.width * type.length == 128) {
87 if(type.floating) {
88 if(type.width == 32 && util_cpu_caps.has_sse)
89 intrinsic = "llvm.x86.sse.min.ps";
90 if(type.width == 64 && util_cpu_caps.has_sse2)
91 intrinsic = "llvm.x86.sse2.min.pd";
92 }
93 else {
94 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
95 intrinsic = "llvm.x86.sse2.pminu.b";
96 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
97 intrinsic = "llvm.x86.sse41.pminsb";
98 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
99 intrinsic = "llvm.x86.sse41.pminuw";
100 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
101 intrinsic = "llvm.x86.sse2.pmins.w";
102 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
103 intrinsic = "llvm.x86.sse41.pminud";
104 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
105 intrinsic = "llvm.x86.sse41.pminsd";
106 }
107 }
108
109 if(intrinsic)
110 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
111
112 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
113 return lp_build_select(bld, cond, a, b);
114 }
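
/*
 * Note on NaNs in the function above: the SSE MIN/MAX instructions return
 * the second source operand when either input is a NaN, and the
 * compare/select fallback matches that behavior, since an unordered
 * comparison yields false and the select then picks b. Callers should not
 * rely on a particular operand surviving when NaNs are involved.
 */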
115
116
117 /**
118 * Generate max(a, b)
119 * No checks for special case values of a or b = 1 or 0 are done.
120 */
121 static LLVMValueRef
122 lp_build_max_simple(struct lp_build_context *bld,
123 LLVMValueRef a,
124 LLVMValueRef b)
125 {
126 const struct lp_type type = bld->type;
127 const char *intrinsic = NULL;
128 LLVMValueRef cond;
129
130 assert(lp_check_value(type, a));
131 assert(lp_check_value(type, b));
132
133 /* TODO: optimize the constant case */
134
135 if(type.width * type.length == 128) {
136 if(type.floating) {
137 if(type.width == 32 && util_cpu_caps.has_sse)
138 intrinsic = "llvm.x86.sse.max.ps";
139 if(type.width == 64 && util_cpu_caps.has_sse2)
140 intrinsic = "llvm.x86.sse2.max.pd";
141 }
142 else {
143 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
144 intrinsic = "llvm.x86.sse2.pmaxu.b";
145 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
146 intrinsic = "llvm.x86.sse41.pmaxsb";
147 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
148 intrinsic = "llvm.x86.sse41.pmaxuw";
149 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
150 intrinsic = "llvm.x86.sse2.pmaxs.w";
151 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
152 intrinsic = "llvm.x86.sse41.pmaxud";
153 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
154 intrinsic = "llvm.x86.sse41.pmaxsd";
155 }
156 }
157
158 if(intrinsic)
159 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
160
161 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
162 return lp_build_select(bld, cond, a, b);
163 }
164
165
166 /**
167 * Generate 1 - a, or ~a depending on bld->type.
168 */
169 LLVMValueRef
170 lp_build_comp(struct lp_build_context *bld,
171 LLVMValueRef a)
172 {
173 const struct lp_type type = bld->type;
174
175 assert(lp_check_value(type, a));
176
177 if(a == bld->one)
178 return bld->zero;
179 if(a == bld->zero)
180 return bld->one;
181
182 if(type.norm && !type.floating && !type.fixed && !type.sign) {
183 if(LLVMIsConstant(a))
184 return LLVMConstNot(a);
185 else
186 return LLVMBuildNot(bld->builder, a, "");
187 }
188
189 if(LLVMIsConstant(a))
190 if (type.floating)
191 return LLVMConstFSub(bld->one, a);
192 else
193 return LLVMConstSub(bld->one, a);
194 else
195 if (type.floating)
196 return LLVMBuildFSub(bld->builder, bld->one, a, "");
197 else
198 return LLVMBuildSub(bld->builder, bld->one, a, "");
199 }
200
201
202 /**
203 * Generate a + b
204 */
205 LLVMValueRef
206 lp_build_add(struct lp_build_context *bld,
207 LLVMValueRef a,
208 LLVMValueRef b)
209 {
210 const struct lp_type type = bld->type;
211 LLVMValueRef res;
212
213 assert(lp_check_value(type, a));
214 assert(lp_check_value(type, b));
215
216 if(a == bld->zero)
217 return b;
218 if(b == bld->zero)
219 return a;
220 if(a == bld->undef || b == bld->undef)
221 return bld->undef;
222
223 if(bld->type.norm) {
224 const char *intrinsic = NULL;
225
226 if(a == bld->one || b == bld->one)
227 return bld->one;
228
229 if(util_cpu_caps.has_sse2 &&
230 type.width * type.length == 128 &&
231 !type.floating && !type.fixed) {
232 if(type.width == 8)
233 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
234 if(type.width == 16)
235 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
236 }
237
238 if(intrinsic)
239 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
240 }
241
242 if(LLVMIsConstant(a) && LLVMIsConstant(b))
243 if (type.floating)
244 res = LLVMConstFAdd(a, b);
245 else
246 res = LLVMConstAdd(a, b);
247 else
248 if (type.floating)
249 res = LLVMBuildFAdd(bld->builder, a, b, "");
250 else
251 res = LLVMBuildAdd(bld->builder, a, b, "");
252
253 /* clamp to ceiling of 1.0 */
254 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
255 res = lp_build_min_simple(bld, res, bld->one);
256
257 /* XXX clamp to floor of -1 or 0??? */
258
259 return res;
260 }
261
262
263 /** Return the scalar sum of the elements of a */
264 LLVMValueRef
265 lp_build_sum_vector(struct lp_build_context *bld,
266 LLVMValueRef a)
267 {
268 const struct lp_type type = bld->type;
269 LLVMValueRef index, res;
270 unsigned i;
271
272 assert(lp_check_value(type, a));
273
274 if (type.length == 1) {
275 return a;
276 }
277
278 assert(!bld->type.norm);
279
280 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
281 res = LLVMBuildExtractElement(bld->builder, a, index, "");
282
283 for (i = 1; i < type.length; i++) {
284 index = LLVMConstInt(LLVMInt32Type(), i, 0);
285 if (type.floating)
286 res = LLVMBuildFAdd(bld->builder, res,
287 LLVMBuildExtractElement(bld->builder,
288 a, index, ""),
289 "");
290 else
291 res = LLVMBuildAdd(bld->builder, res,
292 LLVMBuildExtractElement(bld->builder,
293 a, index, ""),
294 "");
295 }
296
297 return res;
298 }
299
300
301 /**
302 * Generate a - b
303 */
304 LLVMValueRef
305 lp_build_sub(struct lp_build_context *bld,
306 LLVMValueRef a,
307 LLVMValueRef b)
308 {
309 const struct lp_type type = bld->type;
310 LLVMValueRef res;
311
312 assert(lp_check_value(type, a));
313 assert(lp_check_value(type, b));
314
315 if(b == bld->zero)
316 return a;
317 if(a == bld->undef || b == bld->undef)
318 return bld->undef;
319 if(a == b)
320 return bld->zero;
321
322 if(bld->type.norm) {
323 const char *intrinsic = NULL;
324
325 if(b == bld->one)
326 return bld->zero;
327
328 if(util_cpu_caps.has_sse2 &&
329 type.width * type.length == 128 &&
330 !type.floating && !type.fixed) {
331 if(type.width == 8)
332 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
333 if(type.width == 16)
334 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
335 }
336
337 if(intrinsic)
338 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
339 }
340
341 if(LLVMIsConstant(a) && LLVMIsConstant(b))
342 if (type.floating)
343 res = LLVMConstFSub(a, b);
344 else
345 res = LLVMConstSub(a, b);
346 else
347 if (type.floating)
348 res = LLVMBuildFSub(bld->builder, a, b, "");
349 else
350 res = LLVMBuildSub(bld->builder, a, b, "");
351
352 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
353 res = lp_build_max_simple(bld, res, bld->zero);
354
355 return res;
356 }
357
358
359 /**
360 * Normalized 8bit multiplication.
361 *
362 * - alpha plus one
363 *
364 * makes the following approximation to the division (Sree)
365 *
366 * a*b/255 ~= (a*(b + 1)) >> 8
367 *
368 * which is the fastest method that satisfies the following OpenGL criteria
369 *
370 * 0*0 = 0 and 255*255 = 255
371 *
372 * - geometric series
373 *
374 * takes the geometric series approximation to the division
375 *
376 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
377 *
378 * in this case just the first two terms to fit in 16bit arithmetic
379 *
380 * t/255 ~= (t + (t >> 8)) >> 8
381 *
382 * note that just by itself it doesn't satisfy the OpenGL criteria, as it
383 * yields 255*255 = 254, so the special case b = 255 must be accounted for,
384 * or rounding must be used
385 *
386 * - geometric series plus rounding
387 *
388 * when using a geometric series division instead of truncating the result
389 * use roundoff in the approximation (Jim Blinn)
390 *
391 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
392 *
393 * achieving exact results
394 *
395 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
396 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
397 * @sa Michael Herf, The "double blend trick", May 2000,
398 * http://www.stereopsis.com/doubleblend.html
399 */
400 static LLVMValueRef
401 lp_build_mul_u8n(LLVMBuilderRef builder,
402 struct lp_type i16_type,
403 LLVMValueRef a, LLVMValueRef b)
404 {
405 LLVMValueRef c8;
406 LLVMValueRef ab;
407
408 assert(!i16_type.floating);
409 assert(lp_check_value(i16_type, a));
410 assert(lp_check_value(i16_type, b));
411
412 c8 = lp_build_const_int_vec(i16_type, 8);
413
414 #if 0
415
416 /* a*b/255 ~= (a*(b + 1)) >> 8 */
417 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
418 ab = LLVMBuildMul(builder, a, b, "");
419
420 #else
421
422 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
423 ab = LLVMBuildMul(builder, a, b, "");
424 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
425 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
426
427 #endif
428
429 ab = LLVMBuildLShr(builder, ab, c8, "");
430
431 return ab;
432 }
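
/*
 * Worked example of the rounding variant above: for a = b = 255,
 * ab = 65025; 65025 + (65025 >> 8) + 0x80 = 65025 + 254 + 128 = 65407,
 * and 65407 >> 8 = 255, as required. A scalar sketch of what the generated
 * IR computes (hypothetical reference helper, not used by this file):
 */
#if 0
static uint8_t
mul_unorm8_ref(uint8_t a, uint8_t b)
{
   uint32_t t = (uint32_t)a * b;
   /* matches round(t / 255.0) for all a, b in [0, 255] */
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
}
#endif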
433
434
435 /**
436 * Generate a * b
437 */
438 LLVMValueRef
439 lp_build_mul(struct lp_build_context *bld,
440 LLVMValueRef a,
441 LLVMValueRef b)
442 {
443 const struct lp_type type = bld->type;
444 LLVMValueRef shift;
445 LLVMValueRef res;
446
447 assert(lp_check_value(type, a));
448 assert(lp_check_value(type, b));
449
450 if(a == bld->zero)
451 return bld->zero;
452 if(a == bld->one)
453 return b;
454 if(b == bld->zero)
455 return bld->zero;
456 if(b == bld->one)
457 return a;
458 if(a == bld->undef || b == bld->undef)
459 return bld->undef;
460
461 if(!type.floating && !type.fixed && type.norm) {
462 if(type.width == 8) {
463 struct lp_type i16_type = lp_wider_type(type);
464 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
465
466 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
467 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
468
469 /* PMULLW, PSRLW, PADDW */
470 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
471 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
472
473 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
474
475 return ab;
476 }
477
478 /* FIXME */
479 assert(0);
480 }
481
482 if(type.fixed)
483 shift = lp_build_const_int_vec(type, type.width/2);
484 else
485 shift = NULL;
486
487 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
488 if (type.floating)
489 res = LLVMConstFMul(a, b);
490 else
491 res = LLVMConstMul(a, b);
492 if(shift) {
493 if(type.sign)
494 res = LLVMConstAShr(res, shift);
495 else
496 res = LLVMConstLShr(res, shift);
497 }
498 }
499 else {
500 if (type.floating)
501 res = LLVMBuildFMul(bld->builder, a, b, "");
502 else
503 res = LLVMBuildMul(bld->builder, a, b, "");
504 if(shift) {
505 if(type.sign)
506 res = LLVMBuildAShr(bld->builder, res, shift, "");
507 else
508 res = LLVMBuildLShr(bld->builder, res, shift, "");
509 }
510 }
511
512 return res;
513 }
514
515
516 /**
517 * Small vector x scale multiplication optimization.
518 */
519 LLVMValueRef
520 lp_build_mul_imm(struct lp_build_context *bld,
521 LLVMValueRef a,
522 int b)
523 {
524 LLVMValueRef factor;
525
526 assert(lp_check_value(bld->type, a));
527
528 if(b == 0)
529 return bld->zero;
530
531 if(b == 1)
532 return a;
533
534 if(b == -1)
535 return lp_build_negate(bld, a);
536
537 if(b == 2 && bld->type.floating)
538 return lp_build_add(bld, a, a);
539
540 if(util_is_power_of_two(b)) {
541 unsigned shift = ffs(b) - 1;
542
543 if(bld->type.floating) {
544 #if 0
545 /*
546 * Power of two multiplication by directly manipulating the mantissa.
547 *
548 * XXX: This might not be always faster, it will introduce a small error
549 * for multiplication by zero, and it will produce wrong results
550 * for Inf and NaN.
551 */
552 unsigned mantissa = lp_mantissa(bld->type);
553 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
554 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
555 a = LLVMBuildAdd(bld->builder, a, factor, "");
556 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
557 return a;
558 #endif
559 }
560 else {
561 factor = lp_build_const_vec(bld->type, shift);
562 return LLVMBuildShl(bld->builder, a, factor, "");
563 }
564 }
565
566 factor = lp_build_const_vec(bld->type, (double)b);
567 return lp_build_mul(bld, a, factor);
568 }
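
/*
 * Aside on the disabled mantissa trick above: for a normal finite float,
 * adding (shift << mantissa_bits) to the raw bit pattern increments the
 * biased exponent by 'shift', i.e. multiplies the value by 2^shift. E.g.
 * for 32-bit floats (23 mantissa bits), 1.5f is 0x3FC00000; adding
 * (2 << 23) gives 0x40C00000, which is 6.0f = 1.5f * 4. As the XXX note
 * says, this misbehaves for 0.0, Inf and NaN, which is why it is kept
 * behind #if 0.
 */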
569
570
571 /**
572 * Generate a / b
573 */
574 LLVMValueRef
575 lp_build_div(struct lp_build_context *bld,
576 LLVMValueRef a,
577 LLVMValueRef b)
578 {
579 const struct lp_type type = bld->type;
580
581 assert(lp_check_value(type, a));
582 assert(lp_check_value(type, b));
583
584 if(a == bld->zero)
585 return bld->zero;
586 if(a == bld->one)
587 return lp_build_rcp(bld, b);
588 if(b == bld->zero)
589 return bld->undef;
590 if(b == bld->one)
591 return a;
592 if(a == bld->undef || b == bld->undef)
593 return bld->undef;
594
595 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
596 if (type.floating)
597 return LLVMConstFDiv(a, b);
598 else if (type.sign)
599 return LLVMConstSDiv(a, b);
600 else
601 return LLVMConstUDiv(a, b);
602 }
603
604 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
605 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
606
607 if (type.floating)
608 return LLVMBuildFDiv(bld->builder, a, b, "");
609 else if (type.sign)
610 return LLVMBuildSDiv(bld->builder, a, b, "");
611 else
612 return LLVMBuildUDiv(bld->builder, a, b, "");
613 }
614
615
616 /**
617 * Linear interpolation -- without any checks.
618 *
619 * @sa http://www.stereopsis.com/doubleblend.html
620 */
621 static INLINE LLVMValueRef
622 lp_build_lerp_simple(struct lp_build_context *bld,
623 LLVMValueRef x,
624 LLVMValueRef v0,
625 LLVMValueRef v1)
626 {
627 LLVMValueRef delta;
628 LLVMValueRef res;
629
630 assert(lp_check_value(bld->type, x));
631 assert(lp_check_value(bld->type, v0));
632 assert(lp_check_value(bld->type, v1));
633
634 delta = lp_build_sub(bld, v1, v0);
635
636 res = lp_build_mul(bld, x, delta);
637
638 res = lp_build_add(bld, v0, res);
639
640 if (bld->type.fixed) {
641 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
642 * but it will be wrong for other uses. Basically we need a more
643 * powerful lp_type, capable of further distinguishing the values
644 * interpretation from the value storage. */
645 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
646 }
647
648 return res;
649 }
650
651
652 /**
653 * Linear interpolation.
654 */
655 LLVMValueRef
656 lp_build_lerp(struct lp_build_context *bld,
657 LLVMValueRef x,
658 LLVMValueRef v0,
659 LLVMValueRef v1)
660 {
661 const struct lp_type type = bld->type;
662 LLVMValueRef res;
663
664 assert(lp_check_value(type, x));
665 assert(lp_check_value(type, v0));
666 assert(lp_check_value(type, v1));
667
668 if (type.norm) {
669 struct lp_type wide_type;
670 struct lp_build_context wide_bld;
671 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
672 LLVMValueRef shift;
673
674 assert(type.length >= 2);
675 assert(!type.sign);
676
677 /*
678 * Create a wider type, enough to hold the intermediate result of the
679 * multiplication.
680 */
681 memset(&wide_type, 0, sizeof wide_type);
682 wide_type.fixed = TRUE;
683 wide_type.width = type.width*2;
684 wide_type.length = type.length/2;
685
686 lp_build_context_init(&wide_bld, bld->builder, wide_type);
687
688 lp_build_unpack2(bld->builder, type, wide_type, x, &xl, &xh);
689 lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h);
690 lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h);
691
692 /*
693 * Scale x from [0, 255] to [0, 256]
694 */
695
696 shift = lp_build_const_int_vec(wide_type, type.width - 1);
697
698 xl = lp_build_add(&wide_bld, xl,
699 LLVMBuildAShr(bld->builder, xl, shift, ""));
700 xh = lp_build_add(&wide_bld, xh,
701 LLVMBuildAShr(bld->builder, xh, shift, ""));
702
703 /*
704 * Lerp both halves.
705 */
706
707 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
708 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);
709
710 res = lp_build_pack2(bld->builder, wide_type, type, resl, resh);
711 } else {
712 res = lp_build_lerp_simple(bld, x, v0, v1);
713 }
714
715 return res;
716 }
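
/*
 * Note on the scale step above: x + (x >> 7) maps an 8-bit weight from
 * [0, 255] onto [0, 256] (0 -> 0, 128 -> 129, 255 -> 256), so that the
 * subsequent fixed-point lerp returns exactly v1 for x = 255 instead of
 * falling just short. Roughly, per element:
 *
 *    x += x >> 7;                        // [0, 255] -> [0, 256]
 *    res = v0 + ((x * (v1 - v0)) >> 8);  // done in 16-bit intermediates
 */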
717
718
719 LLVMValueRef
720 lp_build_lerp_2d(struct lp_build_context *bld,
721 LLVMValueRef x,
722 LLVMValueRef y,
723 LLVMValueRef v00,
724 LLVMValueRef v01,
725 LLVMValueRef v10,
726 LLVMValueRef v11)
727 {
728 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
729 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
730 return lp_build_lerp(bld, y, v0, v1);
731 }
732
733
734 /**
735 * Generate min(a, b)
736 * Do checks for special cases.
737 */
738 LLVMValueRef
739 lp_build_min(struct lp_build_context *bld,
740 LLVMValueRef a,
741 LLVMValueRef b)
742 {
743 assert(lp_check_value(bld->type, a));
744 assert(lp_check_value(bld->type, b));
745
746 if(a == bld->undef || b == bld->undef)
747 return bld->undef;
748
749 if(a == b)
750 return a;
751
752 if(bld->type.norm) {
753 if(a == bld->zero || b == bld->zero)
754 return bld->zero;
755 if(a == bld->one)
756 return b;
757 if(b == bld->one)
758 return a;
759 }
760
761 return lp_build_min_simple(bld, a, b);
762 }
763
764
765 /**
766 * Generate max(a, b)
767 * Do checks for special cases.
768 */
769 LLVMValueRef
770 lp_build_max(struct lp_build_context *bld,
771 LLVMValueRef a,
772 LLVMValueRef b)
773 {
774 assert(lp_check_value(bld->type, a));
775 assert(lp_check_value(bld->type, b));
776
777 if(a == bld->undef || b == bld->undef)
778 return bld->undef;
779
780 if(a == b)
781 return a;
782
783 if(bld->type.norm) {
784 if(a == bld->one || b == bld->one)
785 return bld->one;
786 if(a == bld->zero)
787 return b;
788 if(b == bld->zero)
789 return a;
790 }
791
792 return lp_build_max_simple(bld, a, b);
793 }
794
795
796 /**
797 * Generate clamp(a, min, max)
798 * Do checks for special cases.
799 */
800 LLVMValueRef
801 lp_build_clamp(struct lp_build_context *bld,
802 LLVMValueRef a,
803 LLVMValueRef min,
804 LLVMValueRef max)
805 {
806 assert(lp_check_value(bld->type, a));
807 assert(lp_check_value(bld->type, min));
808 assert(lp_check_value(bld->type, max));
809
810 a = lp_build_min(bld, a, max);
811 a = lp_build_max(bld, a, min);
812 return a;
813 }
814
815
816 /**
817 * Generate abs(a)
818 */
819 LLVMValueRef
820 lp_build_abs(struct lp_build_context *bld,
821 LLVMValueRef a)
822 {
823 const struct lp_type type = bld->type;
824 LLVMTypeRef vec_type = lp_build_vec_type(type);
825
826 assert(lp_check_value(type, a));
827
828 if(!type.sign)
829 return a;
830
831 if(type.floating) {
832 /* Mask out the sign bit */
833 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
834 unsigned long long absMask = ~(1ULL << (type.width - 1));
835 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
836 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
837 a = LLVMBuildAnd(bld->builder, a, mask, "");
838 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
839 return a;
840 }
841
842 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
843 switch(type.width) {
844 case 8:
845 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
846 case 16:
847 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
848 case 32:
849 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
850 }
851 }
852
853 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
854 }
855
856
857 LLVMValueRef
858 lp_build_negate(struct lp_build_context *bld,
859 LLVMValueRef a)
860 {
861 assert(lp_check_value(bld->type, a));
862
863 #if HAVE_LLVM >= 0x0207
864 if (bld->type.floating)
865 a = LLVMBuildFNeg(bld->builder, a, "");
866 else
867 #endif
868 a = LLVMBuildNeg(bld->builder, a, "");
869
870 return a;
871 }
872
873
874 /** Return -1, 0 or +1 depending on the sign of a */
875 LLVMValueRef
876 lp_build_sgn(struct lp_build_context *bld,
877 LLVMValueRef a)
878 {
879 const struct lp_type type = bld->type;
880 LLVMValueRef cond;
881 LLVMValueRef res;
882
883 assert(lp_check_value(type, a));
884
885 /* Handle non-zero case */
886 if(!type.sign) {
887 /* if not zero then sign must be positive */
888 res = bld->one;
889 }
890 else if(type.floating) {
891 LLVMTypeRef vec_type;
892 LLVMTypeRef int_type;
893 LLVMValueRef mask;
894 LLVMValueRef sign;
895 LLVMValueRef one;
896 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
897
898 int_type = lp_build_int_vec_type(type);
899 vec_type = lp_build_vec_type(type);
900 mask = lp_build_const_int_vec(type, maskBit);
901
902       /* Take the sign bit and OR it into the constant 1.0 */
903 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
904 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
905 one = LLVMConstBitCast(bld->one, int_type);
906 res = LLVMBuildOr(bld->builder, sign, one, "");
907 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
908 }
909 else
910 {
911 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
912 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
913 res = lp_build_select(bld, cond, bld->one, minus_one);
914 }
915
916 /* Handle zero */
917 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
918 res = lp_build_select(bld, cond, bld->zero, res);
919
920 return res;
921 }
922
923
924 /**
925 * Set the sign of float vector 'a' according to 'sign'.
926 * If sign==0, return abs(a).
927 * If sign==1, return -abs(a);
928 * Other values for sign produce undefined results.
929 */
930 LLVMValueRef
931 lp_build_set_sign(struct lp_build_context *bld,
932 LLVMValueRef a, LLVMValueRef sign)
933 {
934 const struct lp_type type = bld->type;
935 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
936 LLVMTypeRef vec_type = lp_build_vec_type(type);
937 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
938 LLVMValueRef mask = lp_build_const_int_vec(type,
939 ~((unsigned long long) 1 << (type.width - 1)));
940 LLVMValueRef val, res;
941
942 assert(type.floating);
943 assert(lp_check_value(type, a));
944
945 /* val = reinterpret_cast<int>(a) */
946 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
947 /* val = val & mask */
948 val = LLVMBuildAnd(bld->builder, val, mask, "");
949 /* sign = sign << shift */
950 sign = LLVMBuildShl(bld->builder, sign, shift, "");
951 /* res = val | sign */
952 res = LLVMBuildOr(bld->builder, val, sign, "");
953 /* res = reinterpret_cast<float>(res) */
954 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
955
956 return res;
957 }
958
959
960 /**
961 * Convert vector of (or scalar) int to vector of (or scalar) float.
962 */
963 LLVMValueRef
964 lp_build_int_to_float(struct lp_build_context *bld,
965 LLVMValueRef a)
966 {
967 const struct lp_type type = bld->type;
968 LLVMTypeRef vec_type = lp_build_vec_type(type);
969
970 assert(type.floating);
971
972 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
973 }
974
975
976
977 enum lp_build_round_sse41_mode
978 {
979 LP_BUILD_ROUND_SSE41_NEAREST = 0,
980 LP_BUILD_ROUND_SSE41_FLOOR = 1,
981 LP_BUILD_ROUND_SSE41_CEIL = 2,
982 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
983 };
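
/*
 * These values match bits 1:0 of the rounding-control immediate of the
 * SSE4.1 ROUNDPS/ROUNDPD/ROUNDSS/ROUNDSD instructions (00 = nearest even,
 * 01 = toward -Inf, 10 = toward +Inf, 11 = toward zero), so they can be
 * passed straight through as the intrinsic's last argument below.
 */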
984
985
986 static INLINE LLVMValueRef
987 lp_build_round_sse41(struct lp_build_context *bld,
988 LLVMValueRef a,
989 enum lp_build_round_sse41_mode mode)
990 {
991 const struct lp_type type = bld->type;
992 LLVMTypeRef i32t = LLVMInt32Type();
993 const char *intrinsic;
994 LLVMValueRef res;
995
996 assert(type.floating);
997
998 assert(lp_check_value(type, a));
999 assert(util_cpu_caps.has_sse4_1);
1000
1001 if (type.length == 1) {
1002 LLVMTypeRef vec_type;
1003 LLVMValueRef undef;
1004 LLVMValueRef args[3];
1005 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1006
1007 switch(type.width) {
1008 case 32:
1009 intrinsic = "llvm.x86.sse41.round.ss";
1010 break;
1011 case 64:
1012 intrinsic = "llvm.x86.sse41.round.sd";
1013 break;
1014 default:
1015 assert(0);
1016 return bld->undef;
1017 }
1018
1019 vec_type = LLVMVectorType(bld->elem_type, 4);
1020
1021 undef = LLVMGetUndef(vec_type);
1022
1023 args[0] = undef;
1024 args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
1025 args[2] = LLVMConstInt(i32t, mode, 0);
1026
1027 res = lp_build_intrinsic(bld->builder, intrinsic,
1028 vec_type, args, Elements(args));
1029
1030 res = LLVMBuildExtractElement(bld->builder, res, index0, "");
1031 }
1032 else {
1033 assert(type.width*type.length == 128);
1034
1035 switch(type.width) {
1036 case 32:
1037 intrinsic = "llvm.x86.sse41.round.ps";
1038 break;
1039 case 64:
1040 intrinsic = "llvm.x86.sse41.round.pd";
1041 break;
1042 default:
1043 assert(0);
1044 return bld->undef;
1045 }
1046
1047 res = lp_build_intrinsic_binary(bld->builder, intrinsic,
1048 bld->vec_type, a,
1049 LLVMConstInt(i32t, mode, 0));
1050 }
1051
1052 return res;
1053 }
1054
1055
1056 /**
1057 * Return the integer part of a float (vector) value. The returned value is
1058 * a float (vector).
1059  * Ex: trunc(-1.5) = -1.0
1060 */
1061 LLVMValueRef
1062 lp_build_trunc(struct lp_build_context *bld,
1063 LLVMValueRef a)
1064 {
1065 const struct lp_type type = bld->type;
1066
1067 assert(type.floating);
1068 assert(lp_check_value(type, a));
1069
1070 if (util_cpu_caps.has_sse4_1 &&
1071 (type.length == 1 || type.width*type.length == 128)) {
1072 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
1073 }
1074 else {
1075 LLVMTypeRef vec_type = lp_build_vec_type(type);
1076 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1077 LLVMValueRef res;
1078 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1079 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1080 return res;
1081 }
1082 }
1083
1084
1085 /**
1086 * Return float (vector) rounded to nearest integer (vector). The returned
1087 * value is a float (vector).
1088 * Ex: round(0.9) = 1.0
1089 * Ex: round(-1.5) = -2.0
1090 */
1091 LLVMValueRef
1092 lp_build_round(struct lp_build_context *bld,
1093 LLVMValueRef a)
1094 {
1095 const struct lp_type type = bld->type;
1096
1097 assert(type.floating);
1098 assert(lp_check_value(type, a));
1099
1100 if (util_cpu_caps.has_sse4_1 &&
1101 (type.length == 1 || type.width*type.length == 128)) {
1102 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1103 }
1104 else {
1105 LLVMTypeRef vec_type = lp_build_vec_type(type);
1106 LLVMValueRef res;
1107 res = lp_build_iround(bld, a);
1108 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1109 return res;
1110 }
1111 }
1112
1113
1114 /**
1115 * Return floor of float (vector), result is a float (vector)
1116 * Ex: floor(1.1) = 1.0
1117 * Ex: floor(-1.1) = -2.0
1118 */
1119 LLVMValueRef
1120 lp_build_floor(struct lp_build_context *bld,
1121 LLVMValueRef a)
1122 {
1123 const struct lp_type type = bld->type;
1124
1125 assert(type.floating);
1126 assert(lp_check_value(type, a));
1127
1128 if (util_cpu_caps.has_sse4_1 &&
1129 (type.length == 1 || type.width*type.length == 128)) {
1130 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1131 }
1132 else {
1133 LLVMTypeRef vec_type = lp_build_vec_type(type);
1134 LLVMValueRef res;
1135 res = lp_build_ifloor(bld, a);
1136 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1137 return res;
1138 }
1139 }
1140
1141
1142 /**
1143 * Return ceiling of float (vector), returning float (vector).
1144 * Ex: ceil( 1.1) = 2.0
1145 * Ex: ceil(-1.1) = -1.0
1146 */
1147 LLVMValueRef
1148 lp_build_ceil(struct lp_build_context *bld,
1149 LLVMValueRef a)
1150 {
1151 const struct lp_type type = bld->type;
1152
1153 assert(type.floating);
1154 assert(lp_check_value(type, a));
1155
1156 if (util_cpu_caps.has_sse4_1 &&
1157 (type.length == 1 || type.width*type.length == 128)) {
1158 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1159 }
1160 else {
1161 LLVMTypeRef vec_type = lp_build_vec_type(type);
1162 LLVMValueRef res;
1163 res = lp_build_iceil(bld, a);
1164 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1165 return res;
1166 }
1167 }
1168
1169
1170 /**
1171 * Return fractional part of 'a' computed as a - floor(a)
1172 * Typically used in texture coord arithmetic.
1173 */
1174 LLVMValueRef
1175 lp_build_fract(struct lp_build_context *bld,
1176 LLVMValueRef a)
1177 {
1178 assert(bld->type.floating);
1179 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1180 }
1181
1182
1183 /**
1184 * Return the integer part of a float (vector) value. The returned value is
1185 * an integer (vector).
1186  * Ex: itrunc(-1.5) = -1
1187 */
1188 LLVMValueRef
1189 lp_build_itrunc(struct lp_build_context *bld,
1190 LLVMValueRef a)
1191 {
1192 const struct lp_type type = bld->type;
1193 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1194
1195 assert(type.floating);
1196 assert(lp_check_value(type, a));
1197
1198 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1199 }
1200
1201
1202 /**
1203 * Return float (vector) rounded to nearest integer (vector). The returned
1204 * value is an integer (vector).
1205 * Ex: iround(0.9) = 1
1206 * Ex: iround(-1.5) = -2
1207 */
1208 LLVMValueRef
1209 lp_build_iround(struct lp_build_context *bld,
1210 LLVMValueRef a)
1211 {
1212 const struct lp_type type = bld->type;
1213 LLVMTypeRef int_vec_type = bld->int_vec_type;
1214 LLVMValueRef res;
1215
1216 assert(type.floating);
1217
1218 assert(lp_check_value(type, a));
1219
1220 if (util_cpu_caps.has_sse4_1 &&
1221 (type.length == 1 || type.width*type.length == 128)) {
1222 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1223 }
1224 else {
1225 LLVMValueRef half;
1226
1227 half = lp_build_const_vec(type, 0.5);
1228
1229 if (type.sign) {
1230 LLVMTypeRef vec_type = bld->vec_type;
1231 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1232 LLVMValueRef sign;
1233
1234 /* get sign bit */
1235 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1236 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1237
1238 /* sign * 0.5 */
1239 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1240 half = LLVMBuildOr(bld->builder, sign, half, "");
1241 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1242 }
1243
1244 res = LLVMBuildFAdd(bld->builder, a, half, "");
1245 }
1246
1247 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1248
1249 return res;
1250 }
1251
1252
1253 /**
1254 * Return floor of float (vector), result is an int (vector)
1255  * Ex: ifloor(1.1) = 1
1256  * Ex: ifloor(-1.1) = -2
1257 */
1258 LLVMValueRef
1259 lp_build_ifloor(struct lp_build_context *bld,
1260 LLVMValueRef a)
1261 {
1262 const struct lp_type type = bld->type;
1263 LLVMTypeRef int_vec_type = bld->int_vec_type;
1264 LLVMValueRef res;
1265
1266 assert(type.floating);
1267 assert(lp_check_value(type, a));
1268
1269 if (util_cpu_caps.has_sse4_1 &&
1270 (type.length == 1 || type.width*type.length == 128)) {
1271 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1272 }
1273 else {
1274 res = a;
1275
1276 if (type.sign) {
1277          /* Offset negative values by almost -1 so that truncation gives floor */
1278 LLVMTypeRef vec_type = bld->vec_type;
1279 unsigned mantissa = lp_mantissa(type);
1280 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1281 LLVMValueRef sign;
1282 LLVMValueRef offset;
1283
1284 /* sign = a < 0 ? ~0 : 0 */
1285 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1286 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1287 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1288
1289 /* offset = -0.99999(9)f */
1290 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1291 offset = LLVMConstBitCast(offset, int_vec_type);
1292
1293 /* offset = a < 0 ? offset : 0.0f */
1294 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1295 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1296
1297 res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res");
1298 }
1299 }
1300
1301    /* truncate toward zero */
1302 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1303
1304 return res;
1305 }
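
/*
 * The non-SSE4.1 path above boils down to the following scalar sketch
 * (hypothetical reference, not used by the JIT path): truncation already
 * equals floor for non-negative values; negative values are first nudged
 * down by almost one, so that exact integers survive (e.g. -2.0 becomes
 * -2.99999881 and still truncates to -2, while -1.1 becomes -2.09999881
 * and truncates to -2). Like the code above, it is off by one for negative
 * inputs within ~1e-6 below an integer.
 */
#if 0
static int
ifloor_ref(float a)
{
   if (a < 0.0f)
      a += -0.99999881f;   /* -(2^23 - 10) / 2^23, as above */
   return (int)a;          /* the cast truncates toward zero */
}
#endif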
1306
1307
1308 /**
1309 * Return ceiling of float (vector), returning int (vector).
1310 * Ex: iceil( 1.1) = 2
1311 * Ex: iceil(-1.1) = -1
1312 */
1313 LLVMValueRef
1314 lp_build_iceil(struct lp_build_context *bld,
1315 LLVMValueRef a)
1316 {
1317 const struct lp_type type = bld->type;
1318 LLVMTypeRef int_vec_type = bld->int_vec_type;
1319 LLVMValueRef res;
1320
1321 assert(type.floating);
1322 assert(lp_check_value(type, a));
1323
1324 if (util_cpu_caps.has_sse4_1 &&
1325 (type.length == 1 || type.width*type.length == 128)) {
1326 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1327 }
1328 else {
1329 LLVMTypeRef vec_type = bld->vec_type;
1330 unsigned mantissa = lp_mantissa(type);
1331 LLVMValueRef offset;
1332
1333 /* offset = 0.99999(9)f */
1334 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1335
1336 if (type.sign) {
1337 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1338 LLVMValueRef sign;
1339
1340 /* sign = a < 0 ? 0 : ~0 */
1341 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1342 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1343 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1344 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1345
1346 /* offset = a < 0 ? 0.0 : offset */
1347 offset = LLVMConstBitCast(offset, int_vec_type);
1348 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1349 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1350 }
1351
1352 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1353 }
1354
1355    /* truncate toward zero */
1356 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1357
1358 return res;
1359 }
1360
1361
1362 LLVMValueRef
1363 lp_build_sqrt(struct lp_build_context *bld,
1364 LLVMValueRef a)
1365 {
1366 const struct lp_type type = bld->type;
1367 LLVMTypeRef vec_type = lp_build_vec_type(type);
1368 char intrinsic[32];
1369
1370 assert(lp_check_value(type, a));
1371
1372    /* TODO: optimize the constant case */
1374
1375 assert(type.floating);
1376 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1377
1378 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1379 }
1380
1381
1382 /**
1383  * Do one Newton-Raphson step to improve reciprocal precision:
1384 *
1385 * x_{i+1} = x_i * (2 - a * x_i)
1386 *
1387 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1388 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1389  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
1390 * halo. It would be necessary to clamp the argument to prevent this.
1391 *
1392 * See also:
1393 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1394 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1395 */
1396 static INLINE LLVMValueRef
1397 lp_build_rcp_refine(struct lp_build_context *bld,
1398 LLVMValueRef a,
1399 LLVMValueRef rcp_a)
1400 {
1401 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1402 LLVMValueRef res;
1403
1404 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1405 res = LLVMBuildFSub(bld->builder, two, res, "");
1406 res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
1407
1408 return res;
1409 }
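
/*
 * Convergence note: if rcp_a = (1/a)*(1 + e) for a relative error e, then
 *
 *    rcp_a*(2 - a*rcp_a) = (1/a)*(1 + e)*(1 - e) = (1/a)*(1 - e^2)
 *
 * so each step squares the relative error, roughly doubling the number of
 * correct bits (e.g. a ~12-bit RCPPS estimate becomes ~24-bit, modulo
 * rounding of the intermediate operations).
 */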
1410
1411
1412 LLVMValueRef
1413 lp_build_rcp(struct lp_build_context *bld,
1414 LLVMValueRef a)
1415 {
1416 const struct lp_type type = bld->type;
1417
1418 assert(lp_check_value(type, a));
1419
1420 if(a == bld->zero)
1421 return bld->undef;
1422 if(a == bld->one)
1423 return bld->one;
1424 if(a == bld->undef)
1425 return bld->undef;
1426
1427 assert(type.floating);
1428
1429 if(LLVMIsConstant(a))
1430 return LLVMConstFDiv(bld->one, a);
1431
1432 /*
1433 * We don't use RCPPS because:
1434    *  - it only has 10 bits of precision
1435    *  - it doesn't even get the reciprocal of 1.0 exactly
1436    *  - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1437    *  - for recent processors the benefit over DIVPS is marginal, and case
1438    *    dependent
1439 *
1440 * We could still use it on certain processors if benchmarks show that the
1441    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
1442 * particular uses that require less workarounds.
1443 */
1444
1445 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1446 const unsigned num_iterations = 0;
1447 LLVMValueRef res;
1448 unsigned i;
1449
1450 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1451
1452 for (i = 0; i < num_iterations; ++i) {
1453 res = lp_build_rcp_refine(bld, a, res);
1454 }
1455
1456 return res;
1457 }
1458
1459 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1460 }
1461
1462
1463 /**
1464 * Do one Newton-Raphson step to improve rsqrt precision:
1465 *
1466 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1467 *
1468 * See also:
1469 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1470 */
1471 static INLINE LLVMValueRef
1472 lp_build_rsqrt_refine(struct lp_build_context *bld,
1473 LLVMValueRef a,
1474 LLVMValueRef rsqrt_a)
1475 {
1476 LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
1477 LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
1478 LLVMValueRef res;
1479
1480 res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
1481 res = LLVMBuildFMul(bld->builder, a, res, "");
1482 res = LLVMBuildFSub(bld->builder, three, res, "");
1483 res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
1484 res = LLVMBuildFMul(bld->builder, half, res, "");
1485
1486 return res;
1487 }
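
/*
 * Convergence note: if rsqrt_a = (1/sqrt(a))*(1 + e), then
 *
 *    0.5*rsqrt_a*(3 - a*rsqrt_a^2) = (1/sqrt(a))*(1 - 1.5*e^2 - 0.5*e^3)
 *
 * so the relative error again shrinks quadratically, taking a ~12-bit
 * RSQRTPS estimate to roughly single precision in one step.
 */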
1488
1489
1490 /**
1491 * Generate 1/sqrt(a)
1492 */
1493 LLVMValueRef
1494 lp_build_rsqrt(struct lp_build_context *bld,
1495 LLVMValueRef a)
1496 {
1497 const struct lp_type type = bld->type;
1498
1499 assert(lp_check_value(type, a));
1500
1501 assert(type.floating);
1502
1503 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1504 const unsigned num_iterations = 0;
1505 LLVMValueRef res;
1506 unsigned i;
1507
1508 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1509
1510 for (i = 0; i < num_iterations; ++i) {
1511 res = lp_build_rsqrt_refine(bld, a, res);
1512 }
1513
1514 return res;
1515 }
1516
1517 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1518 }
1519
1520
1521 static inline LLVMValueRef
1522 lp_build_const_v4si(unsigned long value)
1523 {
1524 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1525 LLVMValueRef elements[4] = { element, element, element, element };
1526 return LLVMConstVector(elements, 4);
1527 }
1528
1529 static inline LLVMValueRef
1530 lp_build_const_v4sf(float value)
1531 {
1532 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1533 LLVMValueRef elements[4] = { element, element, element, element };
1534 return LLVMConstVector(elements, 4);
1535 }
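
/*
 * The sin/cos implementations below appear to closely follow Julien
 * Pommier's sse_mathfun.h translation of the cephes routines (hence the
 * _mm_* and _PS_CONST comments). Roughly, the scalar recipe both share is:
 *
 *    j = (int)(|x| * 4/Pi);                 // octant index
 *    j = (j + 1) & ~1;                      // round up to even
 *    x = ((|x| + j*DP1) + j*DP2) + j*DP3;   // == x - j*Pi/4 in extended
 *                                           //    precision (DP1..DP3 < 0)
 *
 * then evaluate either the sine or the cosine minimax polynomial on
 * [0, Pi/4] depending on (j & 2), and fix up the sign from (j & 4) and,
 * for sin, the sign of the original argument.
 */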
1536
1537
1538 /**
1539 * Generate sin(a) using SSE2
1540 */
1541 LLVMValueRef
1542 lp_build_sin(struct lp_build_context *bld,
1543 LLVMValueRef a)
1544 {
1545 struct lp_type int_type = lp_int_type(bld->type);
1546 LLVMBuilderRef b = bld->builder;
1547 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1548 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1549
1550 /*
1551 * take the absolute value,
1552 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1553 */
1554
1555 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1556 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1557
1558 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1559 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1560
1561 /*
1562 * extract the sign bit (upper one)
1563 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1564 */
1565 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1566 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1567
1568 /*
1569 * scale by 4/Pi
1570 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1571 */
1572
1573 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1574 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1575
1576 /*
1577 * store the integer part of y in mm0
1578 * emm2 = _mm_cvttps_epi32(y);
1579 */
1580
1581 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1582
1583 /*
1584 * j=(j+1) & (~1) (see the cephes sources)
1585 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1586 */
1587
1588 LLVMValueRef all_one = lp_build_const_v4si(1);
1589 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1590 /*
1591 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1592 */
1593 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1594 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1595
1596 /*
1597 * y = _mm_cvtepi32_ps(emm2);
1598 */
1599 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1600
1601 /* get the swap sign flag
1602 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1603 */
1604 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1605 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1606
1607 /*
1608 * emm2 = _mm_slli_epi32(emm0, 29);
1609 */
1610 LLVMValueRef const_29 = lp_build_const_v4si(29);
1611 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1612
1613 /*
1614    * get the polynomial selection mask
1615    * there is one polynomial for 0 <= x <= Pi/4
1616    * and another one for Pi/4 < x <= Pi/2
1617 * Both branches will be computed.
1618 *
1619 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1620 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1621 */
1622
1623 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1624 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1625 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1626 emm2_3, lp_build_const_v4si(0));
1627 /*
1628 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1629 */
1630 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1631
1632 /*
1633 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1634 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1635 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1636 */
1637 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1638 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1639 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1640
1641 /*
1642 * The magic pass: "Extended precision modular arithmetic"
1643 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1644 * xmm1 = _mm_mul_ps(y, xmm1);
1645 * xmm2 = _mm_mul_ps(y, xmm2);
1646 * xmm3 = _mm_mul_ps(y, xmm3);
1647 */
1648 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1649 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1650 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1651
1652 /*
1653 * x = _mm_add_ps(x, xmm1);
1654 * x = _mm_add_ps(x, xmm2);
1655 * x = _mm_add_ps(x, xmm3);
1656 */
1657
1658 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1659 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1660 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1661
1662 /*
1663    * Evaluate the first polynomial (0 <= x <= Pi/4)
1664 *
1665 * z = _mm_mul_ps(x,x);
1666 */
1667 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1668
1669 /*
1670 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1671 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1672 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1673 */
1674 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1675 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1676 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1677
1678 /*
1679 * y = *(v4sf*)_ps_coscof_p0;
1680 * y = _mm_mul_ps(y, z);
1681 */
1682 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1683 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1684 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1685 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1686 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1687 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1688
1689
1690 /*
1691 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1692 * y = _mm_sub_ps(y, tmp);
1693 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1694 */
1695 LLVMValueRef half = lp_build_const_v4sf(0.5);
1696 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1697 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1698 LLVMValueRef one = lp_build_const_v4sf(1.0);
1699 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1700
1701 /*
1702 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1703 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1704 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1705 */
1706 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1707 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1708 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1709
1710 /*
1711    * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1712 *
1713 * y2 = *(v4sf*)_ps_sincof_p0;
1714 * y2 = _mm_mul_ps(y2, z);
1715 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1716 * y2 = _mm_mul_ps(y2, z);
1717 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1718 * y2 = _mm_mul_ps(y2, z);
1719 * y2 = _mm_mul_ps(y2, x);
1720 * y2 = _mm_add_ps(y2, x);
1721 */
1722
1723 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1724 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1725 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1726 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1727 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1728 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1729 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1730
1731 /*
1732    * select the correct result from the two polynomials
1733 * xmm3 = poly_mask;
1734 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1735 * y = _mm_andnot_ps(xmm3, y);
1736 * y = _mm_add_ps(y,y2);
1737 */
1738 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1739 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1740 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1741 LLVMValueRef inv = lp_build_const_v4si(~0);
1742 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1743 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1744 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1745
1746 /*
1747 * update the sign
1748 * y = _mm_xor_ps(y, sign_bit);
1749 */
1750 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1751 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1752 return y_result;
1753 }
1754
1755
1756 /**
1757 * Generate cos(a) using SSE2
1758 */
1759 LLVMValueRef
1760 lp_build_cos(struct lp_build_context *bld,
1761 LLVMValueRef a)
1762 {
1763 struct lp_type int_type = lp_int_type(bld->type);
1764 LLVMBuilderRef b = bld->builder;
1765 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1766 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1767
1768 /*
1769 * take the absolute value,
1770 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1771 */
1772
1773 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1774 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1775
1776 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1777 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1778
1779 /*
1780 * scale by 4/Pi
1781 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1782 */
1783
1784 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1785 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1786
1787 /*
1788 * store the integer part of y in mm0
1789 * emm2 = _mm_cvttps_epi32(y);
1790 */
1791
1792 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1793
1794 /*
1795 * j=(j+1) & (~1) (see the cephes sources)
1796 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1797 */
1798
1799 LLVMValueRef all_one = lp_build_const_v4si(1);
1800 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1801 /*
1802 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1803 */
1804 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1805 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1806
1807 /*
1808 * y = _mm_cvtepi32_ps(emm2);
1809 */
1810 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1811
1812
1813 /*
1814 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1815 */
1816 LLVMValueRef const_2 = lp_build_const_v4si(2);
1817 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1818
1819
1820 /* get the swap sign flag
1821 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1822 */
1823 LLVMValueRef inv = lp_build_const_v4si(~0);
1824 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1825 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1826 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1827
1828 /*
1829 * emm2 = _mm_slli_epi32(emm0, 29);
1830 */
1831 LLVMValueRef const_29 = lp_build_const_v4si(29);
1832 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1833
1834 /*
1835    * get the polynomial selection mask
1836    * there is one polynomial for 0 <= x <= Pi/4
1837    * and another one for Pi/4 < x <= Pi/2
1838 * Both branches will be computed.
1839 *
1840 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1841 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1842 */
1843
1844 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1845 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1846 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1847 emm2_3, lp_build_const_v4si(0));
1848
1849 /*
1850 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1851 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1852 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1853 */
1854 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1855 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1856 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1857
1858 /*
1859 * The magic pass: "Extended precision modular arithmetic"
1860 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1861 * xmm1 = _mm_mul_ps(y, xmm1);
1862 * xmm2 = _mm_mul_ps(y, xmm2);
1863 * xmm3 = _mm_mul_ps(y, xmm3);
1864 */
1865 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1866 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1867 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1868
1869 /*
1870 * x = _mm_add_ps(x, xmm1);
1871 * x = _mm_add_ps(x, xmm2);
1872 * x = _mm_add_ps(x, xmm3);
1873 */
1874
1875 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1876 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1877 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1878
1879 /*
1880    * Evaluate the first polynomial (0 <= x <= Pi/4)
1881 *
1882 * z = _mm_mul_ps(x,x);
1883 */
1884 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1885
1886 /*
1887 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1888 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1889 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1890 */
1891 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1892 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1893 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1894
1895 /*
1896 * y = *(v4sf*)_ps_coscof_p0;
1897 * y = _mm_mul_ps(y, z);
1898 */
1899 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1900 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1901 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1902 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1903 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1904 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1905
1906
1907 /*
1908 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1909 * y = _mm_sub_ps(y, tmp);
1910 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1911 */
1912 LLVMValueRef half = lp_build_const_v4sf(0.5);
1913 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1914 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1915 LLVMValueRef one = lp_build_const_v4sf(1.0);
1916 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1917
1918 /*
1919 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1920 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1921 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1922 */
1923 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1924 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1925 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1926
1927 /*
1928    * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1929 *
1930 * y2 = *(v4sf*)_ps_sincof_p0;
1931 * y2 = _mm_mul_ps(y2, z);
1932 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1933 * y2 = _mm_mul_ps(y2, z);
1934 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1935 * y2 = _mm_mul_ps(y2, z);
1936 * y2 = _mm_mul_ps(y2, x);
1937 * y2 = _mm_add_ps(y2, x);
1938 */
1939
1940 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1941 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1942 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1943 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1944 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1945 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1946 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1947
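   /*
    * Expanded, y2_9 is the classic sine kernel over z = x*x:
    *
    *    sin(x) ~= x + x*z*((sincof_p0*z + sincof_p1)*z + sincof_p2)
    *            = x + sincof_p2*x^3 + sincof_p1*x^5 + sincof_p0*x^7
    */
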
1948 /*
1949 * select the correct result from the two polynomials
1950 * xmm3 = poly_mask;
1951 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1952 * y = _mm_andnot_ps(xmm3, y);
1953 * y = _mm_add_ps(y,y2);
1954 */
1955 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1956 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1957 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1958 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1959 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1960 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1961
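   /*
    * The two results are merged with the usual branchless select idiom,
    *
    *    y = (poly_mask & y2) | (~poly_mask & y)
    *
    * poly_mask is all-ones or all-zeros per lane, so the two masked terms
    * never overlap and the final Or can be expressed as the integer Add
    * above.
    */
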
1962 /*
1963 * update the sign
1964 * y = _mm_xor_ps(y, sign_bit);
1965 */
1966 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1967 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1968 return y_result;
1969 }
1970
1971
1972 /**
1973 * Generate pow(x, y)
1974 */
1975 LLVMValueRef
1976 lp_build_pow(struct lp_build_context *bld,
1977 LLVMValueRef x,
1978 LLVMValueRef y)
1979 {
1980 /* TODO: optimize the constant case */
1981 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
1982 LLVMIsConstant(x) && LLVMIsConstant(y)) {
1983 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1984 __FUNCTION__);
1985 }
1986
1987 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1988 }
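
/*
 * Scalar reference for the identity used above (an illustrative sketch
 * only, not part of gallivm; assumes C99 <math.h> exp2f/log2f):
 */
#if 0
static float
ref_powf(float x, float y)
{
   /* pow(x, y) = 2^(y * log2(x)), valid for x > 0 */
   return exp2f(y * log2f(x));
}
#endif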
1989
1990
1991 /**
1992 * Generate exp(x)
1993 */
1994 LLVMValueRef
1995 lp_build_exp(struct lp_build_context *bld,
1996 LLVMValueRef x)
1997 {
1998 /* log2(e) = 1/log(2) */
1999 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
2000
2001 assert(lp_check_value(bld->type, x));
2002
2003    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2004 }
2005
2006
2007 /**
2008 * Generate log(x)
2009 */
2010 LLVMValueRef
2011 lp_build_log(struct lp_build_context *bld,
2012 LLVMValueRef x)
2013 {
2014 /* log(2) */
2015 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
2016
2017 assert(lp_check_value(bld->type, x));
2018
2019    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2020 }
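
/*
 * Scalar reference for the two identities above (illustrative sketch only,
 * not part of gallivm; assumes C99 <math.h> exp2f/log2f):
 */
#if 0
static float
ref_expf(float x)
{
   /* e^x = 2^(x * log2(e)) */
   return exp2f(x * 1.4426950408889634f);
}

static float
ref_logf(float x)
{
   /* ln(x) = log2(x) * ln(2) */
   return 0.69314718055994529f * log2f(x);
}
#endif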
2021
2022
2023 /**
2024 * Generate polynomial.
2025 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2026 */
2027 static LLVMValueRef
2028 lp_build_polynomial(struct lp_build_context *bld,
2029 LLVMValueRef x,
2030 const double *coeffs,
2031 unsigned num_coeffs)
2032 {
2033 const struct lp_type type = bld->type;
2034 LLVMValueRef res = NULL;
2035 unsigned i;
2036
2037 assert(lp_check_value(bld->type, x));
2038
2039 /* TODO: optimize the constant case */
2040 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2041 LLVMIsConstant(x)) {
2042 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2043 __FUNCTION__);
2044 }
2045
2046 for (i = num_coeffs; i--; ) {
2047 LLVMValueRef coeff;
2048
2049 coeff = lp_build_const_vec(type, coeffs[i]);
2050
2051 if(res)
2052 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
2053 else
2054 res = coeff;
2055 }
2056
2057 if(res)
2058 return res;
2059 else
2060 return bld->undef;
2061 }
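
/*
 * Scalar sketch of the Horner evaluation performed by the loop above
 * (illustrative only): iterating from the highest coefficient down yields
 * coeffs[0] + x*(coeffs[1] + x*(coeffs[2] + ...)).
 */
#if 0
static double
ref_polynomial(double x, const double *coeffs, unsigned num_coeffs)
{
   double res = 0.0;
   unsigned i;
   for (i = num_coeffs; i--; )
      res = coeffs[i] + x * res;
   return res;
}
#endif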
2062
2063
2064 /**
2065 * Minimax polynomial fit of 2**x, in range [0, 1[
2066 */
2067 const double lp_build_exp2_polynomial[] = {
2068 #if EXP_POLY_DEGREE == 5
2069 0.999999999690134838155,
2070 0.583974334321735217258,
2071 0.164553105719676828492,
2072 0.0292811063701710962255,
2073 0.00354944426657875141846,
2074 0.000296253726543423377365
2075 #elif EXP_POLY_DEGREE == 4
2076 1.00000001502262084505,
2077 0.563586057338685991394,
2078 0.150436017652442413623,
2079 0.0243220604213317927308,
2080 0.0025359088446580436489
2081 #elif EXP_POLY_DEGREE == 3
2082 0.999925218562710312959,
2083 0.695833540494823811697,
2084 0.226067155427249155588,
2085 0.0780245226406372992967
2086 #elif EXP_POLY_DEGREE == 2
2087 1.00172476321474503578,
2088 0.657636275736077639316,
2089 0.33718943461968720704
2090 #else
2091 #error
2092 #endif
2093 };
2094
2095
2096 void
2097 lp_build_exp2_approx(struct lp_build_context *bld,
2098 LLVMValueRef x,
2099 LLVMValueRef *p_exp2_int_part,
2100 LLVMValueRef *p_frac_part,
2101 LLVMValueRef *p_exp2)
2102 {
2103 const struct lp_type type = bld->type;
2104 LLVMTypeRef vec_type = lp_build_vec_type(type);
2105 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2106 LLVMValueRef ipart = NULL;
2107 LLVMValueRef fpart = NULL;
2108 LLVMValueRef expipart = NULL;
2109 LLVMValueRef expfpart = NULL;
2110 LLVMValueRef res = NULL;
2111
2112 assert(lp_check_value(bld->type, x));
2113
2114 if(p_exp2_int_part || p_frac_part || p_exp2) {
2115 /* TODO: optimize the constant case */
2116 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2117 LLVMIsConstant(x)) {
2118 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2119 __FUNCTION__);
2120 }
2121
2122 assert(type.floating && type.width == 32);
2123
2124 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
2125 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
2126
2127 /* ipart = floor(x) */
2128 ipart = lp_build_floor(bld, x);
2129
2130 /* fpart = x - ipart */
2131 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
2132 }
2133
2134 if(p_exp2_int_part || p_exp2) {
2135 /* expipart = (float) (1 << ipart) */
2136 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
2137 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
2138 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
2139 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
2140 }
2141
2142 if(p_exp2) {
2143 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2144 Elements(lp_build_exp2_polynomial));
2145
2146 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2147 }
2148
2149 if(p_exp2_int_part)
2150 *p_exp2_int_part = expipart;
2151
2152 if(p_frac_part)
2153 *p_frac_part = fpart;
2154
2155 if(p_exp2)
2156 *p_exp2 = res;
2157 }
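
/*
 * Scalar sketch of the approximation above, using the EXP_POLY_DEGREE == 3
 * coefficients (illustrative only, not part of gallivm; assumes <math.h>,
 * <stdint.h> and IEEE-754 single precision):
 */
#if 0
static float
ref_exp2f(float x)
{
   union { int32_t i; float f; } u;
   float ipart, fpart, p;

   /* assumes x already clamped to roughly [-126, 129] as above */
   ipart = floorf(x);
   fpart = x - ipart;

   /* 2^ipart built directly in the exponent bits */
   u.i = ((int32_t)ipart + 127) << 23;

   /* minimax polynomial for 2^fpart on [0, 1[ */
   p = 0.999925218562710312959f +
       fpart * (0.695833540494823811697f +
       fpart * (0.226067155427249155588f +
       fpart * 0.0780245226406372992967f));

   return u.f * p;
}
#endif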
2158
2159
2160 LLVMValueRef
2161 lp_build_exp2(struct lp_build_context *bld,
2162 LLVMValueRef x)
2163 {
2164 LLVMValueRef res;
2165 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2166 return res;
2167 }
2168
2169
2170 /**
2171 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2172 * These coefficients can be generated with
2173 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2174 */
2175 const double lp_build_log2_polynomial[] = {
2176 #if LOG_POLY_DEGREE == 6
2177 3.11578814719469302614,
2178 -3.32419399085241980044,
2179 2.59883907202499966007,
2180 -1.23152682416275988241,
2181 0.318212422185251071475,
2182 -0.0344359067839062357313
2183 #elif LOG_POLY_DEGREE == 5
2184 2.8882704548164776201,
2185 -2.52074962577807006663,
2186 1.48116647521213171641,
2187 -0.465725644288844778798,
2188 0.0596515482674574969533
2189 #elif LOG_POLY_DEGREE == 4
2190 2.61761038894603480148,
2191 -1.75647175389045657003,
2192 0.688243882994381274313,
2193 -0.107254423828329604454
2194 #elif LOG_POLY_DEGREE == 3
2195 2.28330284476918490682,
2196 -1.04913055217340124191,
2197 0.204446009836232697516
2198 #else
2199 #error
2200 #endif
2201 };
2202
2203
2204 /**
2205 * See http://www.devmaster.net/forums/showthread.php?p=43580
2206 */
2207 void
2208 lp_build_log2_approx(struct lp_build_context *bld,
2209 LLVMValueRef x,
2210 LLVMValueRef *p_exp,
2211 LLVMValueRef *p_floor_log2,
2212 LLVMValueRef *p_log2)
2213 {
2214 const struct lp_type type = bld->type;
2215 LLVMTypeRef vec_type = lp_build_vec_type(type);
2216 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2217
2218 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2219 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2220 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2221
2222 LLVMValueRef i = NULL;
2223 LLVMValueRef exp = NULL;
2224 LLVMValueRef mant = NULL;
2225 LLVMValueRef logexp = NULL;
2226 LLVMValueRef logmant = NULL;
2227 LLVMValueRef res = NULL;
2228
2229 assert(lp_check_value(bld->type, x));
2230
2231 if(p_exp || p_floor_log2 || p_log2) {
2232 /* TODO: optimize the constant case */
2233 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2234 LLVMIsConstant(x)) {
2235 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2236 __FUNCTION__);
2237 }
2238
2239 assert(type.floating && type.width == 32);
2240
2241 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2242
2243 /* exp = (float) exponent(x) */
2244 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2245 }
2246
2247 if(p_floor_log2 || p_log2) {
2248 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2249 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2250 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2251 }
2252
2253 if(p_log2) {
2254 /* mant = (float) mantissa(x) */
2255 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2256 mant = LLVMBuildOr(bld->builder, mant, one, "");
2257 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2258
2259 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2260 Elements(lp_build_log2_polynomial));
2261
2262       /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2263 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2264
2265 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2266 }
2267
2268 if(p_exp) {
2269 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2270 *p_exp = exp;
2271 }
2272
2273 if(p_floor_log2)
2274 *p_floor_log2 = logexp;
2275
2276 if(p_log2)
2277 *p_log2 = res;
2278 }
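
/*
 * Scalar sketch of the approximation above, using the LOG_POLY_DEGREE == 5
 * coefficients (illustrative only, not part of gallivm; assumes <stdint.h>
 * and IEEE-754 single precision):
 */
#if 0
static float
ref_log2f(float x)
{
   union { int32_t i; float f; } u, m;
   float e, p;

   u.f = x;

   /* exponent and mantissa, extracted with the same masks as above */
   e = (float)(((u.i & 0x7f800000) >> 23) - 127);
   m.i = (u.i & 0x007fffff) | 0x3f800000;   /* mant in [1, 2[ */

   /* minimax polynomial for log2(mant)/(mant - 1) */
   p = 2.8882704548164776201f +
       m.f * (-2.52074962577807006663f +
       m.f * (1.48116647521213171641f +
       m.f * (-0.465725644288844778798f +
       m.f * 0.0596515482674574969533f)));

   /* multiplying by (mant - 1) ensures log2(1) == 0 */
   return p * (m.f - 1.0f) + e;
}
#endif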
2279
2280
2281 LLVMValueRef
2282 lp_build_log2(struct lp_build_context *bld,
2283 LLVMValueRef x)
2284 {
2285 LLVMValueRef res;
2286 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2287 return res;
2288 }
2289
2290
2291 /**
2292 * Faster (and less accurate) log2.
2293 *
2294 * log2(x) ~= floor(log2(x)) - 1 + x / 2**floor(log2(x))
2295 *
2296 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2297 */
2298 LLVMValueRef
2299 lp_build_fast_log2(struct lp_build_context *bld,
2300 LLVMValueRef x)
2301 {
2302 const struct lp_type type = bld->type;
2303 LLVMTypeRef vec_type = bld->vec_type;
2304 LLVMTypeRef int_vec_type = bld->int_vec_type;
2305
2306 unsigned mantissa = lp_mantissa(type);
2307 LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1);
2308 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2309
2310 LLVMValueRef ipart;
2311 LLVMValueRef fpart;
2312
2313 assert(lp_check_value(bld->type, x));
2314
2315 assert(type.floating);
2316
2317 x = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2318
2319 /* ipart = floor(log2(x)) - 1 */
2320 ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
2321 ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), "");
2322 ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 128), "");
2323 ipart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
2324
2325    /* fpart = x / 2**floor(log2(x)), i.e. the mantissa in [1, 2[ */
2326 fpart = LLVMBuildAnd(bld->builder, x, mantmask, "");
2327 fpart = LLVMBuildOr(bld->builder, fpart, one, "");
2328 fpart = LLVMBuildBitCast(bld->builder, fpart, vec_type, "");
2329
2330    /* floor(log2(x)) - 1 + x / 2**floor(log2(x)) ~= log2(x) */
2331 return LLVMBuildFAdd(bld->builder, ipart, fpart, "");
2332 }
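
/*
 * Scalar sketch of the trick above (illustrative only, not part of gallivm;
 * assumes <stdint.h> and IEEE-754 single precision).  The maximum absolute
 * error is roughly 0.09, reached near mantissa 1/ln(2).
 */
#if 0
static float
ref_fast_log2f(float x)
{
   union { int32_t i; float f; } u;
   float ipart;

   u.f = x;

   /* ipart = floor(log2(x)) - 1 */
   ipart = (float)(((u.i >> 23) & 255) - 128);

   /* fpart = x / 2**floor(log2(x)), in [1, 2[ */
   u.i = (u.i & 0x007fffff) | 0x3f800000;

   return ipart + u.f;
}
#endif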