/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all the basic arithmetic operations we care about
 * (most notably min/max and saturated operations), and it is often necessary
 * to resort to machine-specific intrinsics directly. The functions here hide
 * all these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


#define EXP_POLY_DEGREE 3

#define LOG_POLY_DEGREE 5

/**
 * Generate min(a, b)
 * No checks for special-case values (a or b being 0 or 1) are done here.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.min.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.min.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pminu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmins.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special-case values (a or b being 0 or 1) are done here.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.max.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.max.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxs.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(bld->builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(bld->builder, bld->one, a, "");
      else
         return LLVMBuildSub(bld->builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
         return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }
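
      /* These saturate rather than wrap: e.g. with 8-bit unorm values,
       * PADDUSB clamps 200 + 100 to 255 instead of wrapping to 44.
       */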

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(bld->builder, a, b, "");
      else
         res = LLVMBuildAdd(bld->builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a */
LLVMValueRef
lp_build_sum_vector(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   index = LLVMConstInt(LLVMInt32Type(), 0, 0);
   res = LLVMBuildExtractElement(bld->builder, a, index, "");

   for (i = 1; i < type.length; i++) {
      index = LLVMConstInt(LLVMInt32Type(), i, 0);
      if (type.floating)
         res = LLVMBuildFAdd(bld->builder, res,
                             LLVMBuildExtractElement(bld->builder,
                                                     a, index, ""),
                             "");
      else
         res = LLVMBuildAdd(bld->builder, res,
                            LLVMBuildExtractElement(bld->builder,
                                                    a, index, ""),
                            "");
   }

   return res;
}


/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
         return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(bld->builder, a, b, "");
      else
         res = LLVMBuildSub(bld->builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria:
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that by itself this doesn't satisfy the OpenGL criteria, as it
 *     yields 255*255 = 254; so either the special case b = 255 must be
 *     handled separately, or rounding must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the
 *     result, use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     which achieves exact results
 *
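 *     For example, with a = b = 255: t = 65025, t >> 8 = 254, and
 *     (65025 + 254 + 0x80) >> 8 = 65407 >> 8 = 255, the exact result.
 *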
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(LLVMBuilderRef builder,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMValueRef c8;
   LLVMValueRef ab;

   assert(!i16_type.floating);
   assert(lp_check_value(i16_type, a));
   assert(lp_check_value(i16_type, b));

   c8 = lp_build_const_int_vec(i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");

#endif

   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}
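

/*
 * Illustrative scalar sketch of the rounding variant above, handy for
 * checking the vector code against. This is a hypothetical helper, kept
 * disabled; it is not part of this file's API.
 */
#if 0
static unsigned
mul_u8_norm_ref(unsigned a, unsigned b)
{
   unsigned t = a * b;          /* exact product, fits in 16 bits */
   t = t + (t >> 8) + 0x80;     /* geometric series term plus rounding bias */
   return t >> 8;               /* matches (a*b + 127) / 255 for 8-bit inputs */
}
#endif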


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);

         ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(bld->builder, a, b, "");
      else
         res = LLVMBuildMul(bld->builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(bld->builder, res, shift, "");
         else
            res = LLVMBuildLShr(bld->builder, res, shift, "");
      }
   }

   return res;
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;
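
      /* e.g. b = 8: ffs(8) - 1 = 3, so integer types take the a << 3 path */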

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the mantissa.
          *
          * XXX: This might not always be faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(bld->builder, a, factor, "");
         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->type, shift);
         return LLVMBuildShl(bld->builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(bld->builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(bld->builder, a, b, "");
   else
      return LLVMBuildUDiv(bld->builder, a, b, "");
}


/**
 * Linear interpolation -- without any checks.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8bit colors stored on
       * 16 bits, but it will be wrong for other uses. Basically we need a
       * more powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage.
       */
      res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed  = TRUE;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->builder, wide_type);

      lp_build_unpack2(bld->builder, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256]
       */

      shift = lp_build_const_int_vec(wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(bld->builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(bld->builder, xh, shift, ""));
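
      /* e.g. for 8-bit x: 0 stays 0, while 255 + (255 >> 7) = 256 */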

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      res = lp_build_pack2(bld->builder, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}


LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->zero || b == bld->zero)
         return bld->zero;
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if(a == bld->zero)
         return b;
      if(b == bld->zero)
         return a;
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      a = LLVMBuildAnd(bld->builder, a, mask, "");
      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(bld->builder, a, "");
   else
#endif
      a = LLVMBuildNeg(bld->builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(type);
      vec_type = lp_build_vec_type(type);
      mask = lp_build_const_int_vec(type, maskBit);

      /* Take the sign bit and add it to 1 constant */
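      /* e.g. a = -3.5f: sign bits 0x80000000 OR'ed into 1.0f (0x3f800000)
       * give 0xbf800000, i.e. -1.0f
       */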
      sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(bld->builder, sign, one, "");
      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
   }
   else
   {
      LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(type,
                          ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(bld->builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(bld->builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(bld->builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(bld->builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);

   assert(type.floating);

   return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
}


enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32Type();
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(bld->builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(bld->builder, res, index0, "");
   }
   else {
      assert(type.width*type.length == 128);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ps";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.pd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      res = lp_build_intrinsic_binary(bld->builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


/**
 * Return the integer part of a float (vector) value. The returned value is
 * a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a)
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Return the integer part of a float (vector) value. The returned value is
 * an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(bld->builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
         half = LLVMBuildOr(bld->builder, sign, half, "");
         half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
      }

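      /* e.g. iround(-1.5): half becomes -0.5, so a + half = -2.0 and the
       * truncating conversion below yields -2
       */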
      res = LLVMBuildFAdd(bld->builder, a, half, "");
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      res = a;

      if (type.sign) {
         /* Take the sign bit and add it to 1 constant */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;
         LLVMValueRef offset;

         /* sign = a < 0 ? ~0 : 0 */
         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
         sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");

         /* offset = -0.99999(9)f */
         offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
         offset = LLVMConstBitCast(offset, int_vec_type);

         /* offset = a < 0 ? offset : 0.0f */
         offset = LLVMBuildAnd(bld->builder, offset, sign, "");
         offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");

         res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res");
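
         /* e.g. ifloor(-1.1): -1.1 + (-0.99999...) = -2.09999..., which the
          * truncation toward zero below turns into -2
          */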
      }
   }

   /* truncate (round toward zero) */
   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");

   return res;
}


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = bld->vec_type;
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef offset;

      /* offset = 0.99999(9)f */
      offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));

      if (type.sign) {
         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* sign = a < 0 ? 0 : ~0 */
         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
         sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
         sign = LLVMBuildNot(bld->builder, sign, "iceil.not");

         /* offset = a < 0 ? 0.0 : offset */
         offset = LLVMConstBitCast(offset, int_vec_type);
         offset = LLVMBuildAnd(bld->builder, offset, sign, "");
         offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
      }

      res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
   }

   /* truncate (round toward zero) */
   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(bld->builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(bld->builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart");
   }
}


LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
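 * For example, with a = 3 and x_0 = 0.3: x_1 = 0.3*(2 - 0.9) = 0.33 and
 * x_2 = 0.33*(2 - 0.99) = 0.3333, converging quadratically towards 1/3.
 *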
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
   res = LLVMBuildFSub(bld->builder, two, res, "");
   res = LLVMBuildFMul(bld->builder, rcp_a, res, "");

   return res;
}


LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */

   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(bld->builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also:
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(bld->builder, a, res, "");
   res = LLVMBuildFSub(bld->builder, three, res, "");
   res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(bld->builder, half, res, "");

   return res;
}


/**
 * Generate 1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rsqrt_refine(bld, a, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


static inline LLVMValueRef
lp_build_const_v4si(unsigned long value)
{
   LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
   LLVMValueRef elements[4] = { element, element, element, element };
   return LLVMConstVector(elements, 4);
}

static inline LLVMValueRef
lp_build_const_v4sf(float value)
{
   LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
   LLVMValueRef elements[4] = { element, element, element, element };
   return LLVMConstVector(elements, 4);
}


/**
 * Generate sin(a) using SSE2
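 *
 * Follows the classic Cephes-style scheme spelled out in the comments below:
 * reduce the argument by 4/Pi to find the octant, evaluate minimax
 * polynomials for sine and cosine on [0, Pi/4], select between them with the
 * polynomial-selection mask, and finally restore the sign.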
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = bld->builder;
   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_v4si(1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_v4si(~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_v4si(29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_v4si(0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_v4sf(0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
   LLVMValueRef one = lp_build_const_v4sf(1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1,  8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_v4si(~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
   return y_result;
}


/**
 * Generate cos(a) using SSE2
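 *
 * Same Cephes-style range reduction as lp_build_sin() above; only the octant
 * handling differs (the "emm2 - 2" adjustment and the and-not used when
 * deriving the sign mask).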
1800 */
1801 LLVMValueRef
1802 lp_build_cos(struct lp_build_context *bld,
1803 LLVMValueRef a)
1804 {
1805 struct lp_type int_type = lp_int_type(bld->type);
1806 LLVMBuilderRef b = bld->builder;
1807 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1808 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1809
1810 /*
1811 * take the absolute value,
1812 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1813 */
1814
1815 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1816 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1817
1818 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1819 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1820
1821 /*
1822 * scale by 4/Pi
1823 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1824 */
1825
1826 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1827 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1828
1829 /*
1830 * store the integer part of y in mm0
1831 * emm2 = _mm_cvttps_epi32(y);
1832 */
1833
1834 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1835
1836 /*
1837 * j=(j+1) & (~1) (see the cephes sources)
1838 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1839 */
1840
1841 LLVMValueRef all_one = lp_build_const_v4si(1);
1842 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1843 /*
1844 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1845 */
1846 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1847 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1848
1849 /*
1850 * y = _mm_cvtepi32_ps(emm2);
1851 */
1852 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1853
1854
1855 /*
1856 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1857 */
1858 LLVMValueRef const_2 = lp_build_const_v4si(2);
1859 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1860
1861
1862 /* get the swap sign flag
1863 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1864 */
1865 LLVMValueRef inv = lp_build_const_v4si(~0);
1866 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1867 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1868 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1869
1870 /*
1871 * emm2 = _mm_slli_epi32(emm0, 29);
1872 */
1873 LLVMValueRef const_29 = lp_build_const_v4si(29);
1874 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1875
1876 /*
1877 * get the polynom selection mask
1878 * there is one polynom for 0 <= x <= Pi/4
1879 * and another one for Pi/4<x<=Pi/2
1880 * Both branches will be computed.
1881 *
1882 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1883 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1884 */
1885
1886 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1887 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1888 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1889 emm2_3, lp_build_const_v4si(0));
1890
1891 /*
1892 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1893 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1894 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1895 */
1896 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1897 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1898 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1899
1900 /*
1901 * The magic pass: "Extended precision modular arithmetic"
1902 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1903 * xmm1 = _mm_mul_ps(y, xmm1);
1904 * xmm2 = _mm_mul_ps(y, xmm2);
1905 * xmm3 = _mm_mul_ps(y, xmm3);
1906 */
1907 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1908 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1909 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1910
1911 /*
1912 * x = _mm_add_ps(x, xmm1);
1913 * x = _mm_add_ps(x, xmm2);
1914 * x = _mm_add_ps(x, xmm3);
1915 */
1916
1917 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1918 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1919 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1920
1921 /*
1922 * Evaluate the first polynom (0 <= x <= Pi/4)
1923 *
1924 * z = _mm_mul_ps(x,x);
1925 */
1926 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1927
1928 /*
1929 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1930 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1931 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1932 */
1933 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1934 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1935 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1936
1937 /*
1938 * y = *(v4sf*)_ps_coscof_p0;
1939 * y = _mm_mul_ps(y, z);
1940 */
1941 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1942 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1943 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1944 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1945 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1946 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1947
1948
1949 /*
1950 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1951 * y = _mm_sub_ps(y, tmp);
1952 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1953 */
1954 LLVMValueRef half = lp_build_const_v4sf(0.5);
1955 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1956 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
1957 LLVMValueRef one = lp_build_const_v4sf(1.0);
1958 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
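/*
 * Collapsed, the sequence above is Horner's evaluation of the cosine
 * series in z = x*x (scalar sketch):
 *
 *    y = ((coscof_p0 * z + coscof_p1) * z + coscof_p2) * z * z
 *          - 0.5f * z + 1.0f;
 *
 * which approximates cos(x) for |x| <= Pi/4.
 */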
1959
1960 /*
1961 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1962 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1963 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1964 */
1965 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1966 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1967 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1968
1969 /*
1970 * Evaluate the second polynomial (0 <= x <= Pi/4)
1971 *
1972 * y2 = *(v4sf*)_ps_sincof_p0;
1973 * y2 = _mm_mul_ps(y2, z);
1974 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1975 * y2 = _mm_mul_ps(y2, z);
1976 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1977 * y2 = _mm_mul_ps(y2, z);
1978 * y2 = _mm_mul_ps(y2, x);
1979 * y2 = _mm_add_ps(y2, x);
1980 */
1981
1982 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1983 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1984 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1985 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1986 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1987 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1988 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
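/*
 * Collapsed, that is Horner's evaluation of the sine series in
 * z = x*x (scalar sketch):
 *
 *    y2 = (((sincof_p0 * z + sincof_p1) * z + sincof_p2) * z) * x + x;
 *
 * which approximates sin(x) for |x| <= Pi/4.
 */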
1989
1990 /*
1991 * select the correct result from the two polynomials
1992 * xmm3 = poly_mask;
1993 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1994 * y = _mm_andnot_ps(xmm3, y);
1995 * y = _mm_add_ps(y,y2);
1996 */
1997 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1998 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1999 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2000 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2001 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2002 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
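/*
 * In each lane exactly one of the two masked values is non-zero, so
 * the integer add behaves like an OR and picks either the sine-series
 * or the cosine-series result.  Per 32-bit lane (sketch):
 *
 *    result = (poly_mask & y2) | (~poly_mask & y);
 */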
2003
2004 /*
2005 * update the sign
2006 * y = _mm_xor_ps(y, sign_bit);
2007 */
2008 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
2009 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
2010 return y_result;
2011 }
2012
2013
2014 /**
2015 * Generate pow(x, y)
2016 */
2017 LLVMValueRef
2018 lp_build_pow(struct lp_build_context *bld,
2019 LLVMValueRef x,
2020 LLVMValueRef y)
2021 {
2022 /* TODO: optimize the constant case */
2023 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2024 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2025 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2026 __FUNCTION__);
2027 }
2028
2029 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2030 }
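/*
 * This relies on the identity pow(x, y) = 2^(y * log2(x)), which only
 * holds for x > 0; as with the shader POW opcode, non-positive x gives
 * undefined results.  A scalar check (sketch, not part of the build):
 *
 *    powf(2.0f, 10.0f) == exp2f(10.0f * log2f(2.0f)) == 1024.0f
 */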
2031
2032
2033 /**
2034 * Generate exp(x)
2035 */
2036 LLVMValueRef
2037 lp_build_exp(struct lp_build_context *bld,
2038 LLVMValueRef x)
2039 {
2040 /* log2(e) = 1/log(2) */
2041 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
2042
2043 assert(lp_check_value(bld->type, x));
2044
2045 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2046 }
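/*
 * A scalar check of the identity used above (sketch, not part of the
 * build): exp(x) == 2^(x * log2(e)), e.g.
 *
 *    expf(1.0f) ~= exp2f(1.0f * 1.4426950408889634f) ~= 2.7182817f
 */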
2047
2048
2049 /**
2050 * Generate log(x)
2051 */
2052 LLVMValueRef
2053 lp_build_log(struct lp_build_context *bld,
2054 LLVMValueRef x)
2055 {
2056 /* log(2) */
2057 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
2058
2059 assert(lp_check_value(bld->type, x));
2060
2061 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2062 }
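/*
 * A scalar check of the identity used above (sketch, not part of the
 * build): log(x) == log(2) * log2(x), e.g.
 *
 *    logf(8.0f) ~= 0.69314718f * log2f(8.0f) ~= 2.0794415f
 */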
2063
2064
2065 /**
2066 * Generate polynomial.
2067 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2068 */
2069 static LLVMValueRef
2070 lp_build_polynomial(struct lp_build_context *bld,
2071 LLVMValueRef x,
2072 const double *coeffs,
2073 unsigned num_coeffs)
2074 {
2075 const struct lp_type type = bld->type;
2076 LLVMValueRef res = NULL;
2077 unsigned i;
2078
2079 assert(lp_check_value(bld->type, x));
2080
2081 /* TODO: optimize the constant case */
2082 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2083 LLVMIsConstant(x)) {
2084 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2085 __FUNCTION__);
2086 }
2087
2088 for (i = num_coeffs; i--; ) {
2089 LLVMValueRef coeff;
2090
2091 coeff = lp_build_const_vec(type, coeffs[i]);
2092
2093 if(res)
2094 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
2095 else
2096 res = coeff;
2097 }
2098
2099 if(res)
2100 return res;
2101 else
2102 return bld->undef;
2103 }
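/*
 * The loop above is Horner's scheme: walking from the highest
 * coefficient down folds coeffs[0] + x*coeffs[1] + x^2*coeffs[2] + ...
 * into nested multiply-adds, one mul and one add per coefficient.
 * A scalar sketch of the same evaluation:
 *
 *    double res = coeffs[num_coeffs - 1];
 *    for (int i = num_coeffs - 2; i >= 0; i--)
 *       res = coeffs[i] + x * res;
 */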
2104
2105
2106 /**
2107 * Minimax polynomial fit of 2**x, in range [0, 1[
2108 */
2109 const double lp_build_exp2_polynomial[] = {
2110 #if EXP_POLY_DEGREE == 5
2111 0.999999999690134838155,
2112 0.583974334321735217258,
2113 0.164553105719676828492,
2114 0.0292811063701710962255,
2115 0.00354944426657875141846,
2116 0.000296253726543423377365
2117 #elif EXP_POLY_DEGREE == 4
2118 1.00000001502262084505,
2119 0.563586057338685991394,
2120 0.150436017652442413623,
2121 0.0243220604213317927308,
2122 0.0025359088446580436489
2123 #elif EXP_POLY_DEGREE == 3
2124 0.999925218562710312959,
2125 0.695833540494823811697,
2126 0.226067155427249155588,
2127 0.0780245226406372992967
2128 #elif EXP_POLY_DEGREE == 2
2129 1.00172476321474503578,
2130 0.657636275736077639316,
2131 0.33718943461968720704
2132 #else
2133 #error
2134 #endif
2135 };
2136
2137
2138 void
2139 lp_build_exp2_approx(struct lp_build_context *bld,
2140 LLVMValueRef x,
2141 LLVMValueRef *p_exp2_int_part,
2142 LLVMValueRef *p_frac_part,
2143 LLVMValueRef *p_exp2)
2144 {
2145 const struct lp_type type = bld->type;
2146 LLVMTypeRef vec_type = lp_build_vec_type(type);
2147 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2148 LLVMValueRef ipart = NULL;
2149 LLVMValueRef fpart = NULL;
2150 LLVMValueRef expipart = NULL;
2151 LLVMValueRef expfpart = NULL;
2152 LLVMValueRef res = NULL;
2153
2154 assert(lp_check_value(bld->type, x));
2155
2156 if(p_exp2_int_part || p_frac_part || p_exp2) {
2157 /* TODO: optimize the constant case */
2158 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2159 LLVMIsConstant(x)) {
2160 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2161 __FUNCTION__);
2162 }
2163
2164 assert(type.floating && type.width == 32);
2165
2166 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
2167 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
2168
2169 /* ipart = floor(x) */
2170 ipart = lp_build_floor(bld, x);
2171
2172 /* fpart = x - ipart */
2173 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
2174 }
2175
2176 if(p_exp2_int_part || p_exp2) {
2177 /* expipart = (float) (1 << ipart) */
2178 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
2179 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
2180 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
2181 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
2182 }
2183
2184 if(p_exp2) {
2185 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2186 Elements(lp_build_exp2_polynomial));
2187
2188 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2189 }
2190
2191 if(p_exp2_int_part)
2192 *p_exp2_int_part = expipart;
2193
2194 if(p_frac_part)
2195 *p_frac_part = fpart;
2196
2197 if(p_exp2)
2198 *p_exp2 = res;
2199 }
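/*
 * The integer part is turned into a power of two without a table:
 * biasing ipart by 127 and shifting it into the exponent field (bit
 * 23) constructs the IEEE-754 single-precision value 2^ipart
 * directly; the clamping above keeps the biased exponent in range.
 * A scalar sketch (the union is illustrative only, not part of this
 * file):
 *
 *    union { float f; int32_t i; } v;
 *    v.i = (ipart + 127) << 23;
 *
 * after which v.f == 2^ipart, and the result is 2^ipart *
 * poly(fpart) with fpart in [0, 1[.
 */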
2200
2201
2202 LLVMValueRef
2203 lp_build_exp2(struct lp_build_context *bld,
2204 LLVMValueRef x)
2205 {
2206 LLVMValueRef res;
2207 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2208 return res;
2209 }
2210
2211
2212 /**
2213 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2214 * These coefficients can be generated with
2215 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2216 */
2217 const double lp_build_log2_polynomial[] = {
2218 #if LOG_POLY_DEGREE == 6
2219 3.11578814719469302614,
2220 -3.32419399085241980044,
2221 2.59883907202499966007,
2222 -1.23152682416275988241,
2223 0.318212422185251071475,
2224 -0.0344359067839062357313
2225 #elif LOG_POLY_DEGREE == 5
2226 2.8882704548164776201,
2227 -2.52074962577807006663,
2228 1.48116647521213171641,
2229 -0.465725644288844778798,
2230 0.0596515482674574969533
2231 #elif LOG_POLY_DEGREE == 4
2232 2.61761038894603480148,
2233 -1.75647175389045657003,
2234 0.688243882994381274313,
2235 -0.107254423828329604454
2236 #elif LOG_POLY_DEGREE == 3
2237 2.28330284476918490682,
2238 -1.04913055217340124191,
2239 0.204446009836232697516
2240 #else
2241 #error
2242 #endif
2243 };
2244
2245
2246 /**
2247 * See http://www.devmaster.net/forums/showthread.php?p=43580
2248 */
2249 void
2250 lp_build_log2_approx(struct lp_build_context *bld,
2251 LLVMValueRef x,
2252 LLVMValueRef *p_exp,
2253 LLVMValueRef *p_floor_log2,
2254 LLVMValueRef *p_log2)
2255 {
2256 const struct lp_type type = bld->type;
2257 LLVMTypeRef vec_type = lp_build_vec_type(type);
2258 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2259
2260 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2261 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2262 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2263
2264 LLVMValueRef i = NULL;
2265 LLVMValueRef exp = NULL;
2266 LLVMValueRef mant = NULL;
2267 LLVMValueRef logexp = NULL;
2268 LLVMValueRef logmant = NULL;
2269 LLVMValueRef res = NULL;
2270
2271 assert(lp_check_value(bld->type, x));
2272
2273 if(p_exp || p_floor_log2 || p_log2) {
2274 /* TODO: optimize the constant case */
2275 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2276 LLVMIsConstant(x)) {
2277 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2278 __FUNCTION__);
2279 }
2280
2281 assert(type.floating && type.width == 32);
2282
2283 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2284
2285 /* exp = (float) exponent(x) */
2286 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2287 }
2288
2289 if(p_floor_log2 || p_log2) {
2290 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2291 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2292 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2293 }
2294
2295 if(p_log2) {
2296 /* mant = (float) mantissa(x) */
2297 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2298 mant = LLVMBuildOr(bld->builder, mant, one, "");
2299 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2300
2301 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2302 Elements(lp_build_log2_polynomial));
2303
2304 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2305 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2306
2307 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2308 }
2309
2310 if(p_exp) {
2311 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2312 *p_exp = exp;
2313 }
2314
2315 if(p_floor_log2)
2316 *p_floor_log2 = logexp;
2317
2318 if(p_log2)
2319 *p_log2 = res;
2320 }
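/*
 * In short: x is decomposed as 2^exp * mant with mant in [1, 2[, so
 * log2(x) = exp + log2(mant).  The table approximates
 * log2(mant)/(mant - 1), hence the final multiply by (mant - 1).
 * A scalar sketch (32-bit floats assumed; poly() stands in for the
 * lp_build_polynomial evaluation above and the union is illustrative
 * only):
 *
 *    union { float f; uint32_t i; } v = { x };
 *    int   exp = (int)((v.i & 0x7f800000) >> 23) - 127;
 *    v.i = (v.i & 0x007fffff) | 0x3f800000;
 *    float res = (float)exp + poly(v.f) * (v.f - 1.0f);
 */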
2321
2322
2323 LLVMValueRef
2324 lp_build_log2(struct lp_build_context *bld,
2325 LLVMValueRef x)
2326 {
2327 LLVMValueRef res;
2328 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2329 return res;
2330 }
2331
2332
2333 /**
2334 * Faster (and less accurate) log2.
2335 *
2336 * log2(x) ~= floor(log2(x)) + frac(x), approximating log2(1 + frac(x)) by frac(x)
2337 *
2338 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2339 */
2340 LLVMValueRef
2341 lp_build_fast_log2(struct lp_build_context *bld,
2342 LLVMValueRef x)
2343 {
2344 const struct lp_type type = bld->type;
2345 LLVMTypeRef vec_type = bld->vec_type;
2346 LLVMTypeRef int_vec_type = bld->int_vec_type;
2347
2348 unsigned mantissa = lp_mantissa(type);
2349 LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1);
2350 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2351
2352 LLVMValueRef ipart;
2353 LLVMValueRef fpart;
2354
2355 assert(lp_check_value(bld->type, x));
2356
2357 assert(type.floating);
2358
2359 x = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2360
2361 /* ipart = floor(log2(x)) - 1 */
2362 ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
2363 ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), "");
2364 ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 128), "");
2365 ipart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
2366
2367 /* fpart = 1.0 + frac(x) */
2368 fpart = LLVMBuildAnd(bld->builder, x, mantmask, "");
2369 fpart = LLVMBuildOr(bld->builder, fpart, one, "");
2370 fpart = LLVMBuildBitCast(bld->builder, fpart, vec_type, "");
2371
2372 /* floor(log2(x)) + frac(x) */
2373 return LLVMBuildFAdd(bld->builder, ipart, fpart, "");
2374 }
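/*
 * A scalar sketch of the bit trick above (assumes 32-bit floats; the
 * union is illustrative only, not part of this file).  The exponent
 * field yields floor(log2(x)) - 1, and forcing the exponent to 0
 * reinterprets the mantissa as 1.0 + frac(x):
 *
 *    union { float f; uint32_t i; } v = { x };
 *    float ipart = (float)(int)(((v.i >> 23) & 255) - 128);
 *    v.i = (v.i & 0x007fffff) | 0x3f800000;
 *    return ipart + v.f;
 */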
2375
2376
2377 /**
2378 * Fast implementation of iround(log2(x)).
2379 *
2380 * Not an approximation -- it should give accurate results all the time.
2381 */
2382 LLVMValueRef
2383 lp_build_ilog2(struct lp_build_context *bld,
2384 LLVMValueRef x)
2385 {
2386 const struct lp_type type = bld->type;
2387 LLVMTypeRef int_vec_type = bld->int_vec_type;
2388
2389 unsigned mantissa = lp_mantissa(type);
2390 LLVMValueRef sqrt2 = lp_build_const_vec(type, 1.4142135623730951);
2391
2392 LLVMValueRef ipart;
2393
2394 assert(lp_check_value(bld->type, x));
2395
2396 assert(type.floating);
2397
2398 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
2399 x = LLVMBuildFMul(bld->builder, x, sqrt2, "");
2400
2401 x = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2402
2403 /* ipart = floor(log2(x) + 0.5) */
2404 ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
2405 ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), "");
2406 ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
2407
2408 return ipart;
2409 }
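/*
 * The sqrt(2) multiply converts the floor in the exponent extraction
 * into rounding: floor(log2(x * 2^0.5)) == floor(log2(x) + 0.5) ==
 * iround(log2(x)).  For example x = 5.0 has log2(x) ~= 2.32, and
 * floor(2.32 + 0.5) == 2, while x = 6.0 has log2(x) ~= 2.58 and
 * floor(2.58 + 0.5) == 3.
 */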