util: remove util_is_pot in favor of util_is_power_of_two
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
 31  * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
 35  * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
 38  * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
 42  *   of knowing, such as when the source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 #define EXP_POLY_DEGREE 3
63
64 #define LOG_POLY_DEGREE 5
65
66
67 /**
68 * Generate min(a, b)
 69  * No checks for special-case values of a or b (such as 0 or 1) are done.
70 */
71 static LLVMValueRef
72 lp_build_min_simple(struct lp_build_context *bld,
73 LLVMValueRef a,
74 LLVMValueRef b)
75 {
76 const struct lp_type type = bld->type;
77 const char *intrinsic = NULL;
78 LLVMValueRef cond;
79
80 assert(lp_check_value(type, a));
81 assert(lp_check_value(type, b));
82
83 /* TODO: optimize the constant case */
84
85 if(type.width * type.length == 128) {
86 if(type.floating) {
87 if(type.width == 32 && util_cpu_caps.has_sse)
88 intrinsic = "llvm.x86.sse.min.ps";
89 if(type.width == 64 && util_cpu_caps.has_sse2)
90 intrinsic = "llvm.x86.sse2.min.pd";
91 }
92 else {
93 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
94 intrinsic = "llvm.x86.sse2.pminu.b";
95 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminsb";
97 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
98 intrinsic = "llvm.x86.sse41.pminuw";
99 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
100 intrinsic = "llvm.x86.sse2.pmins.w";
101 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
102 intrinsic = "llvm.x86.sse41.pminud";
103 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
104 intrinsic = "llvm.x86.sse41.pminsd";
105 }
106 }
107
108 if(intrinsic)
109 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
110
111 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
112 return lp_build_select(bld, cond, a, b);
113 }
114
115
116 /**
117 * Generate max(a, b)
118  * No checks for special-case values of a or b (such as 0 or 1) are done.
119 */
120 static LLVMValueRef
121 lp_build_max_simple(struct lp_build_context *bld,
122 LLVMValueRef a,
123 LLVMValueRef b)
124 {
125 const struct lp_type type = bld->type;
126 const char *intrinsic = NULL;
127 LLVMValueRef cond;
128
129 assert(lp_check_value(type, a));
130 assert(lp_check_value(type, b));
131
132 /* TODO: optimize the constant case */
133
134 if(type.width * type.length == 128) {
135 if(type.floating) {
136 if(type.width == 32 && util_cpu_caps.has_sse)
137 intrinsic = "llvm.x86.sse.max.ps";
138 if(type.width == 64 && util_cpu_caps.has_sse2)
139 intrinsic = "llvm.x86.sse2.max.pd";
140 }
141 else {
142 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
143 intrinsic = "llvm.x86.sse2.pmaxu.b";
144 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
145 intrinsic = "llvm.x86.sse41.pmaxsb";
146 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
147 intrinsic = "llvm.x86.sse41.pmaxuw";
148 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
149 intrinsic = "llvm.x86.sse2.pmaxs.w";
150 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
151 intrinsic = "llvm.x86.sse41.pmaxud";
152 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
153 intrinsic = "llvm.x86.sse41.pmaxsd";
154 }
155 }
156
157 if(intrinsic)
158 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
159
160 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
161 return lp_build_select(bld, cond, a, b);
162 }
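
/*
 * Illustrative sketch (not part of the build; the helper names are
 * hypothetical): when no SSE intrinsic applies, the fallback above reduces
 * min/max to a compare followed by a select, i.e. the scalar equivalents
 * below.
 */
#if 0
static float
min_scalar_model(float a, float b)
{
   return a < b ? a : b;    /* cond = (a < b); select(cond, a, b) */
}

static float
max_scalar_model(float a, float b)
{
   return a > b ? a : b;    /* cond = (a > b); select(cond, a, b) */
}
#endif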
163
164
165 /**
166 * Generate 1 - a, or ~a depending on bld->type.
167 */
168 LLVMValueRef
169 lp_build_comp(struct lp_build_context *bld,
170 LLVMValueRef a)
171 {
172 const struct lp_type type = bld->type;
173
174 assert(lp_check_value(type, a));
175
176 if(a == bld->one)
177 return bld->zero;
178 if(a == bld->zero)
179 return bld->one;
180
181 if(type.norm && !type.floating && !type.fixed && !type.sign) {
182 if(LLVMIsConstant(a))
183 return LLVMConstNot(a);
184 else
185 return LLVMBuildNot(bld->builder, a, "");
186 }
187
188 if(LLVMIsConstant(a))
189 if (type.floating)
190 return LLVMConstFSub(bld->one, a);
191 else
192 return LLVMConstSub(bld->one, a);
193 else
194 if (type.floating)
195 return LLVMBuildFSub(bld->builder, bld->one, a, "");
196 else
197 return LLVMBuildSub(bld->builder, bld->one, a, "");
198 }
199
200
201 /**
202 * Generate a + b
203 */
204 LLVMValueRef
205 lp_build_add(struct lp_build_context *bld,
206 LLVMValueRef a,
207 LLVMValueRef b)
208 {
209 const struct lp_type type = bld->type;
210 LLVMValueRef res;
211
212 assert(lp_check_value(type, a));
213 assert(lp_check_value(type, b));
214
215 if(a == bld->zero)
216 return b;
217 if(b == bld->zero)
218 return a;
219 if(a == bld->undef || b == bld->undef)
220 return bld->undef;
221
222 if(bld->type.norm) {
223 const char *intrinsic = NULL;
224
225 if(a == bld->one || b == bld->one)
226 return bld->one;
227
228 if(util_cpu_caps.has_sse2 &&
229 type.width * type.length == 128 &&
230 !type.floating && !type.fixed) {
231 if(type.width == 8)
232 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
233 if(type.width == 16)
234 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
235 }
236
237 if(intrinsic)
238 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
239 }
240
241 if(LLVMIsConstant(a) && LLVMIsConstant(b))
242 if (type.floating)
243 res = LLVMConstFAdd(a, b);
244 else
245 res = LLVMConstAdd(a, b);
246 else
247 if (type.floating)
248 res = LLVMBuildFAdd(bld->builder, a, b, "");
249 else
250 res = LLVMBuildAdd(bld->builder, a, b, "");
251
252 /* clamp to ceiling of 1.0 */
253 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
254 res = lp_build_min_simple(bld, res, bld->one);
255
256 /* XXX clamp to floor of -1 or 0??? */
257
258 return res;
259 }
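
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical): for normalized unsigned 8-bit values the PADDUSB path
 * above behaves like this scalar saturating add.
 */
#if 0
static unsigned char
add_u8_sat_scalar_model(unsigned char a, unsigned char b)
{
   unsigned sum = (unsigned)a + (unsigned)b;
   return (unsigned char)(sum > 255 ? 255 : sum);   /* clamp to 1.0 (= 255) */
}
#endif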
260
261
262 /** Return the sum of the elements of a */
263 LLVMValueRef
264 lp_build_sum_vector(struct lp_build_context *bld,
265 LLVMValueRef a)
266 {
267 const struct lp_type type = bld->type;
268 LLVMValueRef index, res;
269 unsigned i;
270
271 assert(lp_check_value(type, a));
272
273 if (a == bld->zero)
274 return bld->zero;
275 if (a == bld->undef)
276 return bld->undef;
277 assert(type.length > 1);
278
279 assert(!bld->type.norm);
280
281 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
282 res = LLVMBuildExtractElement(bld->builder, a, index, "");
283
284 for (i = 1; i < type.length; i++) {
285 index = LLVMConstInt(LLVMInt32Type(), i, 0);
286 if (type.floating)
287 res = LLVMBuildFAdd(bld->builder, res,
288 LLVMBuildExtractElement(bld->builder,
289 a, index, ""),
290 "");
291 else
292 res = LLVMBuildAdd(bld->builder, res,
293 LLVMBuildExtractElement(bld->builder,
294 a, index, ""),
295 "");
296 }
297
298 return res;
299 }
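
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the horizontal reduction above: elements are extracted
 * one by one and accumulated left to right.
 */
#if 0
static float
sum_vector_scalar_model(const float *a, unsigned length)
{
   float res = a[0];
   unsigned i;
   for (i = 1; i < length; i++)
      res += a[i];
   return res;
}
#endif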
300
301
302 /**
303 * Generate a - b
304 */
305 LLVMValueRef
306 lp_build_sub(struct lp_build_context *bld,
307 LLVMValueRef a,
308 LLVMValueRef b)
309 {
310 const struct lp_type type = bld->type;
311 LLVMValueRef res;
312
313 assert(lp_check_value(type, a));
314 assert(lp_check_value(type, b));
315
316 if(b == bld->zero)
317 return a;
318 if(a == bld->undef || b == bld->undef)
319 return bld->undef;
320 if(a == b)
321 return bld->zero;
322
323 if(bld->type.norm) {
324 const char *intrinsic = NULL;
325
326 if(b == bld->one)
327 return bld->zero;
328
329 if(util_cpu_caps.has_sse2 &&
330 type.width * type.length == 128 &&
331 !type.floating && !type.fixed) {
332 if(type.width == 8)
333 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
334 if(type.width == 16)
335 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
336 }
337
338 if(intrinsic)
339 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
340 }
341
342 if(LLVMIsConstant(a) && LLVMIsConstant(b))
343 if (type.floating)
344 res = LLVMConstFSub(a, b);
345 else
346 res = LLVMConstSub(a, b);
347 else
348 if (type.floating)
349 res = LLVMBuildFSub(bld->builder, a, b, "");
350 else
351 res = LLVMBuildSub(bld->builder, a, b, "");
352
353 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
354 res = lp_build_max_simple(bld, res, bld->zero);
355
356 return res;
357 }
358
359
360 /**
361 * Normalized 8bit multiplication.
362 *
363 * - alpha plus one
364 *
365 * makes the following approximation to the division (Sree)
366 *
367  *     a*b/255 ~= (a*(b + 1)) >> 8
368 *
369 * which is the fastest method that satisfies the following OpenGL criteria
370 *
371 * 0*0 = 0 and 255*255 = 255
372 *
373 * - geometric series
374 *
375 * takes the geometric series approximation to the division
376 *
377 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
378 *
379 * in this case just the first two terms to fit in 16bit arithmetic
380 *
381 * t/255 ~= (t + (t >> 8)) >> 8
382 *
383  *     note that just by itself it doesn't satisfy the OpenGL criteria, as
384  *     255*255 yields 254, so the special case b = 255 must be accounted for,
385  *     or roundoff must be used
386 *
387 * - geometric series plus rounding
388 *
389 * when using a geometric series division instead of truncating the result
390 * use roundoff in the approximation (Jim Blinn)
391 *
392 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
393 *
394  *     achieving exact results
395 *
396 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
397 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
398 * @sa Michael Herf, The "double blend trick", May 2000,
399 * http://www.stereopsis.com/doubleblend.html
400 */
401 static LLVMValueRef
402 lp_build_mul_u8n(LLVMBuilderRef builder,
403 struct lp_type i16_type,
404 LLVMValueRef a, LLVMValueRef b)
405 {
406 LLVMValueRef c8;
407 LLVMValueRef ab;
408
409 assert(!i16_type.floating);
410 assert(lp_check_value(i16_type, a));
411 assert(lp_check_value(i16_type, b));
412
413 c8 = lp_build_const_int_vec(i16_type, 8);
414
415 #if 0
416
417    /* a*b/255 ~= (a*(b + 1)) >> 8 */
418 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
419 ab = LLVMBuildMul(builder, a, b, "");
420
421 #else
422
423 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
424 ab = LLVMBuildMul(builder, a, b, "");
425 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
426 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
427
428 #endif
429
430 ab = LLVMBuildLShr(builder, ab, c8, "");
431
432 return ab;
433 }
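
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical): Blinn's rounded geometric-series division above, in
 * scalar form. It hits the 0*0 = 0 and 255*255 = 255 endpoints exactly.
 */
#if 0
static unsigned char
mul_u8n_scalar_model(unsigned char a, unsigned char b)
{
   unsigned ab = (unsigned)a * (unsigned)b;
   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   return (unsigned char)((ab + (ab >> 8) + 0x80) >> 8);
}
#endif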
434
435
436 /**
437 * Generate a * b
438 */
439 LLVMValueRef
440 lp_build_mul(struct lp_build_context *bld,
441 LLVMValueRef a,
442 LLVMValueRef b)
443 {
444 const struct lp_type type = bld->type;
445 LLVMValueRef shift;
446 LLVMValueRef res;
447
448 assert(lp_check_value(type, a));
449 assert(lp_check_value(type, b));
450
451 if(a == bld->zero)
452 return bld->zero;
453 if(a == bld->one)
454 return b;
455 if(b == bld->zero)
456 return bld->zero;
457 if(b == bld->one)
458 return a;
459 if(a == bld->undef || b == bld->undef)
460 return bld->undef;
461
462 if(!type.floating && !type.fixed && type.norm) {
463 if(type.width == 8) {
464 struct lp_type i16_type = lp_wider_type(type);
465 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
466
467 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
468 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
469
470 /* PMULLW, PSRLW, PADDW */
471 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
472 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
473
474 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
475
476 return ab;
477 }
478
479 /* FIXME */
480 assert(0);
481 }
482
483 if(type.fixed)
484 shift = lp_build_const_int_vec(type, type.width/2);
485 else
486 shift = NULL;
487
488 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
489 if (type.floating)
490 res = LLVMConstFMul(a, b);
491 else
492 res = LLVMConstMul(a, b);
493 if(shift) {
494 if(type.sign)
495 res = LLVMConstAShr(res, shift);
496 else
497 res = LLVMConstLShr(res, shift);
498 }
499 }
500 else {
501 if (type.floating)
502 res = LLVMBuildFMul(bld->builder, a, b, "");
503 else
504 res = LLVMBuildMul(bld->builder, a, b, "");
505 if(shift) {
506 if(type.sign)
507 res = LLVMBuildAShr(bld->builder, res, shift, "");
508 else
509 res = LLVMBuildLShr(bld->builder, res, shift, "");
510 }
511 }
512
513 return res;
514 }
515
516
517 /**
518 * Small vector x scale multiplication optimization.
519 */
520 LLVMValueRef
521 lp_build_mul_imm(struct lp_build_context *bld,
522 LLVMValueRef a,
523 int b)
524 {
525 LLVMValueRef factor;
526
527 assert(lp_check_value(bld->type, a));
528
529 if(b == 0)
530 return bld->zero;
531
532 if(b == 1)
533 return a;
534
535 if(b == -1)
536 return lp_build_negate(bld, a);
537
538 if(b == 2 && bld->type.floating)
539 return lp_build_add(bld, a, a);
540
541 if(util_is_power_of_two(b)) {
542 unsigned shift = ffs(b) - 1;
543
544 if(bld->type.floating) {
545 #if 0
546 /*
547 * Power of two multiplication by directly manipulating the mantissa.
548 *
549 * XXX: This might not be always faster, it will introduce a small error
550 * for multiplication by zero, and it will produce wrong results
551 * for Inf and NaN.
552 */
553 unsigned mantissa = lp_mantissa(bld->type);
554 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
555 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
556 a = LLVMBuildAdd(bld->builder, a, factor, "");
557 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
558 return a;
559 #endif
560 }
561 else {
562 factor = lp_build_const_vec(bld->type, shift);
563 return LLVMBuildShl(bld->builder, a, factor, "");
564 }
565 }
566
567 factor = lp_build_const_vec(bld->type, (double)b);
568 return lp_build_mul(bld, a, factor);
569 }
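
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the strength reduction above for the integer case:
 * multiplying by a power of two is a left shift by log2(b) bits.
 */
#if 0
static int
mul_imm_scalar_model(int a, int b)
{
   unsigned shift = 0;
   /* assumes b is a positive power of two, as guarded above */
   while ((1 << shift) != b)
      shift++;
   return a << shift;
}
#endif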
570
571
572 /**
573 * Generate a / b
574 */
575 LLVMValueRef
576 lp_build_div(struct lp_build_context *bld,
577 LLVMValueRef a,
578 LLVMValueRef b)
579 {
580 const struct lp_type type = bld->type;
581
582 assert(lp_check_value(type, a));
583 assert(lp_check_value(type, b));
584
585 if(a == bld->zero)
586 return bld->zero;
587 if(a == bld->one)
588 return lp_build_rcp(bld, b);
589 if(b == bld->zero)
590 return bld->undef;
591 if(b == bld->one)
592 return a;
593 if(a == bld->undef || b == bld->undef)
594 return bld->undef;
595
596 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
597 if (type.floating)
598 return LLVMConstFDiv(a, b);
599 else if (type.sign)
600 return LLVMConstSDiv(a, b);
601 else
602 return LLVMConstUDiv(a, b);
603 }
604
605 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
606 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
607
608 if (type.floating)
609 return LLVMBuildFDiv(bld->builder, a, b, "");
610 else if (type.sign)
611 return LLVMBuildSDiv(bld->builder, a, b, "");
612 else
613 return LLVMBuildUDiv(bld->builder, a, b, "");
614 }
615
616
617 /**
618 * Linear interpolation.
619 *
620 * This also works for integer values with a few caveats.
621 *
622 * @sa http://www.stereopsis.com/doubleblend.html
623 */
624 LLVMValueRef
625 lp_build_lerp(struct lp_build_context *bld,
626 LLVMValueRef x,
627 LLVMValueRef v0,
628 LLVMValueRef v1)
629 {
630 LLVMValueRef delta;
631 LLVMValueRef res;
632
633 assert(lp_check_value(bld->type, x));
634 assert(lp_check_value(bld->type, v0));
635 assert(lp_check_value(bld->type, v1));
636
637 delta = lp_build_sub(bld, v1, v0);
638
639 res = lp_build_mul(bld, x, delta);
640
641 res = lp_build_add(bld, v0, res);
642
643 if(bld->type.fixed)
644       /* XXX: This step is necessary for lerping 8-bit colors stored in 16 bits,
645 * but it will be wrong for other uses. Basically we need a more
646 * powerful lp_type, capable of further distinguishing the values
647 * interpretation from the value storage. */
648 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
649
650 return res;
651 }
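
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical): the lerp above is computed as v0 + x*(v1 - v0), one
 * multiply instead of the two in (1 - x)*v0 + x*v1.
 */
#if 0
static float
lerp_scalar_model(float x, float v0, float v1)
{
   return v0 + x * (v1 - v0);
}
#endif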
652
653
654 LLVMValueRef
655 lp_build_lerp_2d(struct lp_build_context *bld,
656 LLVMValueRef x,
657 LLVMValueRef y,
658 LLVMValueRef v00,
659 LLVMValueRef v01,
660 LLVMValueRef v10,
661 LLVMValueRef v11)
662 {
663 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
664 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
665 return lp_build_lerp(bld, y, v0, v1);
666 }
667
668
669 /**
670 * Generate min(a, b)
671 * Do checks for special cases.
672 */
673 LLVMValueRef
674 lp_build_min(struct lp_build_context *bld,
675 LLVMValueRef a,
676 LLVMValueRef b)
677 {
678 assert(lp_check_value(bld->type, a));
679 assert(lp_check_value(bld->type, b));
680
681 if(a == bld->undef || b == bld->undef)
682 return bld->undef;
683
684 if(a == b)
685 return a;
686
687 if(bld->type.norm) {
688 if(a == bld->zero || b == bld->zero)
689 return bld->zero;
690 if(a == bld->one)
691 return b;
692 if(b == bld->one)
693 return a;
694 }
695
696 return lp_build_min_simple(bld, a, b);
697 }
698
699
700 /**
701 * Generate max(a, b)
702 * Do checks for special cases.
703 */
704 LLVMValueRef
705 lp_build_max(struct lp_build_context *bld,
706 LLVMValueRef a,
707 LLVMValueRef b)
708 {
709 assert(lp_check_value(bld->type, a));
710 assert(lp_check_value(bld->type, b));
711
712 if(a == bld->undef || b == bld->undef)
713 return bld->undef;
714
715 if(a == b)
716 return a;
717
718 if(bld->type.norm) {
719 if(a == bld->one || b == bld->one)
720 return bld->one;
721 if(a == bld->zero)
722 return b;
723 if(b == bld->zero)
724 return a;
725 }
726
727 return lp_build_max_simple(bld, a, b);
728 }
729
730
731 /**
732 * Generate clamp(a, min, max)
733 * Do checks for special cases.
734 */
735 LLVMValueRef
736 lp_build_clamp(struct lp_build_context *bld,
737 LLVMValueRef a,
738 LLVMValueRef min,
739 LLVMValueRef max)
740 {
741 assert(lp_check_value(bld->type, a));
742 assert(lp_check_value(bld->type, min));
743 assert(lp_check_value(bld->type, max));
744
745 a = lp_build_min(bld, a, max);
746 a = lp_build_max(bld, a, min);
747 return a;
748 }
749
750
751 /**
752 * Generate abs(a)
753 */
754 LLVMValueRef
755 lp_build_abs(struct lp_build_context *bld,
756 LLVMValueRef a)
757 {
758 const struct lp_type type = bld->type;
759 LLVMTypeRef vec_type = lp_build_vec_type(type);
760
761 assert(lp_check_value(type, a));
762
763 if(!type.sign)
764 return a;
765
766 if(type.floating) {
767 /* Mask out the sign bit */
768 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
769 unsigned long long absMask = ~(1ULL << (type.width - 1));
770 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
771 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
772 a = LLVMBuildAnd(bld->builder, a, mask, "");
773 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
774 return a;
775 }
776
777 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
778 switch(type.width) {
779 case 8:
780 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
781 case 16:
782 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
783 case 32:
784 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
785 }
786 }
787
788 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
789 }
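
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the floating-point path above, assuming 32-bit
 * IEEE-754 floats: clearing the top (sign) bit yields the absolute value.
 */
#if 0
static float
abs_scalar_model(float a)
{
   union { float f; unsigned u; } v;
   v.f = a;
   v.u &= ~(1u << 31);   /* mask out the sign bit */
   return v.f;
}
#endif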
790
791
792 LLVMValueRef
793 lp_build_negate(struct lp_build_context *bld,
794 LLVMValueRef a)
795 {
796 assert(lp_check_value(bld->type, a));
797
798 #if HAVE_LLVM >= 0x0207
799 if (bld->type.floating)
800 a = LLVMBuildFNeg(bld->builder, a, "");
801 else
802 #endif
803 a = LLVMBuildNeg(bld->builder, a, "");
804
805 return a;
806 }
807
808
809 /** Return -1, 0 or +1 depending on the sign of a */
810 LLVMValueRef
811 lp_build_sgn(struct lp_build_context *bld,
812 LLVMValueRef a)
813 {
814 const struct lp_type type = bld->type;
815 LLVMValueRef cond;
816 LLVMValueRef res;
817
818 assert(lp_check_value(type, a));
819
820 /* Handle non-zero case */
821 if(!type.sign) {
822 /* if not zero then sign must be positive */
823 res = bld->one;
824 }
825 else if(type.floating) {
826 LLVMTypeRef vec_type;
827 LLVMTypeRef int_type;
828 LLVMValueRef mask;
829 LLVMValueRef sign;
830 LLVMValueRef one;
831 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
832
833 int_type = lp_build_int_vec_type(type);
834 vec_type = lp_build_vec_type(type);
835 mask = lp_build_const_int_vec(type, maskBit);
836
837 /* Take the sign bit and add it to 1 constant */
838 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
839 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
840 one = LLVMConstBitCast(bld->one, int_type);
841 res = LLVMBuildOr(bld->builder, sign, one, "");
842 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
843 }
844 else
845 {
846 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
847 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
848 res = lp_build_select(bld, cond, bld->one, minus_one);
849 }
850
851 /* Handle zero */
852 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
853 res = lp_build_select(bld, cond, bld->zero, res);
854
855 return res;
856 }
857
858
859 /**
860 * Set the sign of float vector 'a' according to 'sign'.
861 * If sign==0, return abs(a).
862  * If sign==1, return -abs(a).
863 * Other values for sign produce undefined results.
864 */
865 LLVMValueRef
866 lp_build_set_sign(struct lp_build_context *bld,
867 LLVMValueRef a, LLVMValueRef sign)
868 {
869 const struct lp_type type = bld->type;
870 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
871 LLVMTypeRef vec_type = lp_build_vec_type(type);
872 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
873 LLVMValueRef mask = lp_build_const_int_vec(type,
874 ~((unsigned long long) 1 << (type.width - 1)));
875 LLVMValueRef val, res;
876
877 assert(type.floating);
878 assert(lp_check_value(type, a));
879
880 /* val = reinterpret_cast<int>(a) */
881 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
882 /* val = val & mask */
883 val = LLVMBuildAnd(bld->builder, val, mask, "");
884 /* sign = sign << shift */
885 sign = LLVMBuildShl(bld->builder, sign, shift, "");
886 /* res = val | sign */
887 res = LLVMBuildOr(bld->builder, val, sign, "");
888 /* res = reinterpret_cast<float>(res) */
889 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
890
891 return res;
892 }
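
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of lp_build_set_sign for 32-bit floats: keep the
 * magnitude bits of 'a' and OR 'sign' into the sign-bit position, so
 * sign==0 gives abs(a) and sign==1 gives -abs(a).
 */
#if 0
static float
set_sign_scalar_model(float a, unsigned sign)
{
   union { float f; unsigned u; } v;
   v.f = a;
   v.u = (v.u & ~(1u << 31)) | (sign << 31);
   return v.f;
}
#endif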
893
894
895 /**
896 * Convert vector of (or scalar) int to vector of (or scalar) float.
897 */
898 LLVMValueRef
899 lp_build_int_to_float(struct lp_build_context *bld,
900 LLVMValueRef a)
901 {
902 const struct lp_type type = bld->type;
903 LLVMTypeRef vec_type = lp_build_vec_type(type);
904
905 assert(type.floating);
906
907 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
908 }
909
910
911
912 enum lp_build_round_sse41_mode
913 {
914 LP_BUILD_ROUND_SSE41_NEAREST = 0,
915 LP_BUILD_ROUND_SSE41_FLOOR = 1,
916 LP_BUILD_ROUND_SSE41_CEIL = 2,
917 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
918 };
919
920
921 static INLINE LLVMValueRef
922 lp_build_round_sse41(struct lp_build_context *bld,
923 LLVMValueRef a,
924 enum lp_build_round_sse41_mode mode)
925 {
926 const struct lp_type type = bld->type;
927 LLVMTypeRef vec_type = lp_build_vec_type(type);
928 const char *intrinsic;
929
930 assert(type.floating);
931 assert(type.width*type.length == 128);
932 assert(lp_check_value(type, a));
933 assert(util_cpu_caps.has_sse4_1);
934
935 switch(type.width) {
936 case 32:
937 intrinsic = "llvm.x86.sse41.round.ps";
938 break;
939 case 64:
940 intrinsic = "llvm.x86.sse41.round.pd";
941 break;
942 default:
943 assert(0);
944 return bld->undef;
945 }
946
947 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
948 LLVMConstInt(LLVMInt32Type(), mode, 0));
949 }
950
951
952 /**
953 * Return the integer part of a float (vector) value. The returned value is
954 * a float (vector).
955  * Ex: trunc(-1.5) = -1.0
956 */
957 LLVMValueRef
958 lp_build_trunc(struct lp_build_context *bld,
959 LLVMValueRef a)
960 {
961 const struct lp_type type = bld->type;
962
963 assert(type.floating);
964 assert(lp_check_value(type, a));
965
966 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
967 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
968 else {
969 LLVMTypeRef vec_type = lp_build_vec_type(type);
970 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
971 LLVMValueRef res;
972 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
973 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
974 return res;
975 }
976 }
977
978
979 /**
980 * Return float (vector) rounded to nearest integer (vector). The returned
981 * value is a float (vector).
982 * Ex: round(0.9) = 1.0
983 * Ex: round(-1.5) = -2.0
984 */
985 LLVMValueRef
986 lp_build_round(struct lp_build_context *bld,
987 LLVMValueRef a)
988 {
989 const struct lp_type type = bld->type;
990
991 assert(type.floating);
992 assert(lp_check_value(type, a));
993
994 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
995 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
996 else {
997 LLVMTypeRef vec_type = lp_build_vec_type(type);
998 LLVMValueRef res;
999 res = lp_build_iround(bld, a);
1000 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1001 return res;
1002 }
1003 }
1004
1005
1006 /**
1007 * Return floor of float (vector), result is a float (vector)
1008 * Ex: floor(1.1) = 1.0
1009 * Ex: floor(-1.1) = -2.0
1010 */
1011 LLVMValueRef
1012 lp_build_floor(struct lp_build_context *bld,
1013 LLVMValueRef a)
1014 {
1015 const struct lp_type type = bld->type;
1016
1017 assert(type.floating);
1018 assert(lp_check_value(type, a));
1019
1020 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1021 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1022 else {
1023 LLVMTypeRef vec_type = lp_build_vec_type(type);
1024 LLVMValueRef res;
1025 res = lp_build_ifloor(bld, a);
1026 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1027 return res;
1028 }
1029 }
1030
1031
1032 /**
1033 * Return ceiling of float (vector), returning float (vector).
1034 * Ex: ceil( 1.1) = 2.0
1035 * Ex: ceil(-1.1) = -1.0
1036 */
1037 LLVMValueRef
1038 lp_build_ceil(struct lp_build_context *bld,
1039 LLVMValueRef a)
1040 {
1041 const struct lp_type type = bld->type;
1042
1043 assert(type.floating);
1044 assert(lp_check_value(type, a));
1045
1046 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1047 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1048 else {
1049 LLVMTypeRef vec_type = lp_build_vec_type(type);
1050 LLVMValueRef res;
1051 res = lp_build_iceil(bld, a);
1052 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1053 return res;
1054 }
1055 }
1056
1057
1058 /**
1059 * Return fractional part of 'a' computed as a - floor(a)
1060 * Typically used in texture coord arithmetic.
1061 */
1062 LLVMValueRef
1063 lp_build_fract(struct lp_build_context *bld,
1064 LLVMValueRef a)
1065 {
1066 assert(bld->type.floating);
1067 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1068 }
1069
1070
1071 /**
1072 * Return the integer part of a float (vector) value. The returned value is
1073 * an integer (vector).
1074  * Ex: itrunc(-1.5) = -1
1075 */
1076 LLVMValueRef
1077 lp_build_itrunc(struct lp_build_context *bld,
1078 LLVMValueRef a)
1079 {
1080 const struct lp_type type = bld->type;
1081 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1082
1083 assert(type.floating);
1084 assert(lp_check_value(type, a));
1085
1086 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1087 }
1088
1089
1090 /**
1091 * Return float (vector) rounded to nearest integer (vector). The returned
1092 * value is an integer (vector).
1093 * Ex: iround(0.9) = 1
1094 * Ex: iround(-1.5) = -2
1095 */
1096 LLVMValueRef
1097 lp_build_iround(struct lp_build_context *bld,
1098 LLVMValueRef a)
1099 {
1100 const struct lp_type type = bld->type;
1101 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1102 LLVMValueRef res;
1103
1104 assert(type.floating);
1105
1106 assert(lp_check_value(type, a));
1107
1108 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1109 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1110 }
1111 else {
1112 LLVMTypeRef vec_type = lp_build_vec_type(type);
1113 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1114 LLVMValueRef sign;
1115 LLVMValueRef half;
1116
1117 /* get sign bit */
1118 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1119 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1120
1121 /* sign * 0.5 */
1122 half = lp_build_const_vec(type, 0.5);
1123 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1124 half = LLVMBuildOr(bld->builder, sign, half, "");
1125 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1126
1127 res = LLVMBuildFAdd(bld->builder, a, half, "");
1128 }
1129
1130 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1131
1132 return res;
1133 }
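
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the non-SSE4.1 path above, assuming 32-bit floats:
 * copy the sign of 'a' onto 0.5, add, and truncate, which rounds halfway
 * cases away from zero.
 */
#if 0
static int
iround_scalar_model(float a)
{
   union { float f; unsigned u; } x, half;
   x.f = a;
   half.f = 0.5f;
   half.u |= x.u & (1u << 31);   /* half = copysign(0.5, a) */
   return (int)(a + half.f);     /* FPToSI truncates toward zero */
}
#endif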
1134
1135
1136 /**
1137 * Return floor of float (vector), result is an int (vector)
1138  * Ex: ifloor(1.1) = 1
1139  * Ex: ifloor(-1.1) = -2
1140 */
1141 LLVMValueRef
1142 lp_build_ifloor(struct lp_build_context *bld,
1143 LLVMValueRef a)
1144 {
1145 const struct lp_type type = bld->type;
1146 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1147 LLVMValueRef res;
1148
1149 assert(type.floating);
1150 assert(lp_check_value(type, a));
1151
1152 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1153 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1154 }
1155 else {
1156 /* Take the sign bit and add it to 1 constant */
1157 LLVMTypeRef vec_type = lp_build_vec_type(type);
1158 unsigned mantissa = lp_mantissa(type);
1159 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1160 LLVMValueRef sign;
1161 LLVMValueRef offset;
1162
1163 /* sign = a < 0 ? ~0 : 0 */
1164 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1165 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1166 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1167
1168 /* offset = -0.99999(9)f */
1169 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1170 offset = LLVMConstBitCast(offset, int_vec_type);
1171
1172 /* offset = a < 0 ? offset : 0.0f */
1173 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1174 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1175
1176 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1177 }
1178
1179    /* convert to int, truncating toward zero */
1180 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1181
1182 return res;
1183 }
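
/*
 * Illustrative sketch (not part of the build; the helper name and
 * constant are hypothetical approximations of the logic above): for
 * negative inputs add an offset just shy of -1.0 so that the final
 * truncation toward zero lands on floor(a).
 */
#if 0
static int
ifloor_scalar_model(float a)
{
   /* ~ -((1 << 23) - 10) / (1 << 23), the offset used above */
   float offset = a < 0.0f ? -0.9999988f : 0.0f;
   return (int)(a + offset);
}
#endif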
1184
1185
1186 /**
1187 * Return ceiling of float (vector), returning int (vector).
1188 * Ex: iceil( 1.1) = 2
1189 * Ex: iceil(-1.1) = -1
1190 */
1191 LLVMValueRef
1192 lp_build_iceil(struct lp_build_context *bld,
1193 LLVMValueRef a)
1194 {
1195 const struct lp_type type = bld->type;
1196 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1197 LLVMValueRef res;
1198
1199 assert(type.floating);
1200 assert(lp_check_value(type, a));
1201
1202 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1203 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1204 }
1205 else {
1206 LLVMTypeRef vec_type = lp_build_vec_type(type);
1207 unsigned mantissa = lp_mantissa(type);
1208 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1209 LLVMValueRef sign;
1210 LLVMValueRef offset;
1211
1212 /* sign = a < 0 ? 0 : ~0 */
1213 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1214 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1215 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1216 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1217
1218 /* offset = 0.99999(9)f */
1219 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1220 offset = LLVMConstBitCast(offset, int_vec_type);
1221
1222 /* offset = a < 0 ? 0.0 : offset */
1223 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1224 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1225
1226 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1227 }
1228
1229    /* convert to int, truncating toward zero */
1230 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1231
1232 return res;
1233 }
1234
1235
1236 LLVMValueRef
1237 lp_build_sqrt(struct lp_build_context *bld,
1238 LLVMValueRef a)
1239 {
1240 const struct lp_type type = bld->type;
1241 LLVMTypeRef vec_type = lp_build_vec_type(type);
1242 char intrinsic[32];
1243
1244 assert(lp_check_value(type, a));
1245
1246    /* TODO: optimize the constant case */
1248
1249 assert(type.floating);
1250 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1251
1252 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1253 }
1254
1255
1256 /**
1257  * Do one Newton-Raphson step to improve reciprocal precision:
1258 *
1259 * x_{i+1} = x_i * (2 - a * x_i)
1260 *
1261 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1262 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1263  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
1264 * halo. It would be necessary to clamp the argument to prevent this.
1265 *
1266 * See also:
1267 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1268 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1269 */
1270 static INLINE LLVMValueRef
1271 lp_build_rcp_refine(struct lp_build_context *bld,
1272 LLVMValueRef a,
1273 LLVMValueRef rcp_a)
1274 {
1275 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1276 LLVMValueRef res;
1277
1278 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1279 res = LLVMBuildFSub(bld->builder, two, res, "");
1280 res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
1281
1282 return res;
1283 }
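
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the refinement step above: each iteration roughly
 * doubles the number of correct bits in the reciprocal estimate.
 */
#if 0
static float
rcp_refine_scalar_model(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);
}
#endif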
1284
1285
1286 LLVMValueRef
1287 lp_build_rcp(struct lp_build_context *bld,
1288 LLVMValueRef a)
1289 {
1290 const struct lp_type type = bld->type;
1291
1292 assert(lp_check_value(type, a));
1293
1294 if(a == bld->zero)
1295 return bld->undef;
1296 if(a == bld->one)
1297 return bld->one;
1298 if(a == bld->undef)
1299 return bld->undef;
1300
1301 assert(type.floating);
1302
1303 if(LLVMIsConstant(a))
1304 return LLVMConstFDiv(bld->one, a);
1305
1306 /*
1307 * We don't use RCPPS because:
1308    * - it only has 10 bits of precision
1309    * - it doesn't even get the reciprocal of 1.0 exactly
1310    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1311    * - for recent processors the benefit over DIVPS is marginal, and case
1312    *   dependent
1313    *
1314    * We could still use it on certain processors if benchmarks show that the
1315    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
1316    * particular uses that require fewer workarounds.
1317 */
1318
1319 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1320 const unsigned num_iterations = 0;
1321 LLVMValueRef res;
1322 unsigned i;
1323
1324 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1325
1326 for (i = 0; i < num_iterations; ++i) {
1327 res = lp_build_rcp_refine(bld, a, res);
1328 }
1329
1330 return res;
1331 }
1332
1333 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1334 }
1335
1336
1337 /**
1338 * Do one Newton-Raphson step to improve rsqrt precision:
1339 *
1340 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1341 *
1342 * See also:
1343 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1344 */
1345 static INLINE LLVMValueRef
1346 lp_build_rsqrt_refine(struct lp_build_context *bld,
1347 LLVMValueRef a,
1348 LLVMValueRef rsqrt_a)
1349 {
1350 LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
1351 LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
1352 LLVMValueRef res;
1353
1354 res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
1355 res = LLVMBuildFMul(bld->builder, a, res, "");
1356 res = LLVMBuildFSub(bld->builder, three, res, "");
1357 res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
1358 res = LLVMBuildFMul(bld->builder, half, res, "");
1359
1360 return res;
1361 }
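
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the rsqrt refinement step above, in scalar form.
 */
#if 0
static float
rsqrt_refine_scalar_model(float a, float rsqrt_a)
{
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}
#endif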
1362
1363
1364 /**
1365 * Generate 1/sqrt(a)
1366 */
1367 LLVMValueRef
1368 lp_build_rsqrt(struct lp_build_context *bld,
1369 LLVMValueRef a)
1370 {
1371 const struct lp_type type = bld->type;
1372
1373 assert(lp_check_value(type, a));
1374
1375 assert(type.floating);
1376
1377 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1378 const unsigned num_iterations = 0;
1379 LLVMValueRef res;
1380 unsigned i;
1381
1382 res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1383
1384 for (i = 0; i < num_iterations; ++i) {
1385 res = lp_build_rsqrt_refine(bld, a, res);
1386 }
1387
1388 return res;
1389 }
1390
1391 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1392 }
1393
1394
1395 static INLINE LLVMValueRef
1396 lp_build_const_v4si(unsigned long value)
1397 {
1398 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1399 LLVMValueRef elements[4] = { element, element, element, element };
1400 return LLVMConstVector(elements, 4);
1401 }
1402
1403 static INLINE LLVMValueRef
1404 lp_build_const_v4sf(float value)
1405 {
1406 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1407 LLVMValueRef elements[4] = { element, element, element, element };
1408 return LLVMConstVector(elements, 4);
1409 }
1410
1411
1412 /**
1413 * Generate sin(a) using SSE2
1414 */
1415 LLVMValueRef
1416 lp_build_sin(struct lp_build_context *bld,
1417 LLVMValueRef a)
1418 {
1419 struct lp_type int_type = lp_int_type(bld->type);
1420 LLVMBuilderRef b = bld->builder;
1421 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1422 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1423
1424 /*
1425 * take the absolute value,
1426 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1427 */
1428
1429 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1430 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1431
1432 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1433 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1434
1435 /*
1436 * extract the sign bit (upper one)
1437 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1438 */
1439 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1440 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1441
1442 /*
1443 * scale by 4/Pi
1444 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1445 */
1446
1447 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1448 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1449
1450 /*
1451 * store the integer part of y in mm0
1452 * emm2 = _mm_cvttps_epi32(y);
1453 */
1454
1455 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1456
1457 /*
1458 * j=(j+1) & (~1) (see the cephes sources)
1459 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1460 */
1461
1462 LLVMValueRef all_one = lp_build_const_v4si(1);
1463 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1464 /*
1465 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1466 */
1467 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1468 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1469
1470 /*
1471 * y = _mm_cvtepi32_ps(emm2);
1472 */
1473 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1474
1475 /* get the swap sign flag
1476 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1477 */
1478 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1479 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1480
1481 /*
1482 * emm2 = _mm_slli_epi32(emm0, 29);
1483 */
1484 LLVMValueRef const_29 = lp_build_const_v4si(29);
1485 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1486
1487 /*
1488    * get the polynomial selection mask
1489    * there is one polynomial for 0 <= x <= Pi/4
1490    * and another one for Pi/4 < x <= Pi/2
1491 * Both branches will be computed.
1492 *
1493 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1494 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1495 */
1496
1497 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1498 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1499 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1500 emm2_3, lp_build_const_v4si(0));
1501 /*
1502 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1503 */
1504 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1505
1506 /*
1507 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1508 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1509 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1510 */
1511 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1512 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1513 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1514
1515 /*
1516 * The magic pass: "Extended precision modular arithmetic"
1517 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1518 * xmm1 = _mm_mul_ps(y, xmm1);
1519 * xmm2 = _mm_mul_ps(y, xmm2);
1520 * xmm3 = _mm_mul_ps(y, xmm3);
1521 */
1522 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1523 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1524 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1525
1526 /*
1527 * x = _mm_add_ps(x, xmm1);
1528 * x = _mm_add_ps(x, xmm2);
1529 * x = _mm_add_ps(x, xmm3);
1530 */
1531
1532 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1533 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1534 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1535
1536 /*
1537    * Evaluate the first polynomial (0 <= x <= Pi/4)
1538 *
1539 * z = _mm_mul_ps(x,x);
1540 */
1541 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1542
1543 /*
1544 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1545 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1546 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1547 */
1548 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1549 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1550 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1551
1552 /*
1553 * y = *(v4sf*)_ps_coscof_p0;
1554 * y = _mm_mul_ps(y, z);
1555 */
1556 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1557 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1558 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1559 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1560 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1561 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1562
1563
1564 /*
1565 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1566 * y = _mm_sub_ps(y, tmp);
1567 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1568 */
1569 LLVMValueRef half = lp_build_const_v4sf(0.5);
1570 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1571 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1572 LLVMValueRef one = lp_build_const_v4sf(1.0);
1573 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1574
1575 /*
1576 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1577 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1578 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1579 */
1580 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1581 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1582 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1583
1584 /*
1585    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1586 *
1587 * y2 = *(v4sf*)_ps_sincof_p0;
1588 * y2 = _mm_mul_ps(y2, z);
1589 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1590 * y2 = _mm_mul_ps(y2, z);
1591 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1592 * y2 = _mm_mul_ps(y2, z);
1593 * y2 = _mm_mul_ps(y2, x);
1594 * y2 = _mm_add_ps(y2, x);
1595 */
1596
1597 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1598 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1599 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1600 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1601 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1602 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1603 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1604
1605 /*
1606    * select the correct result from the two polynomials
1607 * xmm3 = poly_mask;
1608 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1609 * y = _mm_andnot_ps(xmm3, y);
1610 * y = _mm_add_ps(y,y2);
1611 */
1612 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1613 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1614 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1615 LLVMValueRef inv = lp_build_const_v4si(~0);
1616 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1617 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1618 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1619
1620 /*
1621 * update the sign
1622 * y = _mm_xor_ps(y, sign_bit);
1623 */
1624 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1625 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1626 return y_result;
1627 }
1628
1629
1630 /**
1631 * Generate cos(a) using SSE2
1632 */
1633 LLVMValueRef
1634 lp_build_cos(struct lp_build_context *bld,
1635 LLVMValueRef a)
1636 {
1637 struct lp_type int_type = lp_int_type(bld->type);
1638 LLVMBuilderRef b = bld->builder;
1639 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1640 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1641
1642 /*
1643 * take the absolute value,
1644 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1645 */
1646
1647 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1648 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1649
1650 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1651 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1652
1653 /*
1654 * scale by 4/Pi
1655 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1656 */
1657
1658 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1659 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1660
1661 /*
1662 * store the integer part of y in mm0
1663 * emm2 = _mm_cvttps_epi32(y);
1664 */
1665
1666 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1667
1668 /*
1669 * j=(j+1) & (~1) (see the cephes sources)
1670 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1671 */
1672
1673 LLVMValueRef all_one = lp_build_const_v4si(1);
1674 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1675 /*
1676 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1677 */
1678 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1679 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1680
1681 /*
1682 * y = _mm_cvtepi32_ps(emm2);
1683 */
1684 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1685
1686
1687 /*
1688 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1689 */
1690 LLVMValueRef const_2 = lp_build_const_v4si(2);
1691 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1692
1693
1694 /* get the swap sign flag
1695 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1696 */
1697 LLVMValueRef inv = lp_build_const_v4si(~0);
1698 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1699 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1700 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1701
1702 /*
1703 * emm2 = _mm_slli_epi32(emm0, 29);
1704 */
1705 LLVMValueRef const_29 = lp_build_const_v4si(29);
1706 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1707
1708 /*
1709    * get the polynomial selection mask
1710    * there is one polynomial for 0 <= x <= Pi/4
1711    * and another one for Pi/4 < x <= Pi/2
1712 * Both branches will be computed.
1713 *
1714 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1715 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1716 */
1717
1718 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1719 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1720 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1721 emm2_3, lp_build_const_v4si(0));
1722
1723 /*
1724 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1725 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1726 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1727 */
1728 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1729 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1730 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1731
1732 /*
1733 * The magic pass: "Extended precision modular arithmetic"
1734 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1735 * xmm1 = _mm_mul_ps(y, xmm1);
1736 * xmm2 = _mm_mul_ps(y, xmm2);
1737 * xmm3 = _mm_mul_ps(y, xmm3);
1738 */
1739 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1740 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1741 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1742
1743 /*
1744 * x = _mm_add_ps(x, xmm1);
1745 * x = _mm_add_ps(x, xmm2);
1746 * x = _mm_add_ps(x, xmm3);
1747 */
1748
1749 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1750 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1751 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1752
1753 /*
1754    * Evaluate the first polynomial (0 <= x <= Pi/4)
1755 *
1756 * z = _mm_mul_ps(x,x);
1757 */
1758 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1759
1760 /*
1761 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1762 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1763 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1764 */
1765 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1766 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1767 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1768
1769 /*
1770 * y = *(v4sf*)_ps_coscof_p0;
1771 * y = _mm_mul_ps(y, z);
1772 */
1773 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1774 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1775 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1776 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1777 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1778 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1779
1780
1781 /*
1782 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1783 * y = _mm_sub_ps(y, tmp);
1784 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1785 */
1786 LLVMValueRef half = lp_build_const_v4sf(0.5);
1787 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1788 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1789 LLVMValueRef one = lp_build_const_v4sf(1.0);
1790 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1791
1792 /*
1793 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1794 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1795 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1796 */
1797 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1798 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1799 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1800
1801 /*
1802    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
1803 *
1804 * y2 = *(v4sf*)_ps_sincof_p0;
1805 * y2 = _mm_mul_ps(y2, z);
1806 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1807 * y2 = _mm_mul_ps(y2, z);
1808 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1809 * y2 = _mm_mul_ps(y2, z);
1810 * y2 = _mm_mul_ps(y2, x);
1811 * y2 = _mm_add_ps(y2, x);
1812 */
1813
1814 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1815 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1816 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1817 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1818 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1819 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1820 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1821
1822 /*
1823    * select the correct result from the two polynomials
1824 * xmm3 = poly_mask;
1825 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1826 * y = _mm_andnot_ps(xmm3, y);
1827 * y = _mm_add_ps(y,y2);
1828 */
1829 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1830 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1831 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1832 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1833 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1834 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1835
1836 /*
1837 * update the sign
1838 * y = _mm_xor_ps(y, sign_bit);
1839 */
1840 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1841 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1842 return y_result;
1843 }
1844
1845
1846 /**
1847 * Generate pow(x, y)
1848 */
1849 LLVMValueRef
1850 lp_build_pow(struct lp_build_context *bld,
1851 LLVMValueRef x,
1852 LLVMValueRef y)
1853 {
1854 /* TODO: optimize the constant case */
1855 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1856 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1857 __FUNCTION__);
1858
1859 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1860 }
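
/*
 * Illustrative sketch (not part of the build; the helper name is
 * hypothetical) of the identity used above: pow(x, y) = exp2(log2(x) * y),
 * valid for x > 0. Uses C99 <math.h> exp2/log2 purely for illustration.
 */
#if 0
static double
pow_scalar_model(double x, double y)
{
   return exp2(log2(x) * y);
}
#endif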
1861
1862
1863 /**
1864 * Generate exp(x)
1865 */
1866 LLVMValueRef
1867 lp_build_exp(struct lp_build_context *bld,
1868 LLVMValueRef x)
1869 {
1870 /* log2(e) = 1/log(2) */
1871 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1872
1873 assert(lp_check_value(bld->type, x));
1874
1875    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1876 }
1877
1878
1879 /**
1880 * Generate log(x)
1881 */
1882 LLVMValueRef
1883 lp_build_log(struct lp_build_context *bld,
1884 LLVMValueRef x)
1885 {
1886 /* log(2) */
1887 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1888
1889 assert(lp_check_value(bld->type, x));
1890
1891    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1892 }
1893
1894
1895 /**
1896 * Generate polynomial.
1897 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1898 */
1899 static LLVMValueRef
1900 lp_build_polynomial(struct lp_build_context *bld,
1901 LLVMValueRef x,
1902 const double *coeffs,
1903 unsigned num_coeffs)
1904 {
1905 const struct lp_type type = bld->type;
1906 LLVMValueRef res = NULL;
1907 unsigned i;
1908
1909 assert(lp_check_value(bld->type, x));
1910
1911 /* TODO: optimize the constant case */
1912 if(LLVMIsConstant(x))
1913 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1914 __FUNCTION__);
1915
1916 for (i = num_coeffs; i--; ) {
1917 LLVMValueRef coeff;
1918
1919 coeff = lp_build_const_vec(type, coeffs[i]);
1920
1921 if(res)
1922 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1923 else
1924 res = coeff;
1925 }
1926
1927 if(res)
1928 return res;
1929 else
1930 return bld->undef;
1931 }
1932
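/*
 * Scalar reference of the Horner scheme above (a sketch, our naming):
 * walking the coefficients from highest to lowest folds
 * c0 + c1*x + c2*x^2 + ... into nested multiply-adds, one mul and one add
 * per coefficient. (The builder returns undef for an empty table; this
 * sketch just returns 0.)
 */
static inline double
ref_polynomial_scalar(double x, const double *coeffs, unsigned num_coeffs)
{
   double res = 0.0;
   unsigned i;
   for (i = num_coeffs; i--; )
      res = coeffs[i] + x * res;
   return res;
}
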
1933
1934 /**
1935 * Minimax polynomial fit of 2**x, in range [0, 1[
1936 */
1937 const double lp_build_exp2_polynomial[] = {
1938 #if EXP_POLY_DEGREE == 5
1939 0.999999999690134838155,
1940 0.583974334321735217258,
1941 0.164553105719676828492,
1942 0.0292811063701710962255,
1943 0.00354944426657875141846,
1944 0.000296253726543423377365
1945 #elif EXP_POLY_DEGREE == 4
1946 1.00000001502262084505,
1947 0.563586057338685991394,
1948 0.150436017652442413623,
1949 0.0243220604213317927308,
1950 0.0025359088446580436489
1951 #elif EXP_POLY_DEGREE == 3
1952 0.999925218562710312959,
1953 0.695833540494823811697,
1954 0.226067155427249155588,
1955 0.0780245226406372992967
1956 #elif EXP_POLY_DEGREE == 2
1957 1.00172476321474503578,
1958 0.657636275736077639316,
1959 0.33718943461968720704
1960 #else
1961 #error
1962 #endif
1963 };
1964
1965
1966 void
1967 lp_build_exp2_approx(struct lp_build_context *bld,
1968 LLVMValueRef x,
1969 LLVMValueRef *p_exp2_int_part,
1970 LLVMValueRef *p_frac_part,
1971 LLVMValueRef *p_exp2)
1972 {
1973 const struct lp_type type = bld->type;
1974 LLVMTypeRef vec_type = lp_build_vec_type(type);
1975 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1976 LLVMValueRef ipart = NULL;
1977 LLVMValueRef fpart = NULL;
1978 LLVMValueRef expipart = NULL;
1979 LLVMValueRef expfpart = NULL;
1980 LLVMValueRef res = NULL;
1981
1982 assert(lp_check_value(bld->type, x));
1983
1984 if(p_exp2_int_part || p_frac_part || p_exp2) {
1985 /* TODO: optimize the constant case */
1986 if(LLVMIsConstant(x))
1987 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1988 __FUNCTION__);
1989
1990 assert(type.floating && type.width == 32);
1991
1992 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1993 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1994
1995 /* ipart = floor(x) */
1996 ipart = lp_build_floor(bld, x);
1997
1998 /* fpart = x - ipart */
1999 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
2000 }
2001
2002 if(p_exp2_int_part || p_exp2) {
2003 /* expipart = (float) (1 << ipart) */
2004 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
2005 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
2006 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
2007 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
2008 }
2009
2010 if(p_exp2) {
2011 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2012 Elements(lp_build_exp2_polynomial));
2013
2014 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
2015 }
2016
2017 if(p_exp2_int_part)
2018 *p_exp2_int_part = expipart;
2019
2020 if(p_frac_part)
2021 *p_frac_part = fpart;
2022
2023 if(p_exp2)
2024 *p_exp2 = res;
2025 }
2026
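/*
 * Scalar reference of the approximation above (a sketch, our naming;
 * assumes <stdint.h> and <math.h>): split x into integer and fractional
 * parts, build 2^ipart directly in the IEEE-754 exponent field, and
 * evaluate the minimax polynomial on the fraction. The coefficients are
 * the active EXP_POLY_DEGREE == 3 branch of the table above.
 */
static inline float
ref_exp2f_scalar(float x)
{
   union { float f; int32_t i; } expipart;
   float ipart, fpart, expfpart;

   /* same clamping as the vector path */
   if (x > 129.0f)
      x = 129.0f;
   if (x < -126.99999f)
      x = -126.99999f;

   ipart = floorf(x);
   fpart = x - ipart;

   /* 2^ipart: bias the exponent and shift it into bits 30..23 */
   expipart.i = ((int32_t)ipart + 127) << 23;

   /* minimax fit of 2^fpart on [0, 1[ */
   expfpart = 0.999925218562710312959f
            + fpart * (0.695833540494823811697f
            + fpart * (0.226067155427249155588f
            + fpart * 0.0780245226406372992967f));

   return expipart.f * expfpart;
}
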
2027
2028 LLVMValueRef
2029 lp_build_exp2(struct lp_build_context *bld,
2030 LLVMValueRef x)
2031 {
2032 LLVMValueRef res;
2033 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2034 return res;
2035 }
2036
2037
2038 /**
2039 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2040 * These coefficients can be generated with
2041 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2042 */
2043 const double lp_build_log2_polynomial[] = {
2044 #if LOG_POLY_DEGREE == 6
2045 3.11578814719469302614,
2046 -3.32419399085241980044,
2047 2.59883907202499966007,
2048 -1.23152682416275988241,
2049 0.318212422185251071475,
2050 -0.0344359067839062357313
2051 #elif LOG_POLY_DEGREE == 5
2052 2.8882704548164776201,
2053 -2.52074962577807006663,
2054 1.48116647521213171641,
2055 -0.465725644288844778798,
2056 0.0596515482674574969533
2057 #elif LOG_POLY_DEGREE == 4
2058 2.61761038894603480148,
2059 -1.75647175389045657003,
2060 0.688243882994381274313,
2061 -0.107254423828329604454
2062 #elif LOG_POLY_DEGREE == 3
2063 2.28330284476918490682,
2064 -1.04913055217340124191,
2065 0.204446009836232697516
2066 #else
2067 #error
2068 #endif
2069 };
2070
2071
2072 /**
2073 * See http://www.devmaster.net/forums/showthread.php?p=43580
2074 */
2075 void
2076 lp_build_log2_approx(struct lp_build_context *bld,
2077 LLVMValueRef x,
2078 LLVMValueRef *p_exp,
2079 LLVMValueRef *p_floor_log2,
2080 LLVMValueRef *p_log2)
2081 {
2082 const struct lp_type type = bld->type;
2083 LLVMTypeRef vec_type = lp_build_vec_type(type);
2084 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2085
2086 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2087 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2088 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2089
2090 LLVMValueRef i = NULL;
2091 LLVMValueRef exp = NULL;
2092 LLVMValueRef mant = NULL;
2093 LLVMValueRef logexp = NULL;
2094 LLVMValueRef logmant = NULL;
2095 LLVMValueRef res = NULL;
2096
2097 assert(lp_check_value(bld->type, x));
2098
2099 if(p_exp || p_floor_log2 || p_log2) {
2100 /* TODO: optimize the constant case */
2101 if(LLVMIsConstant(x))
2102 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2103 __FUNCTION__);
2104
2105 assert(type.floating && type.width == 32);
2106
2107 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2108
2109 /* exp = (float) exponent(x) */
2110 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2111 }
2112
2113 if(p_floor_log2 || p_log2) {
2114 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2115 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2116 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2117 }
2118
2119 if(p_log2) {
2120 /* mant = (float) mantissa(x) */
2121 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2122 mant = LLVMBuildOr(bld->builder, mant, one, "");
2123 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2124
2125 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2126 Elements(lp_build_log2_polynomial));
2127
2128 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2129 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2130
2131 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2132 }
2133
2134 if(p_exp) {
2135 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2136 *p_exp = exp;
2137 }
2138
2139 if(p_floor_log2)
2140 *p_floor_log2 = logexp;
2141
2142 if(p_log2)
2143 *p_log2 = res;
2144 }
2145
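/*
 * Scalar reference of the decomposition above (a sketch, our naming;
 * assumes <stdint.h> and x > 0): the unbiased IEEE-754 exponent is
 * floor(log2(x)), and the mantissa, renormalized to [1, 2[, feeds the
 * minimax fit of log2(m)/(m - 1); the final (mant - 1) factor is what
 * makes log2(1) == 0 exact. For brevity this uses the
 * LOG_POLY_DEGREE == 3 branch of the table above.
 */
static inline float
ref_log2f_scalar(float x)
{
   union { float f; int32_t i; } u, mant;
   float logexp, logmant;

   u.f = x;

   /* exponent bits -> floor(log2(x)) */
   logexp = (float)(((u.i & 0x7f800000) >> 23) - 127);

   /* mantissa bits with an exponent of 0, i.e. a float in [1, 2[ */
   mant.i = (u.i & 0x007fffff) | 0x3f800000;

   logmant = 2.28330284476918490682f
           + mant.f * (-1.04913055217340124191f
           + mant.f * 0.204446009836232697516f);
   logmant *= mant.f - 1.0f;

   return logmant + logexp;
}
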
2146
2147 LLVMValueRef
2148 lp_build_log2(struct lp_build_context *bld,
2149 LLVMValueRef x)
2150 {
2151 LLVMValueRef res;
2152 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2153 return res;
2154 }