gallivm: Fix and enable the extra Newton/Raphson step in lp_build_rcp().
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_intr.h"
57 #include "lp_bld_logic.h"
58 #include "lp_bld_pack.h"
59 #include "lp_bld_arit.h"
60
61
62 /**
63 * Generate min(a, b)
64 * No checks for the special case values of a or b (0 or 1) are done here.
65 */
66 static LLVMValueRef
67 lp_build_min_simple(struct lp_build_context *bld,
68 LLVMValueRef a,
69 LLVMValueRef b)
70 {
71 const struct lp_type type = bld->type;
72 const char *intrinsic = NULL;
73 LLVMValueRef cond;
74
75 assert(lp_check_value(type, a));
76 assert(lp_check_value(type, b));
77
78 /* TODO: optimize the constant case */
79
80 if(type.width * type.length == 128) {
81 if(type.floating) {
82 if(type.width == 32 && util_cpu_caps.has_sse)
83 intrinsic = "llvm.x86.sse.min.ps";
84 if(type.width == 64 && util_cpu_caps.has_sse2)
85 intrinsic = "llvm.x86.sse2.min.pd";
86 }
87 else {
88 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
89 intrinsic = "llvm.x86.sse2.pminu.b";
90 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
91 intrinsic = "llvm.x86.sse41.pminsb";
92 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
93 intrinsic = "llvm.x86.sse41.pminuw";
94 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
95 intrinsic = "llvm.x86.sse2.pmins.w";
96 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
97 intrinsic = "llvm.x86.sse41.pminud";
98 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
99 intrinsic = "llvm.x86.sse41.pminsd";
100 }
101 }
102
103 if(intrinsic)
104 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
105
106 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
107 return lp_build_select(bld, cond, a, b);
108 }
109
110
111 /**
112 * Generate max(a, b)
113 * No checks for the special case values of a or b (0 or 1) are done here.
114 */
115 static LLVMValueRef
116 lp_build_max_simple(struct lp_build_context *bld,
117 LLVMValueRef a,
118 LLVMValueRef b)
119 {
120 const struct lp_type type = bld->type;
121 const char *intrinsic = NULL;
122 LLVMValueRef cond;
123
124 assert(lp_check_value(type, a));
125 assert(lp_check_value(type, b));
126
127 /* TODO: optimize the constant case */
128
129 if(type.width * type.length == 128) {
130 if(type.floating) {
131 if(type.width == 32 && util_cpu_caps.has_sse)
132 intrinsic = "llvm.x86.sse.max.ps";
133 if(type.width == 64 && util_cpu_caps.has_sse2)
134 intrinsic = "llvm.x86.sse2.max.pd";
135 }
136 else {
137 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
138 intrinsic = "llvm.x86.sse2.pmaxu.b";
139 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
140 intrinsic = "llvm.x86.sse41.pmaxsb";
141 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
142 intrinsic = "llvm.x86.sse41.pmaxuw";
143 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
144 intrinsic = "llvm.x86.sse2.pmaxs.w";
145 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
146 intrinsic = "llvm.x86.sse41.pmaxud";
147 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
148 intrinsic = "llvm.x86.sse41.pmaxsd";
149 }
150 }
151
152 if(intrinsic)
153 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
154
155 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
156 return lp_build_select(bld, cond, a, b);
157 }
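/*
 * For reference, the generic compare/select fallback at the end of the two
 * helpers above computes, per element:
 *
 *    min(a, b) = a < b ? a : b
 *    max(a, b) = a > b ? a : b
 *
 * With PIPE_FUNC_LESS/PIPE_FUNC_GREATER the comparison is false on NaN, so
 * the second operand is returned then, which likewise matches the SSE
 * minps/maxps behavior of returning the second source on unordered inputs.
 */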
158
159
160 /**
161 * Generate 1 - a, or ~a depending on bld->type.
162 */
163 LLVMValueRef
164 lp_build_comp(struct lp_build_context *bld,
165 LLVMValueRef a)
166 {
167 const struct lp_type type = bld->type;
168
169 assert(lp_check_value(type, a));
170
171 if(a == bld->one)
172 return bld->zero;
173 if(a == bld->zero)
174 return bld->one;
175
176 if(type.norm && !type.floating && !type.fixed && !type.sign) {
177 if(LLVMIsConstant(a))
178 return LLVMConstNot(a);
179 else
180 return LLVMBuildNot(bld->builder, a, "");
181 }
182
183 if(LLVMIsConstant(a))
184 if (type.floating)
185 return LLVMConstFSub(bld->one, a);
186 else
187 return LLVMConstSub(bld->one, a);
188 else
189 if (type.floating)
190 return LLVMBuildFSub(bld->builder, bld->one, a, "");
191 else
192 return LLVMBuildSub(bld->builder, bld->one, a, "");
193 }
194
195
196 /**
197 * Generate a + b
198 */
199 LLVMValueRef
200 lp_build_add(struct lp_build_context *bld,
201 LLVMValueRef a,
202 LLVMValueRef b)
203 {
204 const struct lp_type type = bld->type;
205 LLVMValueRef res;
206
207 assert(lp_check_value(type, a));
208 assert(lp_check_value(type, b));
209
210 if(a == bld->zero)
211 return b;
212 if(b == bld->zero)
213 return a;
214 if(a == bld->undef || b == bld->undef)
215 return bld->undef;
216
217 if(bld->type.norm) {
218 const char *intrinsic = NULL;
219
220 if(a == bld->one || b == bld->one)
221 return bld->one;
222
223 if(util_cpu_caps.has_sse2 &&
224 type.width * type.length == 128 &&
225 !type.floating && !type.fixed) {
226 if(type.width == 8)
227 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
228 if(type.width == 16)
229 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
230 }
231
232 if(intrinsic)
233 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
234 }
235
236 if(LLVMIsConstant(a) && LLVMIsConstant(b))
237 if (type.floating)
238 res = LLVMConstFAdd(a, b);
239 else
240 res = LLVMConstAdd(a, b);
241 else
242 if (type.floating)
243 res = LLVMBuildFAdd(bld->builder, a, b, "");
244 else
245 res = LLVMBuildAdd(bld->builder, a, b, "");
246
247 /* clamp to ceiling of 1.0 */
248 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
249 res = lp_build_min_simple(bld, res, bld->one);
250
251 /* XXX clamp to floor of -1 or 0??? */
252
253 return res;
254 }
255
256
257 /** Return the sum of the elements of a */
258 LLVMValueRef
259 lp_build_sum_vector(struct lp_build_context *bld,
260 LLVMValueRef a)
261 {
262 const struct lp_type type = bld->type;
263 LLVMValueRef index, res;
264 unsigned i;
265
266 assert(lp_check_value(type, a));
267
268 if (a == bld->zero)
269 return bld->zero;
270 if (a == bld->undef)
271 return bld->undef;
272 assert(type.length > 1);
273
274 assert(!bld->type.norm);
275
276 index = LLVMConstInt(LLVMInt32Type(), 0, 0);
277 res = LLVMBuildExtractElement(bld->builder, a, index, "");
278
279 for (i = 1; i < type.length; i++) {
280 index = LLVMConstInt(LLVMInt32Type(), i, 0);
281 if (type.floating)
282 res = LLVMBuildFAdd(bld->builder, res,
283 LLVMBuildExtractElement(bld->builder,
284 a, index, ""),
285 "");
286 else
287 res = LLVMBuildAdd(bld->builder, res,
288 LLVMBuildExtractElement(bld->builder,
289 a, index, ""),
290 "");
291 }
292
293 return res;
294 }
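/*
 * A scalar sketch of the reduction built above:
 *
 *    res = a[0];
 *    for (i = 1; i < type.length; i++)
 *       res += a[i];
 *
 * The extract/add chain is linear in the vector length; for the short
 * vectors used here that is fine, though a log2(length) shuffle/add tree
 * would also work.
 */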
295
296
297 /**
298 * Generate a - b
299 */
300 LLVMValueRef
301 lp_build_sub(struct lp_build_context *bld,
302 LLVMValueRef a,
303 LLVMValueRef b)
304 {
305 const struct lp_type type = bld->type;
306 LLVMValueRef res;
307
308 assert(lp_check_value(type, a));
309 assert(lp_check_value(type, b));
310
311 if(b == bld->zero)
312 return a;
313 if(a == bld->undef || b == bld->undef)
314 return bld->undef;
315 if(a == b)
316 return bld->zero;
317
318 if(bld->type.norm) {
319 const char *intrinsic = NULL;
320
321 if(b == bld->one)
322 return bld->zero;
323
324 if(util_cpu_caps.has_sse2 &&
325 type.width * type.length == 128 &&
326 !type.floating && !type.fixed) {
327 if(type.width == 8)
328 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
329 if(type.width == 16)
330 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
331 }
332
333 if(intrinsic)
334 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
335 }
336
337 if(LLVMIsConstant(a) && LLVMIsConstant(b))
338 if (type.floating)
339 res = LLVMConstFSub(a, b);
340 else
341 res = LLVMConstSub(a, b);
342 else
343 if (type.floating)
344 res = LLVMBuildFSub(bld->builder, a, b, "");
345 else
346 res = LLVMBuildSub(bld->builder, a, b, "");
347
348 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
349 res = lp_build_max_simple(bld, res, bld->zero);
350
351 return res;
352 }
353
354
355 /**
356 * Normalized 8bit multiplication.
357 *
358 * - alpha plus one
359 *
360 * makes the following approximation to the division (Sree)
361 *
362 * a*b/255 ~= (a*(b + 1)) >> 8
363 *
364 * which is the fastest method that satisfies the following OpenGL criteria
365 *
366 * 0*0 = 0 and 255*255 = 255
367 *
368 * - geometric series
369 *
370 * takes the geometric series approximation to the division
371 *
372 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
373 *
374 * in this case just the first two terms to fit in 16bit arithmetic
375 *
376 * t/255 ~= (t + (t >> 8)) >> 8
377 *
378 * note that just by itself it doesn't satisfy the OpenGL criteria, as
379 * 255*255 yields 254, so either the special case b = 255 must be accounted
380 * for, or roundoff must be used
381 *
382 * - geometric series plus rounding
383 *
384 * when using a geometric series division instead of truncating the result
385 * use roundoff in the approximation (Jim Blinn)
386 *
387 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
388 *
389 * achieving the exact results
390 *
391 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
392 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
393 * @sa Michael Herf, The "double blend trick", May 2000,
394 * http://www.stereopsis.com/doubleblend.html
395 */
396 static LLVMValueRef
397 lp_build_mul_u8n(LLVMBuilderRef builder,
398 struct lp_type i16_type,
399 LLVMValueRef a, LLVMValueRef b)
400 {
401 LLVMValueRef c8;
402 LLVMValueRef ab;
403
404 assert(!i16_type.floating);
405 assert(lp_check_value(i16_type, a));
406 assert(lp_check_value(i16_type, b));
407
408 c8 = lp_build_const_int_vec(i16_type, 8);
409
410 #if 0
411
412 /* a*b/255 ~= (a*(b + 1)) >> 8 */
413 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(i16_type, 1), "");
414 ab = LLVMBuildMul(builder, a, b, "");
415
416 #else
417
418 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
419 ab = LLVMBuildMul(builder, a, b, "");
420 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
421 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(i16_type, 0x80), "");
422
423 #endif
424
425 ab = LLVMBuildLShr(builder, ab, c8, "");
426
427 return ab;
428 }
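#if 0
/* Scalar reference of the rounding variant emitted above, assuming
 * <stdint.h> integer types (illustrative sketch only): */
static uint8_t
mul_u8n_ref(uint8_t a, uint8_t b)
{
   uint32_t t = (uint32_t)a * (uint32_t)b;       /* full 16 bit product */
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8); /* t/255 with roundoff */
}
/* e.g. mul_u8n_ref(255, 255) == 255 and mul_u8n_ref(0, 0) == 0 */
#endif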
429
430
431 /**
432 * Generate a * b
433 */
434 LLVMValueRef
435 lp_build_mul(struct lp_build_context *bld,
436 LLVMValueRef a,
437 LLVMValueRef b)
438 {
439 const struct lp_type type = bld->type;
440 LLVMValueRef shift;
441 LLVMValueRef res;
442
443 assert(lp_check_value(type, a));
444 assert(lp_check_value(type, b));
445
446 if(a == bld->zero)
447 return bld->zero;
448 if(a == bld->one)
449 return b;
450 if(b == bld->zero)
451 return bld->zero;
452 if(b == bld->one)
453 return a;
454 if(a == bld->undef || b == bld->undef)
455 return bld->undef;
456
457 if(!type.floating && !type.fixed && type.norm) {
458 if(type.width == 8) {
459 struct lp_type i16_type = lp_wider_type(type);
460 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
461
462 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
463 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
464
465 /* PMULLW, PSRLW, PADDW */
466 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
467 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
468
469 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
470
471 return ab;
472 }
473
474 /* FIXME */
475 assert(0);
476 }
477
478 if(type.fixed)
479 shift = lp_build_const_int_vec(type, type.width/2);
480 else
481 shift = NULL;
482
483 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
484 if (type.floating)
485 res = LLVMConstFMul(a, b);
486 else
487 res = LLVMConstMul(a, b);
488 if(shift) {
489 if(type.sign)
490 res = LLVMConstAShr(res, shift);
491 else
492 res = LLVMConstLShr(res, shift);
493 }
494 }
495 else {
496 if (type.floating)
497 res = LLVMBuildFMul(bld->builder, a, b, "");
498 else
499 res = LLVMBuildMul(bld->builder, a, b, "");
500 if(shift) {
501 if(type.sign)
502 res = LLVMBuildAShr(bld->builder, res, shift, "");
503 else
504 res = LLVMBuildLShr(bld->builder, res, shift, "");
505 }
506 }
507
508 return res;
509 }
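/*
 * For fixed point types the product carries twice the number of fractional
 * bits, hence the shift back by width/2 above. E.g. in 8.8 fixed point, and
 * ignoring overflow of the intermediate product:
 *
 *    1.5 * 2.0 -> (0x0180 * 0x0200) >> 8 = 0x30000 >> 8 = 0x0300 = 3.0
 *
 * The shift is arithmetic for signed types so the sign is preserved.
 */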
510
511
512 /**
513 * Small vector x scale multiplication optimization.
514 */
515 LLVMValueRef
516 lp_build_mul_imm(struct lp_build_context *bld,
517 LLVMValueRef a,
518 int b)
519 {
520 LLVMValueRef factor;
521
522 assert(lp_check_value(bld->type, a));
523
524 if(b == 0)
525 return bld->zero;
526
527 if(b == 1)
528 return a;
529
530 if(b == -1)
531 return lp_build_negate(bld, a);
532
533 if(b == 2 && bld->type.floating)
534 return lp_build_add(bld, a, a);
535
536 if(util_is_pot(b)) {
537 unsigned shift = ffs(b) - 1;
538
539 if(bld->type.floating) {
540 #if 0
541 /*
542 * Power of two multiplication by directly manipulating the mantissa.
543 *
544 * XXX: This might not always be faster; it will introduce a small error
545 * for multiplication by zero, and it will produce wrong results
546 * for Inf and NaN.
547 */
548 unsigned mantissa = lp_mantissa(bld->type);
549 factor = lp_build_const_int_vec(bld->type, (unsigned long long)shift << mantissa);
550 a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
551 a = LLVMBuildAdd(bld->builder, a, factor, "");
552 a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
553 return a;
554 #endif
555 }
556 else {
557 factor = lp_build_const_vec(bld->type, shift);
558 return LLVMBuildShl(bld->builder, a, factor, "");
559 }
560 }
561
562 factor = lp_build_const_vec(bld->type, (double)b);
563 return lp_build_mul(bld, a, factor);
564 }
565
566
567 /**
568 * Generate a / b
569 */
570 LLVMValueRef
571 lp_build_div(struct lp_build_context *bld,
572 LLVMValueRef a,
573 LLVMValueRef b)
574 {
575 const struct lp_type type = bld->type;
576
577 assert(lp_check_value(type, a));
578 assert(lp_check_value(type, b));
579
580 if(a == bld->zero)
581 return bld->zero;
582 if(a == bld->one)
583 return lp_build_rcp(bld, b);
584 if(b == bld->zero)
585 return bld->undef;
586 if(b == bld->one)
587 return a;
588 if(a == bld->undef || b == bld->undef)
589 return bld->undef;
590
591 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
592 if (type.floating)
593 return LLVMConstFDiv(a, b);
594 else if (type.sign)
595 return LLVMConstSDiv(a, b);
596 else
597 return LLVMConstUDiv(a, b);
598 }
599
600 if(type.floating && util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
601 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
602
603 if (type.floating)
604 return LLVMBuildFDiv(bld->builder, a, b, "");
605 else if (type.sign)
606 return LLVMBuildSDiv(bld->builder, a, b, "");
607 else
608 return LLVMBuildUDiv(bld->builder, a, b, "");
609 }
610
611
612 /**
613 * Linear interpolation.
614 *
615 * This also works for integer values with a few caveats.
616 *
617 * @sa http://www.stereopsis.com/doubleblend.html
618 */
619 LLVMValueRef
620 lp_build_lerp(struct lp_build_context *bld,
621 LLVMValueRef x,
622 LLVMValueRef v0,
623 LLVMValueRef v1)
624 {
625 LLVMValueRef delta;
626 LLVMValueRef res;
627
628 assert(lp_check_value(bld->type, x));
629 assert(lp_check_value(bld->type, v0));
630 assert(lp_check_value(bld->type, v1));
631
632 delta = lp_build_sub(bld, v1, v0);
633
634 res = lp_build_mul(bld, x, delta);
635
636 res = lp_build_add(bld, v0, res);
637
638 if(bld->type.fixed)
639 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
640 * but it will be wrong for other uses. Basically we need a more
641 * powerful lp_type, capable of further distinguishing the values
642 * interpretation from the value storage. */
643 res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
644
645 return res;
646 }
647
648
649 LLVMValueRef
650 lp_build_lerp_2d(struct lp_build_context *bld,
651 LLVMValueRef x,
652 LLVMValueRef y,
653 LLVMValueRef v00,
654 LLVMValueRef v01,
655 LLVMValueRef v10,
656 LLVMValueRef v11)
657 {
658 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
659 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
660 return lp_build_lerp(bld, y, v0, v1);
661 }
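/*
 * Expanded, this computes the usual bilinear blend
 *
 *    v = (1 - y)*((1 - x)*v00 + x*v01) + y*((1 - x)*v10 + x*v11)
 *
 * as two lerps along x followed by one lerp along y.
 */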
662
663
664 /**
665 * Generate min(a, b)
666 * Do checks for special cases.
667 */
668 LLVMValueRef
669 lp_build_min(struct lp_build_context *bld,
670 LLVMValueRef a,
671 LLVMValueRef b)
672 {
673 assert(lp_check_value(bld->type, a));
674 assert(lp_check_value(bld->type, b));
675
676 if(a == bld->undef || b == bld->undef)
677 return bld->undef;
678
679 if(a == b)
680 return a;
681
682 if(bld->type.norm) {
683 if(a == bld->zero || b == bld->zero)
684 return bld->zero;
685 if(a == bld->one)
686 return b;
687 if(b == bld->one)
688 return a;
689 }
690
691 return lp_build_min_simple(bld, a, b);
692 }
693
694
695 /**
696 * Generate max(a, b)
697 * Do checks for special cases.
698 */
699 LLVMValueRef
700 lp_build_max(struct lp_build_context *bld,
701 LLVMValueRef a,
702 LLVMValueRef b)
703 {
704 assert(lp_check_value(bld->type, a));
705 assert(lp_check_value(bld->type, b));
706
707 if(a == bld->undef || b == bld->undef)
708 return bld->undef;
709
710 if(a == b)
711 return a;
712
713 if(bld->type.norm) {
714 if(a == bld->one || b == bld->one)
715 return bld->one;
716 if(a == bld->zero)
717 return b;
718 if(b == bld->zero)
719 return a;
720 }
721
722 return lp_build_max_simple(bld, a, b);
723 }
724
725
726 /**
727 * Generate clamp(a, min, max)
728 * Do checks for special cases.
729 */
730 LLVMValueRef
731 lp_build_clamp(struct lp_build_context *bld,
732 LLVMValueRef a,
733 LLVMValueRef min,
734 LLVMValueRef max)
735 {
736 assert(lp_check_value(bld->type, a));
737 assert(lp_check_value(bld->type, min));
738 assert(lp_check_value(bld->type, max));
739
740 a = lp_build_min(bld, a, max);
741 a = lp_build_max(bld, a, min);
742 return a;
743 }
744
745
746 /**
747 * Generate abs(a)
748 */
749 LLVMValueRef
750 lp_build_abs(struct lp_build_context *bld,
751 LLVMValueRef a)
752 {
753 const struct lp_type type = bld->type;
754 LLVMTypeRef vec_type = lp_build_vec_type(type);
755
756 assert(lp_check_value(type, a));
757
758 if(!type.sign)
759 return a;
760
761 if(type.floating) {
762 /* Mask out the sign bit */
763 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
764 unsigned long long absMask = ~(1ULL << (type.width - 1));
765 LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
766 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
767 a = LLVMBuildAnd(bld->builder, a, mask, "");
768 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
769 return a;
770 }
771
772 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
773 switch(type.width) {
774 case 8:
775 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
776 case 16:
777 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
778 case 32:
779 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
780 }
781 }
782
783 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
784 }
785
786
787 LLVMValueRef
788 lp_build_negate(struct lp_build_context *bld,
789 LLVMValueRef a)
790 {
791 assert(lp_check_value(bld->type, a));
792
793 #if HAVE_LLVM >= 0x0207
794 if (bld->type.floating)
795 a = LLVMBuildFNeg(bld->builder, a, "");
796 else
797 #endif
798 a = LLVMBuildNeg(bld->builder, a, "");
799
800 return a;
801 }
802
803
804 /** Return -1, 0 or +1 depending on the sign of a */
805 LLVMValueRef
806 lp_build_sgn(struct lp_build_context *bld,
807 LLVMValueRef a)
808 {
809 const struct lp_type type = bld->type;
810 LLVMValueRef cond;
811 LLVMValueRef res;
812
813 assert(lp_check_value(type, a));
814
815 /* Handle non-zero case */
816 if(!type.sign) {
817 /* if not zero then sign must be positive */
818 res = bld->one;
819 }
820 else if(type.floating) {
821 LLVMTypeRef vec_type;
822 LLVMTypeRef int_type;
823 LLVMValueRef mask;
824 LLVMValueRef sign;
825 LLVMValueRef one;
826 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
827
828 int_type = lp_build_int_vec_type(type);
829 vec_type = lp_build_vec_type(type);
830 mask = lp_build_const_int_vec(type, maskBit);
831
832 /* Take the sign bit and add it to 1 constant */
833 sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
834 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
835 one = LLVMConstBitCast(bld->one, int_type);
836 res = LLVMBuildOr(bld->builder, sign, one, "");
837 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
838 }
839 else
840 {
841 LLVMValueRef minus_one = lp_build_const_vec(type, -1.0);
842 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
843 res = lp_build_select(bld, cond, bld->one, minus_one);
844 }
845
846 /* Handle zero */
847 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
848 res = lp_build_select(bld, cond, bld->zero, res);
849
850 return res;
851 }
852
853
854 /**
855 * Set the sign of float vector 'a' according to 'sign'.
856 * If sign==0, return abs(a).
857 * If sign==1, return -abs(a);
858 * Other values for sign produce undefined results.
859 */
860 LLVMValueRef
861 lp_build_set_sign(struct lp_build_context *bld,
862 LLVMValueRef a, LLVMValueRef sign)
863 {
864 const struct lp_type type = bld->type;
865 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
866 LLVMTypeRef vec_type = lp_build_vec_type(type);
867 LLVMValueRef shift = lp_build_const_int_vec(type, type.width - 1);
868 LLVMValueRef mask = lp_build_const_int_vec(type,
869 ~((unsigned long long) 1 << (type.width - 1)));
870 LLVMValueRef val, res;
871
872 assert(type.floating);
873 assert(lp_check_value(type, a));
874
875 /* val = reinterpret_cast<int>(a) */
876 val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
877 /* val = val & mask */
878 val = LLVMBuildAnd(bld->builder, val, mask, "");
879 /* sign = sign << shift */
880 sign = LLVMBuildShl(bld->builder, sign, shift, "");
881 /* res = val | sign */
882 res = LLVMBuildOr(bld->builder, val, sign, "");
883 /* res = reinterpret_cast<float>(res) */
884 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
885
886 return res;
887 }
888
889
890 /**
891 * Convert vector of (or scalar) int to vector of (or scalar) float.
892 */
893 LLVMValueRef
894 lp_build_int_to_float(struct lp_build_context *bld,
895 LLVMValueRef a)
896 {
897 const struct lp_type type = bld->type;
898 LLVMTypeRef vec_type = lp_build_vec_type(type);
899
900 assert(type.floating);
901
902 return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
903 }
904
905
906
907 enum lp_build_round_sse41_mode
908 {
909 LP_BUILD_ROUND_SSE41_NEAREST = 0,
910 LP_BUILD_ROUND_SSE41_FLOOR = 1,
911 LP_BUILD_ROUND_SSE41_CEIL = 2,
912 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
913 };
914
915
916 static INLINE LLVMValueRef
917 lp_build_round_sse41(struct lp_build_context *bld,
918 LLVMValueRef a,
919 enum lp_build_round_sse41_mode mode)
920 {
921 const struct lp_type type = bld->type;
922 LLVMTypeRef vec_type = lp_build_vec_type(type);
923 const char *intrinsic;
924
925 assert(type.floating);
926 assert(type.width*type.length == 128);
927 assert(lp_check_value(type, a));
928 assert(util_cpu_caps.has_sse4_1);
929
930 switch(type.width) {
931 case 32:
932 intrinsic = "llvm.x86.sse41.round.ps";
933 break;
934 case 64:
935 intrinsic = "llvm.x86.sse41.round.pd";
936 break;
937 default:
938 assert(0);
939 return bld->undef;
940 }
941
942 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
943 LLVMConstInt(LLVMInt32Type(), mode, 0));
944 }
945
946
947 /**
948 * Return the integer part of a float (vector) value. The returned value is
949 * a float (vector).
950 * Ex: trunc(-1.5) = -1.0
951 */
952 LLVMValueRef
953 lp_build_trunc(struct lp_build_context *bld,
954 LLVMValueRef a)
955 {
956 const struct lp_type type = bld->type;
957
958 assert(type.floating);
959 assert(lp_check_value(type, a));
960
961 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
962 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
963 else {
964 LLVMTypeRef vec_type = lp_build_vec_type(type);
965 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
966 LLVMValueRef res;
967 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
968 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
969 return res;
970 }
971 }
972
973
974 /**
975 * Return float (vector) rounded to nearest integer (vector). The returned
976 * value is a float (vector).
977 * Ex: round(0.9) = 1.0
978 * Ex: round(-1.5) = -2.0
979 */
980 LLVMValueRef
981 lp_build_round(struct lp_build_context *bld,
982 LLVMValueRef a)
983 {
984 const struct lp_type type = bld->type;
985
986 assert(type.floating);
987 assert(lp_check_value(type, a));
988
989 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
990 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
991 else {
992 LLVMTypeRef vec_type = lp_build_vec_type(type);
993 LLVMValueRef res;
994 res = lp_build_iround(bld, a);
995 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
996 return res;
997 }
998 }
999
1000
1001 /**
1002 * Return floor of float (vector), result is a float (vector)
1003 * Ex: floor(1.1) = 1.0
1004 * Ex: floor(-1.1) = -2.0
1005 */
1006 LLVMValueRef
1007 lp_build_floor(struct lp_build_context *bld,
1008 LLVMValueRef a)
1009 {
1010 const struct lp_type type = bld->type;
1011
1012 assert(type.floating);
1013 assert(lp_check_value(type, a));
1014
1015 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1016 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1017 else {
1018 LLVMTypeRef vec_type = lp_build_vec_type(type);
1019 LLVMValueRef res;
1020 res = lp_build_ifloor(bld, a);
1021 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1022 return res;
1023 }
1024 }
1025
1026
1027 /**
1028 * Return ceiling of float (vector), returning float (vector).
1029 * Ex: ceil( 1.1) = 2.0
1030 * Ex: ceil(-1.1) = -1.0
1031 */
1032 LLVMValueRef
1033 lp_build_ceil(struct lp_build_context *bld,
1034 LLVMValueRef a)
1035 {
1036 const struct lp_type type = bld->type;
1037
1038 assert(type.floating);
1039 assert(lp_check_value(type, a));
1040
1041 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
1042 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1043 else {
1044 LLVMTypeRef vec_type = lp_build_vec_type(type);
1045 LLVMValueRef res;
1046 res = lp_build_iceil(bld, a);
1047 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
1048 return res;
1049 }
1050 }
1051
1052
1053 /**
1054 * Return fractional part of 'a' computed as a - floor(a)
1055 * Typically used in texture coord arithmetic.
1056 */
1057 LLVMValueRef
1058 lp_build_fract(struct lp_build_context *bld,
1059 LLVMValueRef a)
1060 {
1061 assert(bld->type.floating);
1062 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1063 }
1064
1065
1066 /**
1067 * Return the integer part of a float (vector) value. The returned value is
1068 * an integer (vector).
1069 * Ex: itrunc(-1.5) = -1
1070 */
1071 LLVMValueRef
1072 lp_build_itrunc(struct lp_build_context *bld,
1073 LLVMValueRef a)
1074 {
1075 const struct lp_type type = bld->type;
1076 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1077
1078 assert(type.floating);
1079 assert(lp_check_value(type, a));
1080
1081 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
1082 }
1083
1084
1085 /**
1086 * Return float (vector) rounded to nearest integer (vector). The returned
1087 * value is an integer (vector).
1088 * Ex: iround(0.9) = 1
1089 * Ex: iround(-1.5) = -2
1090 */
1091 LLVMValueRef
1092 lp_build_iround(struct lp_build_context *bld,
1093 LLVMValueRef a)
1094 {
1095 const struct lp_type type = bld->type;
1096 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1097 LLVMValueRef res;
1098
1099 assert(type.floating);
1100
1101 assert(lp_check_value(type, a));
1102
1103 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1104 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1105 }
1106 else {
1107 LLVMTypeRef vec_type = lp_build_vec_type(type);
1108 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1109 LLVMValueRef sign;
1110 LLVMValueRef half;
1111
1112 /* get sign bit */
1113 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1114 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1115
1116 /* sign * 0.5 */
1117 half = lp_build_const_vec(type, 0.5);
1118 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
1119 half = LLVMBuildOr(bld->builder, sign, half, "");
1120 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
1121
1122 res = LLVMBuildFAdd(bld->builder, a, half, "");
1123 }
1124
1125 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
1126
1127 return res;
1128 }
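/*
 * A scalar sketch of the non-SSE4.1 path above:
 *
 *    iround(a) = (int)(a + copysign(0.5, a))
 *
 * The sign bit of 'a' is OR'ed into the 0.5 constant, so 0.5 is added for
 * positive inputs and -0.5 for negative ones before truncating toward zero,
 * which rounds halfway cases away from zero.
 */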
1129
1130
1131 /**
1132 * Return floor of float (vector), result is an int (vector)
1133 * Ex: ifloor(1.1) = 1
1134 * Ex: ifloor(-1.1) = -2
1135 */
1136 LLVMValueRef
1137 lp_build_ifloor(struct lp_build_context *bld,
1138 LLVMValueRef a)
1139 {
1140 const struct lp_type type = bld->type;
1141 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1142 LLVMValueRef res;
1143
1144 assert(type.floating);
1145 assert(lp_check_value(type, a));
1146
1147 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1148 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1149 }
1150 else {
1151 /* Take the sign bit and add it to 1 constant */
1152 LLVMTypeRef vec_type = lp_build_vec_type(type);
1153 unsigned mantissa = lp_mantissa(type);
1154 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1155 LLVMValueRef sign;
1156 LLVMValueRef offset;
1157
1158 /* sign = a < 0 ? ~0 : 0 */
1159 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1160 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1161 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
1162
1163 /* offset = -0.99999(9)f */
1164 offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1165 offset = LLVMConstBitCast(offset, int_vec_type);
1166
1167 /* offset = a < 0 ? offset : 0.0f */
1168 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1169 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
1170
1171 res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
1172 }
1173
1174 /* round to nearest (toward zero) */
1175 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
1176
1177 return res;
1178 }
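/*
 * A scalar sketch of the non-SSE4.1 path above, where offset ~= -0.999999:
 *
 *    ifloor(a) = (int)(a < 0 ? a + offset : a)
 *
 * For a negative non-integer the offset pushes the value just past the next
 * lower integer before the truncating conversion, e.g. -1.1 -> -2.099999 ->
 * -2, while a negative integer still truncates back to itself, e.g.
 * -2.0 -> -2.999999 -> -2.
 */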
1179
1180
1181 /**
1182 * Return ceiling of float (vector), returning int (vector).
1183 * Ex: iceil( 1.1) = 2
1184 * Ex: iceil(-1.1) = -1
1185 */
1186 LLVMValueRef
1187 lp_build_iceil(struct lp_build_context *bld,
1188 LLVMValueRef a)
1189 {
1190 const struct lp_type type = bld->type;
1191 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1192 LLVMValueRef res;
1193
1194 assert(type.floating);
1195 assert(lp_check_value(type, a));
1196
1197 if (util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
1198 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1199 }
1200 else {
1201 LLVMTypeRef vec_type = lp_build_vec_type(type);
1202 unsigned mantissa = lp_mantissa(type);
1203 LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
1204 LLVMValueRef sign;
1205 LLVMValueRef offset;
1206
1207 /* sign = a < 0 ? 0 : ~0 */
1208 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
1209 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
1210 sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
1211 sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
1212
1213 /* offset = 0.99999(9)f */
1214 offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1215 offset = LLVMConstBitCast(offset, int_vec_type);
1216
1217 /* offset = a < 0 ? 0.0 : offset */
1218 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
1219 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
1220
1221 res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
1222 }
1223
1224 /* round to nearest (toward zero) */
1225 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
1226
1227 return res;
1228 }
1229
1230
1231 LLVMValueRef
1232 lp_build_sqrt(struct lp_build_context *bld,
1233 LLVMValueRef a)
1234 {
1235 const struct lp_type type = bld->type;
1236 LLVMTypeRef vec_type = lp_build_vec_type(type);
1237 char intrinsic[32];
1238
1239 assert(lp_check_value(type, a));
1240
1241 /* TODO: optimize the constant case */
1243
1244 assert(type.floating);
1245 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1246
1247 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1248 }
1249
1250
1251 LLVMValueRef
1252 lp_build_rcp(struct lp_build_context *bld,
1253 LLVMValueRef a)
1254 {
1255 const struct lp_type type = bld->type;
1256
1257 assert(lp_check_value(type, a));
1258
1259 if(a == bld->zero)
1260 return bld->undef;
1261 if(a == bld->one)
1262 return bld->one;
1263 if(a == bld->undef)
1264 return bld->undef;
1265
1266 assert(type.floating);
1267
1268 if(LLVMIsConstant(a))
1269 return LLVMConstFDiv(bld->one, a);
1270
1271 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1272 /*
1273 * XXX: Added precision is not always necessary, so only enable this
1274 * when we have a better system in place to track minimum precision.
1275 */
1276
1277 #if 1
1278 /*
1279 * Do one Newton-Raphson step to improve precision:
1280 *
1281 * x1 = (2 - a * rcp(a)) * rcp(a)
1282 */
1283
1284 LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
1285 LLVMValueRef rcp_a;
1286 LLVMValueRef res;
1287
1288 rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1289
1290 res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
1291 res = LLVMBuildFSub(bld->builder, two, res, "");
1292 res = LLVMBuildFMul(bld->builder, res, rcp_a, "");
1293
1294 return res;
1295 #else
1296 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
1297 #endif
1298 }
1299
1300 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
1301 }
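/*
 * Why a single Newton-Raphson step is enough: if rcp(a) = (1 + e)/a with
 * relative error e (|e| <= 1.5 * 2^-12 for SSE rcpps), then
 *
 *    x1 = (2 - a*rcp(a)) * rcp(a)
 *       = (2 - (1 + e)) * (1 + e)/a
 *       = (1 - e^2)/a
 *
 * i.e. the relative error is squared, from roughly 2^-12 to roughly 2^-24,
 * close to full single precision.
 */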
1302
1303
1304 /**
1305 * Generate 1/sqrt(a)
1306 */
1307 LLVMValueRef
1308 lp_build_rsqrt(struct lp_build_context *bld,
1309 LLVMValueRef a)
1310 {
1311 const struct lp_type type = bld->type;
1312
1313 assert(lp_check_value(type, a));
1314
1315 assert(type.floating);
1316
1317 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
1318 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
1319
1320 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1321 }
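/*
 * Note rsqrtps has the same ~12 bit accuracy as rcpps. Should more precision
 * be needed here one day, a single Newton-Raphson step would refine it
 * (a sketch, not currently emitted):
 *
 *    x1 = rsqrt(a) * (1.5 - 0.5 * a * rsqrt(a) * rsqrt(a))
 */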
1322
1323
1324 static inline LLVMValueRef
1325 lp_build_const_v4si(unsigned long value)
1326 {
1327 LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
1328 LLVMValueRef elements[4] = { element, element, element, element };
1329 return LLVMConstVector(elements, 4);
1330 }
1331
1332 static inline LLVMValueRef
1333 lp_build_const_v4sf(float value)
1334 {
1335 LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
1336 LLVMValueRef elements[4] = { element, element, element, element };
1337 return LLVMConstVector(elements, 4);
1338 }
1339
1340
1341 /**
1342 * Generate sin(a) using SSE2
1343 */
1344 LLVMValueRef
1345 lp_build_sin(struct lp_build_context *bld,
1346 LLVMValueRef a)
1347 {
1348 struct lp_type int_type = lp_int_type(bld->type);
1349 LLVMBuilderRef b = bld->builder;
1350 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1351 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1352
1353 /*
1354 * take the absolute value,
1355 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1356 */
1357
1358 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1359 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1360
1361 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1362 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1363
1364 /*
1365 * extract the sign bit (upper one)
1366 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1367 */
1368 LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
1369 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1370
1371 /*
1372 * scale by 4/Pi
1373 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1374 */
1375
1376 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1377 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1378
1379 /*
1380 * store the integer part of y in mm0
1381 * emm2 = _mm_cvttps_epi32(y);
1382 */
1383
1384 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1385
1386 /*
1387 * j=(j+1) & (~1) (see the cephes sources)
1388 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1389 */
1390
1391 LLVMValueRef all_one = lp_build_const_v4si(1);
1392 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1393 /*
1394 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1395 */
1396 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1397 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1398
1399 /*
1400 * y = _mm_cvtepi32_ps(emm2);
1401 */
1402 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1403
1404 /* get the swap sign flag
1405 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1406 */
1407 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1408 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1409
1410 /*
1411 * emm2 = _mm_slli_epi32(emm0, 29);
1412 */
1413 LLVMValueRef const_29 = lp_build_const_v4si(29);
1414 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1415
1416 /*
1417 * get the polynomial selection mask
1418 * there is one polynomial for 0 <= x <= Pi/4
1419 * and another one for Pi/4 < x <= Pi/2
1420 * Both branches will be computed.
1421 *
1422 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1423 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1424 */
1425
1426 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1427 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1428 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1429 emm2_3, lp_build_const_v4si(0));
1430 /*
1431 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1432 */
1433 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1434
1435 /*
1436 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1437 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1438 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1439 */
1440 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1441 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1442 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1443
1444 /*
1445 * The magic pass: "Extended precision modular arithmetic"
1446 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1447 * xmm1 = _mm_mul_ps(y, xmm1);
1448 * xmm2 = _mm_mul_ps(y, xmm2);
1449 * xmm3 = _mm_mul_ps(y, xmm3);
1450 */
1451 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1452 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1453 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1454
1455 /*
1456 * x = _mm_add_ps(x, xmm1);
1457 * x = _mm_add_ps(x, xmm2);
1458 * x = _mm_add_ps(x, xmm3);
1459 */
1460
1461 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1462 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1463 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1464
1465 /*
1466 * Evaluate the first polynomial (0 <= x <= Pi/4)
1467 *
1468 * z = _mm_mul_ps(x,x);
1469 */
1470 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1471
1472 /*
1473 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1474 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1475 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1476 */
1477 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1478 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1479 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1480
1481 /*
1482 * y = *(v4sf*)_ps_coscof_p0;
1483 * y = _mm_mul_ps(y, z);
1484 */
1485 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1486 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1487 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1488 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1489 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1490 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1491
1492
1493 /*
1494 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1495 * y = _mm_sub_ps(y, tmp);
1496 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1497 */
1498 LLVMValueRef half = lp_build_const_v4sf(0.5);
1499 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1500 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1501 LLVMValueRef one = lp_build_const_v4sf(1.0);
1502 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1503
1504 /*
1505 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1506 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1507 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1508 */
1509 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1510 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1511 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1512
1513 /*
1514 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1515 *
1516 * y2 = *(v4sf*)_ps_sincof_p0;
1517 * y2 = _mm_mul_ps(y2, z);
1518 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1519 * y2 = _mm_mul_ps(y2, z);
1520 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1521 * y2 = _mm_mul_ps(y2, z);
1522 * y2 = _mm_mul_ps(y2, x);
1523 * y2 = _mm_add_ps(y2, x);
1524 */
1525
1526 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1527 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1528 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1529 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1530 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1531 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1532 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1533
1534 /*
1535 * select the correct result from the two polynomials
1536 * xmm3 = poly_mask;
1537 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1538 * y = _mm_andnot_ps(xmm3, y);
1539 * y = _mm_add_ps(y,y2);
1540 */
1541 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1542 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1543 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1544 LLVMValueRef inv = lp_build_const_v4si(~0);
1545 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1546 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1547 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1548
1549 /*
1550 * update the sign
1551 * y = _mm_xor_ps(y, sign_bit);
1552 */
1553 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1554 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1555 return y_result;
1556 }
1557
1558
1559 /**
1560 * Generate cos(a) using SSE2
1561 */
1562 LLVMValueRef
1563 lp_build_cos(struct lp_build_context *bld,
1564 LLVMValueRef a)
1565 {
1566 struct lp_type int_type = lp_int_type(bld->type);
1567 LLVMBuilderRef b = bld->builder;
1568 LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
1569 LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
1570
1571 /*
1572 * take the absolute value,
1573 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1574 */
1575
1576 LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
1577 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
1578
1579 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1580 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
1581
1582 /*
1583 * scale by 4/Pi
1584 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1585 */
1586
1587 LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
1588 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1589
1590 /*
1591 * store the integer part of y in mm0
1592 * emm2 = _mm_cvttps_epi32(y);
1593 */
1594
1595 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
1596
1597 /*
1598 * j=(j+1) & (~1) (see the cephes sources)
1599 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1600 */
1601
1602 LLVMValueRef all_one = lp_build_const_v4si(1);
1603 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1604 /*
1605 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1606 */
1607 LLVMValueRef inv_one = lp_build_const_v4si(~1);
1608 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1609
1610 /*
1611 * y = _mm_cvtepi32_ps(emm2);
1612 */
1613 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
1614
1615
1616 /*
1617 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1618 */
1619 LLVMValueRef const_2 = lp_build_const_v4si(2);
1620 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1621
1622
1623 /* get the swap sign flag
1624 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1625 */
1626 LLVMValueRef inv = lp_build_const_v4si(~0);
1627 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1628 LLVMValueRef pi32_4 = lp_build_const_v4si(4);
1629 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1630
1631 /*
1632 * emm2 = _mm_slli_epi32(emm0, 29);
1633 */
1634 LLVMValueRef const_29 = lp_build_const_v4si(29);
1635 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1636
1637 /*
1638 * get the polynomial selection mask
1639 * there is one polynomial for 0 <= x <= Pi/4
1640 * and another one for Pi/4 < x <= Pi/2
1641 * Both branches will be computed.
1642 *
1643 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1644 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1645 */
1646
1647 LLVMValueRef pi32_2 = lp_build_const_v4si(2);
1648 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1649 LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
1650 emm2_3, lp_build_const_v4si(0));
1651
1652 /*
1653 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1654 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1655 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1656 */
1657 LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
1658 LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
1659 LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
1660
1661 /*
1662 * The magic pass: "Extended precision modular arithmetic"
1663 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1664 * xmm1 = _mm_mul_ps(y, xmm1);
1665 * xmm2 = _mm_mul_ps(y, xmm2);
1666 * xmm3 = _mm_mul_ps(y, xmm3);
1667 */
1668 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1669 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1670 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1671
1672 /*
1673 * x = _mm_add_ps(x, xmm1);
1674 * x = _mm_add_ps(x, xmm2);
1675 * x = _mm_add_ps(x, xmm3);
1676 */
1677
1678 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1679 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1680 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1681
1682 /*
1683 * Evaluate the first polynomial (0 <= x <= Pi/4)
1684 *
1685 * z = _mm_mul_ps(x,x);
1686 */
1687 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1688
1689 /*
1690 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1691 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1692 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1693 */
1694 LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
1695 LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
1696 LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
1697
1698 /*
1699 * y = *(v4sf*)_ps_coscof_p0;
1700 * y = _mm_mul_ps(y, z);
1701 */
1702 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1703 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1704 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1705 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1706 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1707 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1708
1709
1710 /*
1711 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1712 * y = _mm_sub_ps(y, tmp);
1713 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1714 */
1715 LLVMValueRef half = lp_build_const_v4sf(0.5);
1716 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1717 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
1718 LLVMValueRef one = lp_build_const_v4sf(1.0);
1719 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
1720
1721 /*
1722 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1723 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1724 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1725 */
1726 LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
1727 LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
1728 LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
1729
1730 /*
1731 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1732 *
1733 * y2 = *(v4sf*)_ps_sincof_p0;
1734 * y2 = _mm_mul_ps(y2, z);
1735 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1736 * y2 = _mm_mul_ps(y2, z);
1737 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1738 * y2 = _mm_mul_ps(y2, z);
1739 * y2 = _mm_mul_ps(y2, x);
1740 * y2 = _mm_add_ps(y2, x);
1741 */
1742
1743 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1744 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1745 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1746 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1747 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1748 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1749 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1750
1751 /*
1752 * select the correct result from the two polynomials
1753 * xmm3 = poly_mask;
1754 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1755 * y = _mm_andnot_ps(xmm3, y);
1756 * y = _mm_add_ps(y,y2);
1757 */
1758 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
1759 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
1760 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1761 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1762 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1763 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1764
1765 /*
1766 * update the sign
1767 * y = _mm_xor_ps(y, sign_bit);
1768 */
1769 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
1770 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
1771 return y_result;
1772 }
1773
1774
1775 /**
1776 * Generate pow(x, y)
1777 */
1778 LLVMValueRef
1779 lp_build_pow(struct lp_build_context *bld,
1780 LLVMValueRef x,
1781 LLVMValueRef y)
1782 {
1783 /* TODO: optimize the constant case */
1784 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1785 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1786 __FUNCTION__);
1787
1788 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1789 }
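/*
 * This relies on the identity x^y = 2^(y*log2(x)), which is only defined
 * for x > 0. E.g. pow(2, 10): log2(2) = 1, 1*10 = 10, exp2(10) = 1024.
 */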
1790
1791
1792 /**
1793 * Generate exp(x)
1794 */
1795 LLVMValueRef
1796 lp_build_exp(struct lp_build_context *bld,
1797 LLVMValueRef x)
1798 {
1799 /* log2(e) = 1/log(2) */
1800 LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
1801
1802 assert(lp_check_value(bld->type, x));
1803
1804 /* e^x = 2^(x * log2(e)) */
1805 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
1805 }
1806
1807
1808 /**
1809 * Generate log(x)
1810 */
1811 LLVMValueRef
1812 lp_build_log(struct lp_build_context *bld,
1813 LLVMValueRef x)
1814 {
1815 /* log(2) */
1816 LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
1817
1818 assert(lp_check_value(bld->type, x));
1819
1820 /* log(x) = log(2) * log2(x) */
1821 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
1821 }
1822
1823
1824 #define EXP_POLY_DEGREE 3
1825 #define LOG_POLY_DEGREE 5
1826
1827
1828 /**
1829 * Generate polynomial.
1830 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1831 */
1832 static LLVMValueRef
1833 lp_build_polynomial(struct lp_build_context *bld,
1834 LLVMValueRef x,
1835 const double *coeffs,
1836 unsigned num_coeffs)
1837 {
1838 const struct lp_type type = bld->type;
1839 LLVMValueRef res = NULL;
1840 unsigned i;
1841
1842 assert(lp_check_value(bld->type, x));
1843
1844 /* TODO: optimize the constant case */
1845 if(LLVMIsConstant(x))
1846 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1847 __FUNCTION__);
1848
1849 for (i = num_coeffs; i--; ) {
1850 LLVMValueRef coeff;
1851
1852 coeff = lp_build_const_vec(type, coeffs[i]);
1853
1854 if(res)
1855 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1856 else
1857 res = coeff;
1858 }
1859
1860 if(res)
1861 return res;
1862 else
1863 return bld->undef;
1864 }
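/*
 * The loop above evaluates the polynomial in Horner form, one multiply and
 * one add per coefficient:
 *
 *    c0 + x*(c1 + x*(c2 + ...))
 *
 * or, as a scalar sketch:
 *
 *    res = coeffs[num_coeffs - 1];
 *    for (i = num_coeffs - 1; i--; )
 *       res = coeffs[i] + x*res;
 */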
1865
1866
1867 /**
1868 * Minimax polynomial fit of 2**x, in range [0, 1[
1869 */
1870 const double lp_build_exp2_polynomial[] = {
1871 #if EXP_POLY_DEGREE == 5
1872 0.999999999690134838155,
1873 0.583974334321735217258,
1874 0.164553105719676828492,
1875 0.0292811063701710962255,
1876 0.00354944426657875141846,
1877 0.000296253726543423377365
1878 #elif EXP_POLY_DEGREE == 4
1879 1.00000001502262084505,
1880 0.563586057338685991394,
1881 0.150436017652442413623,
1882 0.0243220604213317927308,
1883 0.0025359088446580436489
1884 #elif EXP_POLY_DEGREE == 3
1885 0.999925218562710312959,
1886 0.695833540494823811697,
1887 0.226067155427249155588,
1888 0.0780245226406372992967
1889 #elif EXP_POLY_DEGREE == 2
1890 1.00172476321474503578,
1891 0.657636275736077639316,
1892 0.33718943461968720704
1893 #else
1894 #error
1895 #endif
1896 };
1897
1898
1899 void
1900 lp_build_exp2_approx(struct lp_build_context *bld,
1901 LLVMValueRef x,
1902 LLVMValueRef *p_exp2_int_part,
1903 LLVMValueRef *p_frac_part,
1904 LLVMValueRef *p_exp2)
1905 {
1906 const struct lp_type type = bld->type;
1907 LLVMTypeRef vec_type = lp_build_vec_type(type);
1908 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1909 LLVMValueRef ipart = NULL;
1910 LLVMValueRef fpart = NULL;
1911 LLVMValueRef expipart = NULL;
1912 LLVMValueRef expfpart = NULL;
1913 LLVMValueRef res = NULL;
1914
1915 assert(lp_check_value(bld->type, x));
1916
1917 if(p_exp2_int_part || p_frac_part || p_exp2) {
1918 /* TODO: optimize the constant case */
1919 if(LLVMIsConstant(x))
1920 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1921 __FUNCTION__);
1922
1923 assert(type.floating && type.width == 32);
1924
1925 x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0));
1926 x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
1927
1928 /* ipart = floor(x) */
1929 ipart = lp_build_floor(bld, x);
1930
1931 /* fpart = x - ipart */
1932 fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
1933 }
1934
1935 if(p_exp2_int_part || p_exp2) {
1936 /* expipart = (float) (1 << ipart) */
1937 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1938 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
1939 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
1940 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1941 }
1942
1943 if(p_exp2) {
1944 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1945 Elements(lp_build_exp2_polynomial));
1946
1947 res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
1948 }
1949
1950 if(p_exp2_int_part)
1951 *p_exp2_int_part = expipart;
1952
1953 if(p_frac_part)
1954 *p_frac_part = fpart;
1955
1956 if(p_exp2)
1957 *p_exp2 = res;
1958 }
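/*
 * The integer part is turned into a power of two by constructing the float
 * bit pattern directly, biasing the exponent and shifting it into place:
 *
 *    2^i = bitcast<float>((i + 127) << 23)
 *
 * e.g. i = 3 gives (3 + 127) << 23 = 0x41000000 = 8.0f. The result is then
 * multiplied by the polynomial approximation of 2^fpart.
 */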
1959
1960
1961 LLVMValueRef
1962 lp_build_exp2(struct lp_build_context *bld,
1963 LLVMValueRef x)
1964 {
1965 LLVMValueRef res;
1966 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1967 return res;
1968 }
1969
1970
1971 /**
1972 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1973 * These coefficients can be generated with
1974 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1975 */
1976 const double lp_build_log2_polynomial[] = {
1977 #if LOG_POLY_DEGREE == 6
1978 3.11578814719469302614,
1979 -3.32419399085241980044,
1980 2.59883907202499966007,
1981 -1.23152682416275988241,
1982 0.318212422185251071475,
1983 -0.0344359067839062357313
1984 #elif LOG_POLY_DEGREE == 5
1985 2.8882704548164776201,
1986 -2.52074962577807006663,
1987 1.48116647521213171641,
1988 -0.465725644288844778798,
1989 0.0596515482674574969533
1990 #elif LOG_POLY_DEGREE == 4
1991 2.61761038894603480148,
1992 -1.75647175389045657003,
1993 0.688243882994381274313,
1994 -0.107254423828329604454
1995 #elif LOG_POLY_DEGREE == 3
1996 2.28330284476918490682,
1997 -1.04913055217340124191,
1998 0.204446009836232697516
1999 #else
2000 #error
2001 #endif
2002 };
2003
2004
2005 /**
2006 * See http://www.devmaster.net/forums/showthread.php?p=43580
2007 */
2008 void
2009 lp_build_log2_approx(struct lp_build_context *bld,
2010 LLVMValueRef x,
2011 LLVMValueRef *p_exp,
2012 LLVMValueRef *p_floor_log2,
2013 LLVMValueRef *p_log2)
2014 {
2015 const struct lp_type type = bld->type;
2016 LLVMTypeRef vec_type = lp_build_vec_type(type);
2017 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
2018
2019 LLVMValueRef expmask = lp_build_const_int_vec(type, 0x7f800000);
2020 LLVMValueRef mantmask = lp_build_const_int_vec(type, 0x007fffff);
2021 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2022
2023 LLVMValueRef i = NULL;
2024 LLVMValueRef exp = NULL;
2025 LLVMValueRef mant = NULL;
2026 LLVMValueRef logexp = NULL;
2027 LLVMValueRef logmant = NULL;
2028 LLVMValueRef res = NULL;
2029
2030 assert(lp_check_value(bld->type, x));
2031
2032 if(p_exp || p_floor_log2 || p_log2) {
2033 /* TODO: optimize the constant case */
2034 if(LLVMIsConstant(x))
2035 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2036 __FUNCTION__);
2037
2038 assert(type.floating && type.width == 32);
2039
2040 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
2041
2042 /* exp = (float) exponent(x) */
2043 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
2044 }
2045
2046 if(p_floor_log2 || p_log2) {
2047 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_const_int_vec(type, 23), "");
2048 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_const_int_vec(type, 127), "");
2049 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
2050 }
2051
2052 if(p_log2) {
2053 /* mant = (float) mantissa(x) */
2054 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
2055 mant = LLVMBuildOr(bld->builder, mant, one, "");
2056 mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
2057
2058 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2059 Elements(lp_build_log2_polynomial));
2060
2061 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
2062 logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
2063
2064 res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
2065 }
2066
2067 if(p_exp) {
2068 exp = LLVMBuildBitCast(bld->builder, exp, vec_type, "");
2069 *p_exp = exp;
2070 }
2071
2072 if(p_floor_log2)
2073 *p_floor_log2 = logexp;
2074
2075 if(p_log2)
2076 *p_log2 = res;
2077 }
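/*
 * The decomposition above writes x = 2^exp * mant with mant in [1, 2[, so
 *
 *    log2(x) = exp + log2(mant)
 *
 * exp comes from the biased exponent bits ((i >> 23) - 127) and log2(mant)
 * from the minimax polynomial, which is multiplied by (mant - 1) so that
 * log2(1) == 0 holds exactly. E.g. x = 8.0: exp = 3, mant = 1.0, giving
 * log2(8.0) = 3 + 0 = 3.
 */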
2078
2079
2080 LLVMValueRef
2081 lp_build_log2(struct lp_build_context *bld,
2082 LLVMValueRef x)
2083 {
2084 LLVMValueRef res;
2085 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2086 return res;
2087 }