llvmpipe: Don't assume vector is 4 wide in lp_build_sin()/lp_build_cos()
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy, given that we have all the necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_math.h"
51 #include "util/u_string.h"
52 #include "util/u_cpu_detect.h"
53
54 #include "lp_bld_type.h"
55 #include "lp_bld_const.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_intr.h"
58 #include "lp_bld_logic.h"
59 #include "lp_bld_pack.h"
60 #include "lp_bld_debug.h"
61 #include "lp_bld_arit.h"
62
63
64 #define EXP_POLY_DEGREE 5
65
66 #define LOG_POLY_DEGREE 5
67
68
69 /**
70 * Generate min(a, b)
71 * No checks for the special-case values a or b = 0 or 1 are performed.
72 */
73 static LLVMValueRef
74 lp_build_min_simple(struct lp_build_context *bld,
75 LLVMValueRef a,
76 LLVMValueRef b)
77 {
78 LLVMBuilderRef builder = bld->gallivm->builder;
79 const struct lp_type type = bld->type;
80 const char *intrinsic = NULL;
81 LLVMValueRef cond;
82
83 assert(lp_check_value(type, a));
84 assert(lp_check_value(type, b));
85
86 /* TODO: optimize the constant case */
87
88 if(type.width * type.length == 128) {
89 if(type.floating) {
90 if(type.width == 32 && util_cpu_caps.has_sse)
91 intrinsic = "llvm.x86.sse.min.ps";
92 if(type.width == 64 && util_cpu_caps.has_sse2)
93 intrinsic = "llvm.x86.sse2.min.pd";
94 }
95 else {
96 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
97 intrinsic = "llvm.x86.sse2.pminu.b";
98 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
99 intrinsic = "llvm.x86.sse41.pminsb";
100 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
101 intrinsic = "llvm.x86.sse41.pminuw";
102 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
103 intrinsic = "llvm.x86.sse2.pmins.w";
104 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
105 intrinsic = "llvm.x86.sse41.pminud";
106 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
107 intrinsic = "llvm.x86.sse41.pminsd";
108 }
109 }
110
111 if(intrinsic)
112 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
113
114 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
115 return lp_build_select(bld, cond, a, b);
116 }
117
118
119 /**
120 * Generate max(a, b)
121 * No checks for the special-case values a or b = 0 or 1 are performed.
122 */
123 static LLVMValueRef
124 lp_build_max_simple(struct lp_build_context *bld,
125 LLVMValueRef a,
126 LLVMValueRef b)
127 {
128 LLVMBuilderRef builder = bld->gallivm->builder;
129 const struct lp_type type = bld->type;
130 const char *intrinsic = NULL;
131 LLVMValueRef cond;
132
133 assert(lp_check_value(type, a));
134 assert(lp_check_value(type, b));
135
136 /* TODO: optimize the constant case */
137
138 if(type.width * type.length == 128) {
139 if(type.floating) {
140 if(type.width == 32 && util_cpu_caps.has_sse)
141 intrinsic = "llvm.x86.sse.max.ps";
142 if(type.width == 64 && util_cpu_caps.has_sse2)
143 intrinsic = "llvm.x86.sse2.max.pd";
144 }
145 else {
146 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
147 intrinsic = "llvm.x86.sse2.pmaxu.b";
148 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
149 intrinsic = "llvm.x86.sse41.pmaxsb";
150 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
151 intrinsic = "llvm.x86.sse41.pmaxuw";
152 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
153 intrinsic = "llvm.x86.sse2.pmaxs.w";
154 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
155 intrinsic = "llvm.x86.sse41.pmaxud";
156 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
157 intrinsic = "llvm.x86.sse41.pmaxsd";
158 }
159 }
160
161 if(intrinsic)
162 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
163
164 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
165 return lp_build_select(bld, cond, a, b);
166 }
167
168
169 /**
170 * Generate 1 - a, or ~a depending on bld->type.
171 */
172 LLVMValueRef
173 lp_build_comp(struct lp_build_context *bld,
174 LLVMValueRef a)
175 {
176 LLVMBuilderRef builder = bld->gallivm->builder;
177 const struct lp_type type = bld->type;
178
179 assert(lp_check_value(type, a));
180
181 if(a == bld->one)
182 return bld->zero;
183 if(a == bld->zero)
184 return bld->one;
185
186 if(type.norm && !type.floating && !type.fixed && !type.sign) {
187 if(LLVMIsConstant(a))
188 return LLVMConstNot(a);
189 else
190 return LLVMBuildNot(builder, a, "");
191 }
192
193 if(LLVMIsConstant(a))
194 if (type.floating)
195 return LLVMConstFSub(bld->one, a);
196 else
197 return LLVMConstSub(bld->one, a);
198 else
199 if (type.floating)
200 return LLVMBuildFSub(builder, bld->one, a, "");
201 else
202 return LLVMBuildSub(builder, bld->one, a, "");
203 }
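
/*
 * Worked example of the ~a shortcut above (illustration): for an 8-bit
 * unsigned normalized type, 1.0 is represented as 0xff, so
 * 1.0 - a == 0xff - a == ~a for every a in [0x00, 0xff].
 */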
204
205
206 /**
207 * Generate a + b
208 */
209 LLVMValueRef
210 lp_build_add(struct lp_build_context *bld,
211 LLVMValueRef a,
212 LLVMValueRef b)
213 {
214 LLVMBuilderRef builder = bld->gallivm->builder;
215 const struct lp_type type = bld->type;
216 LLVMValueRef res;
217
218 assert(lp_check_value(type, a));
219 assert(lp_check_value(type, b));
220
221 if(a == bld->zero)
222 return b;
223 if(b == bld->zero)
224 return a;
225 if(a == bld->undef || b == bld->undef)
226 return bld->undef;
227
228 if(bld->type.norm) {
229 const char *intrinsic = NULL;
230
231 if(a == bld->one || b == bld->one)
232 return bld->one;
233
234 if(util_cpu_caps.has_sse2 &&
235 type.width * type.length == 128 &&
236 !type.floating && !type.fixed) {
237 if(type.width == 8)
238 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
239 if(type.width == 16)
240 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
241 }
242
243 if(intrinsic)
244 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
245 }
246
247 if(LLVMIsConstant(a) && LLVMIsConstant(b))
248 if (type.floating)
249 res = LLVMConstFAdd(a, b);
250 else
251 res = LLVMConstAdd(a, b);
252 else
253 if (type.floating)
254 res = LLVMBuildFAdd(builder, a, b, "");
255 else
256 res = LLVMBuildAdd(builder, a, b, "");
257
258 /* clamp to ceiling of 1.0 */
259 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
260 res = lp_build_min_simple(bld, res, bld->one);
261
262 /* XXX clamp to floor of -1 or 0??? */
263
264 return res;
265 }
266
267
268 /** Return the scalar sum of the elements of a */
269 LLVMValueRef
270 lp_build_sum_vector(struct lp_build_context *bld,
271 LLVMValueRef a)
272 {
273 LLVMBuilderRef builder = bld->gallivm->builder;
274 const struct lp_type type = bld->type;
275 LLVMValueRef index, res;
276 unsigned i;
277
278 assert(lp_check_value(type, a));
279
280 if (type.length == 1) {
281 return a;
282 }
283
284 assert(!bld->type.norm);
285
286 index = lp_build_const_int32(bld->gallivm, 0);
287 res = LLVMBuildExtractElement(builder, a, index, "");
288
289 for (i = 1; i < type.length; i++) {
290 index = lp_build_const_int32(bld->gallivm, i);
291 if (type.floating)
292 res = LLVMBuildFAdd(builder, res,
293 LLVMBuildExtractElement(builder,
294 a, index, ""),
295 "");
296 else
297 res = LLVMBuildAdd(builder, res,
298 LLVMBuildExtractElement(builder,
299 a, index, ""),
300 "");
301 }
302
303 return res;
304 }
305
306
307 /**
308 * Generate a - b
309 */
310 LLVMValueRef
311 lp_build_sub(struct lp_build_context *bld,
312 LLVMValueRef a,
313 LLVMValueRef b)
314 {
315 LLVMBuilderRef builder = bld->gallivm->builder;
316 const struct lp_type type = bld->type;
317 LLVMValueRef res;
318
319 assert(lp_check_value(type, a));
320 assert(lp_check_value(type, b));
321
322 if(b == bld->zero)
323 return a;
324 if(a == bld->undef || b == bld->undef)
325 return bld->undef;
326 if(a == b)
327 return bld->zero;
328
329 if(bld->type.norm) {
330 const char *intrinsic = NULL;
331
332 if(b == bld->one)
333 return bld->zero;
334
335 if(util_cpu_caps.has_sse2 &&
336 type.width * type.length == 128 &&
337 !type.floating && !type.fixed) {
338 if(type.width == 8)
339 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
340 if(type.width == 16)
341 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
342 }
343
344 if(intrinsic)
345 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
346 }
347
348 if(LLVMIsConstant(a) && LLVMIsConstant(b))
349 if (type.floating)
350 res = LLVMConstFSub(a, b);
351 else
352 res = LLVMConstSub(a, b);
353 else
354 if (type.floating)
355 res = LLVMBuildFSub(builder, a, b, "");
356 else
357 res = LLVMBuildSub(builder, a, b, "");
358
359 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
360 res = lp_build_max_simple(bld, res, bld->zero);
361
362 return res;
363 }
364
365
366 /**
367 * Normalized 8bit multiplication.
368 *
369 * - alpha plus one
370 *
371 * makes the following approximation to the division (Sree)
372 *
373 * a*b/255 ~= (a*(b + 1)) >> 8
374 *
375 * which is the fastest method that satisfies the following OpenGL criteria
376 *
377 * 0*0 = 0 and 255*255 = 255
378 *
379 * - geometric series
380 *
381 * takes the geometric series approximation to the division
382 *
383 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
384 *
385 * in this case just the first two terms to fit in 16bit arithmetic
386 *
387 * t/255 ~= (t + (t >> 8)) >> 8
388 *
389 * note that just by itself it doesn't satisfy the OpenGL criteria, as
390 * 255*255 yields 254, so the special case b = 255 must be accounted for, or
391 * rounding must be used
392 *
393 * - geometric series plus rounding
394 *
395 * when using the geometric series division, instead of truncating the
396 * result, use rounding in the approximation (Jim Blinn)
397 *
398 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
399 *
400 * achieving exact results
401 *
402 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
403 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
404 * @sa Michael Herf, The "double blend trick", May 2000,
405 * http://www.stereopsis.com/doubleblend.html
406 */
407 static LLVMValueRef
408 lp_build_mul_u8n(struct gallivm_state *gallivm,
409 struct lp_type i16_type,
410 LLVMValueRef a, LLVMValueRef b)
411 {
412 LLVMBuilderRef builder = gallivm->builder;
413 LLVMValueRef c8;
414 LLVMValueRef ab;
415
416 assert(!i16_type.floating);
417 assert(lp_check_value(i16_type, a));
418 assert(lp_check_value(i16_type, b));
419
420 c8 = lp_build_const_int_vec(gallivm, i16_type, 8);
421
422 #if 0
423
424 /* a*b/255 ~= (a*(b + 1)) >> 8 */
425 b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, i16_type, 1), "");
426 ab = LLVMBuildMul(builder, a, b, "");
427
428 #else
429
430 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
431 ab = LLVMBuildMul(builder, a, b, "");
432 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
433 ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");
434
435 #endif
436
437 ab = LLVMBuildLShr(builder, ab, c8, "");
438
439 return ab;
440 }
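
/*
 * Scalar reference for the approximations documented above; a sketch for
 * illustration only (mul_u8n_ref is not part of the real code generator;
 * assumes <stdint.h>).
 */
#if 0
static uint8_t
mul_u8n_ref(uint8_t a, uint8_t b)
{
   unsigned t = (unsigned)a * (unsigned)b;
   /* geometric series plus rounding: t/255 ~= (t + (t >> 8) + 0x80) >> 8,
    * e.g. 255*255 -> 255 and 0*0 -> 0, satisfying the OpenGL criteria */
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
}
#endif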
441
442
443 /**
444 * Generate a * b
445 */
446 LLVMValueRef
447 lp_build_mul(struct lp_build_context *bld,
448 LLVMValueRef a,
449 LLVMValueRef b)
450 {
451 LLVMBuilderRef builder = bld->gallivm->builder;
452 const struct lp_type type = bld->type;
453 LLVMValueRef shift;
454 LLVMValueRef res;
455
456 assert(lp_check_value(type, a));
457 assert(lp_check_value(type, b));
458
459 if(a == bld->zero)
460 return bld->zero;
461 if(a == bld->one)
462 return b;
463 if(b == bld->zero)
464 return bld->zero;
465 if(b == bld->one)
466 return a;
467 if(a == bld->undef || b == bld->undef)
468 return bld->undef;
469
470 if(!type.floating && !type.fixed && type.norm) {
471 if(type.width == 8) {
472 struct lp_type i16_type = lp_wider_type(type);
473 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
474
475 lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
476 lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);
477
478 /* PMULLW, PSRLW, PADDW */
479 abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
480 abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);
481
482 ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);
483
484 return ab;
485 }
486
487 /* FIXME */
488 assert(0);
489 }
490
491 if(type.fixed)
492 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
493 else
494 shift = NULL;
495
496 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
497 if (type.floating)
498 res = LLVMConstFMul(a, b);
499 else
500 res = LLVMConstMul(a, b);
501 if(shift) {
502 if(type.sign)
503 res = LLVMConstAShr(res, shift);
504 else
505 res = LLVMConstLShr(res, shift);
506 }
507 }
508 else {
509 if (type.floating)
510 res = LLVMBuildFMul(builder, a, b, "");
511 else
512 res = LLVMBuildMul(builder, a, b, "");
513 if(shift) {
514 if(type.sign)
515 res = LLVMBuildAShr(builder, res, shift, "");
516 else
517 res = LLVMBuildLShr(builder, res, shift, "");
518 }
519 }
520
521 return res;
522 }
523
524
525 /**
526 * Small vector x scale multiplication optimization.
527 */
528 LLVMValueRef
529 lp_build_mul_imm(struct lp_build_context *bld,
530 LLVMValueRef a,
531 int b)
532 {
533 LLVMBuilderRef builder = bld->gallivm->builder;
534 LLVMValueRef factor;
535
536 assert(lp_check_value(bld->type, a));
537
538 if(b == 0)
539 return bld->zero;
540
541 if(b == 1)
542 return a;
543
544 if(b == -1)
545 return lp_build_negate(bld, a);
546
547 if(b == 2 && bld->type.floating)
548 return lp_build_add(bld, a, a);
549
550 if(util_is_power_of_two(b)) {
551 unsigned shift = ffs(b) - 1;
552
553 if(bld->type.floating) {
554 #if 0
555 /*
556 * Power of two multiplication by directly manipulating the mantissa.
557 *
558 * XXX: This might not always be faster; it will introduce a small error
559 * for multiplication by zero, and it will produce wrong results
560 * for Inf and NaN.
561 */
562 unsigned mantissa = lp_mantissa(bld->type);
563 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
564 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->gallivm, bld->type), "");
565 a = LLVMBuildAdd(builder, a, factor, "");
566 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
567 return a;
568 #endif
569 }
570 else {
571 factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
572 return LLVMBuildShl(builder, a, factor, "");
573 }
574 }
575
576 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
577 return lp_build_mul(bld, a, factor);
578 }
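
/*
 * Scalar sketch of the disabled mantissa trick in the #if 0 block above:
 * multiplying a finite, non-zero float by 2^shift is the same as adding
 * shift to its biased exponent field. Illustration only (mul_pow2_ref is
 * a hypothetical name; assumes <stdint.h>; wrong for 0.0, Inf and NaN,
 * as noted above).
 */
#if 0
static float
mul_pow2_ref(float a, unsigned shift)
{
   union { float f; uint32_t i; } v;
   v.f = a;
   v.i += (uint32_t)shift << 23;   /* 23 = mantissa bits of a float */
   return v.f;
}
#endif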
579
580
581 /**
582 * Generate a / b
583 */
584 LLVMValueRef
585 lp_build_div(struct lp_build_context *bld,
586 LLVMValueRef a,
587 LLVMValueRef b)
588 {
589 LLVMBuilderRef builder = bld->gallivm->builder;
590 const struct lp_type type = bld->type;
591
592 assert(lp_check_value(type, a));
593 assert(lp_check_value(type, b));
594
595 if(a == bld->zero)
596 return bld->zero;
597 if(a == bld->one)
598 return lp_build_rcp(bld, b);
599 if(b == bld->zero)
600 return bld->undef;
601 if(b == bld->one)
602 return a;
603 if(a == bld->undef || b == bld->undef)
604 return bld->undef;
605
606 if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
607 if (type.floating)
608 return LLVMConstFDiv(a, b);
609 else if (type.sign)
610 return LLVMConstSDiv(a, b);
611 else
612 return LLVMConstUDiv(a, b);
613 }
614
615 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
616 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
617
618 if (type.floating)
619 return LLVMBuildFDiv(builder, a, b, "");
620 else if (type.sign)
621 return LLVMBuildSDiv(builder, a, b, "");
622 else
623 return LLVMBuildUDiv(builder, a, b, "");
624 }
625
626
627 /**
628 * Linear interpolation -- without any checks.
629 *
630 * @sa http://www.stereopsis.com/doubleblend.html
631 */
632 static INLINE LLVMValueRef
633 lp_build_lerp_simple(struct lp_build_context *bld,
634 LLVMValueRef x,
635 LLVMValueRef v0,
636 LLVMValueRef v1)
637 {
638 LLVMBuilderRef builder = bld->gallivm->builder;
639 LLVMValueRef delta;
640 LLVMValueRef res;
641
642 assert(lp_check_value(bld->type, x));
643 assert(lp_check_value(bld->type, v0));
644 assert(lp_check_value(bld->type, v1));
645
646 delta = lp_build_sub(bld, v1, v0);
647
648 res = lp_build_mul(bld, x, delta);
649
650 res = lp_build_add(bld, v0, res);
651
652 if (bld->type.fixed) {
653 /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
654 * but it will be wrong for other uses. Basically we need a more
655 * powerful lp_type, capable of further distinguishing the values
656 * interpretation from the value storage. */
657 res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
658 }
659
660 return res;
661 }
662
663
664 /**
665 * Linear interpolation.
666 */
667 LLVMValueRef
668 lp_build_lerp(struct lp_build_context *bld,
669 LLVMValueRef x,
670 LLVMValueRef v0,
671 LLVMValueRef v1)
672 {
673 LLVMBuilderRef builder = bld->gallivm->builder;
674 const struct lp_type type = bld->type;
675 LLVMValueRef res;
676
677 assert(lp_check_value(type, x));
678 assert(lp_check_value(type, v0));
679 assert(lp_check_value(type, v1));
680
681 if (type.norm) {
682 struct lp_type wide_type;
683 struct lp_build_context wide_bld;
684 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
685 LLVMValueRef shift;
686
687 assert(type.length >= 2);
688 assert(!type.sign);
689
690 /*
691 * Create a wider type, enough to hold the intermediate result of the
692 * multiplication.
693 */
694 memset(&wide_type, 0, sizeof wide_type);
695 wide_type.fixed = TRUE;
696 wide_type.width = type.width*2;
697 wide_type.length = type.length/2;
698
699 lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
700
701 lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
702 lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
703 lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
704
705 /*
706 * Scale x from [0, 255] to [0, 256]
707 */
708
709 shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);
710
711 xl = lp_build_add(&wide_bld, xl,
712 LLVMBuildAShr(builder, xl, shift, ""));
713 xh = lp_build_add(&wide_bld, xh,
714 LLVMBuildAShr(builder, xh, shift, ""));
715
716 /*
717 * Lerp both halves.
718 */
719
720 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
721 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);
722
723 res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
724 } else {
725 res = lp_build_lerp_simple(bld, x, v0, v1);
726 }
727
728 return res;
729 }
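
/*
 * Scalar sketch of the normalized lerp path above (lerp_u8_ref is an
 * illustrative name only). Scaling x from [0, 255] to [0, 256] makes
 * x = 255 select exactly v1. Relies on arithmetic >> of negative ints.
 */
#if 0
static uint8_t
lerp_u8_ref(uint8_t x, uint8_t v0, uint8_t v1)
{
   int xs = x + (x >> 7);              /* [0, 255] -> [0, 256] */
   int delta = (int)v1 - (int)v0;
   return (uint8_t)(v0 + ((xs * delta) >> 8));
}
#endif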
730
731
732 LLVMValueRef
733 lp_build_lerp_2d(struct lp_build_context *bld,
734 LLVMValueRef x,
735 LLVMValueRef y,
736 LLVMValueRef v00,
737 LLVMValueRef v01,
738 LLVMValueRef v10,
739 LLVMValueRef v11)
740 {
741 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
742 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
743 return lp_build_lerp(bld, y, v0, v1);
744 }
745
746
747 /**
748 * Generate min(a, b)
749 * Do checks for special cases.
750 */
751 LLVMValueRef
752 lp_build_min(struct lp_build_context *bld,
753 LLVMValueRef a,
754 LLVMValueRef b)
755 {
756 assert(lp_check_value(bld->type, a));
757 assert(lp_check_value(bld->type, b));
758
759 if(a == bld->undef || b == bld->undef)
760 return bld->undef;
761
762 if(a == b)
763 return a;
764
765 if(bld->type.norm) {
766 if(a == bld->zero || b == bld->zero)
767 return bld->zero;
768 if(a == bld->one)
769 return b;
770 if(b == bld->one)
771 return a;
772 }
773
774 return lp_build_min_simple(bld, a, b);
775 }
776
777
778 /**
779 * Generate max(a, b)
780 * Do checks for special cases.
781 */
782 LLVMValueRef
783 lp_build_max(struct lp_build_context *bld,
784 LLVMValueRef a,
785 LLVMValueRef b)
786 {
787 assert(lp_check_value(bld->type, a));
788 assert(lp_check_value(bld->type, b));
789
790 if(a == bld->undef || b == bld->undef)
791 return bld->undef;
792
793 if(a == b)
794 return a;
795
796 if(bld->type.norm) {
797 if(a == bld->one || b == bld->one)
798 return bld->one;
799 if(a == bld->zero)
800 return b;
801 if(b == bld->zero)
802 return a;
803 }
804
805 return lp_build_max_simple(bld, a, b);
806 }
807
808
809 /**
810 * Generate clamp(a, min, max)
811 * Do checks for special cases.
812 */
813 LLVMValueRef
814 lp_build_clamp(struct lp_build_context *bld,
815 LLVMValueRef a,
816 LLVMValueRef min,
817 LLVMValueRef max)
818 {
819 assert(lp_check_value(bld->type, a));
820 assert(lp_check_value(bld->type, min));
821 assert(lp_check_value(bld->type, max));
822
823 a = lp_build_min(bld, a, max);
824 a = lp_build_max(bld, a, min);
825 return a;
826 }
827
828
829 /**
830 * Generate abs(a)
831 */
832 LLVMValueRef
833 lp_build_abs(struct lp_build_context *bld,
834 LLVMValueRef a)
835 {
836 LLVMBuilderRef builder = bld->gallivm->builder;
837 const struct lp_type type = bld->type;
838 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
839
840 assert(lp_check_value(type, a));
841
842 if(!type.sign)
843 return a;
844
845 if(type.floating) {
846 /* Mask out the sign bit */
847 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
848 unsigned long long absMask = ~(1ULL << (type.width - 1));
849 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
850 a = LLVMBuildBitCast(builder, a, int_vec_type, "");
851 a = LLVMBuildAnd(builder, a, mask, "");
852 a = LLVMBuildBitCast(builder, a, vec_type, "");
853 return a;
854 }
855
856 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
857 switch(type.width) {
858 case 8:
859 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
860 case 16:
861 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
862 case 32:
863 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
864 }
865 }
866
867 return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
868 }
869
870
871 LLVMValueRef
872 lp_build_negate(struct lp_build_context *bld,
873 LLVMValueRef a)
874 {
875 LLVMBuilderRef builder = bld->gallivm->builder;
876
877 assert(lp_check_value(bld->type, a));
878
879 #if HAVE_LLVM >= 0x0207
880 if (bld->type.floating)
881 a = LLVMBuildFNeg(builder, a, "");
882 else
883 #endif
884 a = LLVMBuildNeg(builder, a, "");
885
886 return a;
887 }
888
889
890 /** Return -1, 0 or +1 depending on the sign of a */
891 LLVMValueRef
892 lp_build_sgn(struct lp_build_context *bld,
893 LLVMValueRef a)
894 {
895 LLVMBuilderRef builder = bld->gallivm->builder;
896 const struct lp_type type = bld->type;
897 LLVMValueRef cond;
898 LLVMValueRef res;
899
900 assert(lp_check_value(type, a));
901
902 /* Handle non-zero case */
903 if(!type.sign) {
904 /* if not zero then sign must be positive */
905 res = bld->one;
906 }
907 else if(type.floating) {
908 LLVMTypeRef vec_type;
909 LLVMTypeRef int_type;
910 LLVMValueRef mask;
911 LLVMValueRef sign;
912 LLVMValueRef one;
913 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
914
915 int_type = lp_build_int_vec_type(bld->gallivm, type);
916 vec_type = lp_build_vec_type(bld->gallivm, type);
917 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
918
919 /* Take the sign bit and or it into the constant 1.0 */
920 sign = LLVMBuildBitCast(builder, a, int_type, "");
921 sign = LLVMBuildAnd(builder, sign, mask, "");
922 one = LLVMConstBitCast(bld->one, int_type);
923 res = LLVMBuildOr(builder, sign, one, "");
924 res = LLVMBuildBitCast(builder, res, vec_type, "");
925 }
926 else
927 {
928 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
929 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
930 res = lp_build_select(bld, cond, bld->one, minus_one);
931 }
932
933 /* Handle zero */
934 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
935 res = lp_build_select(bld, cond, bld->zero, res);
936
937 return res;
938 }
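
/*
 * Scalar sketch of the floating-point path above (sgn_ref is an
 * illustrative name; assumes <math.h>): or the sign bit into the constant
 * 1.0, which is copysignf(1.0f, a), then force the zero case to zero.
 */
#if 0
static float
sgn_ref(float a)
{
   return a == 0.0f ? 0.0f : copysignf(1.0f, a);
}
#endif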
939
940
941 /**
942 * Set the sign of float vector 'a' according to 'sign'.
943 * If sign==0, return abs(a).
944 * If sign==1, return -abs(a);
945 * Other values for sign produce undefined results.
946 */
947 LLVMValueRef
948 lp_build_set_sign(struct lp_build_context *bld,
949 LLVMValueRef a, LLVMValueRef sign)
950 {
951 LLVMBuilderRef builder = bld->gallivm->builder;
952 const struct lp_type type = bld->type;
953 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
954 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
955 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
956 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
957 ~((unsigned long long) 1 << (type.width - 1)));
958 LLVMValueRef val, res;
959
960 assert(type.floating);
961 assert(lp_check_value(type, a));
962
963 /* val = reinterpret_cast<int>(a) */
964 val = LLVMBuildBitCast(builder, a, int_vec_type, "");
965 /* val = val & mask */
966 val = LLVMBuildAnd(builder, val, mask, "");
967 /* sign = sign << shift */
968 sign = LLVMBuildShl(builder, sign, shift, "");
969 /* res = val | sign */
970 res = LLVMBuildOr(builder, val, sign, "");
971 /* res = reinterpret_cast<float>(res) */
972 res = LLVMBuildBitCast(builder, res, vec_type, "");
973
974 return res;
975 }
976
977
978 /**
979 * Convert vector of (or scalar) int to vector of (or scalar) float.
980 */
981 LLVMValueRef
982 lp_build_int_to_float(struct lp_build_context *bld,
983 LLVMValueRef a)
984 {
985 LLVMBuilderRef builder = bld->gallivm->builder;
986 const struct lp_type type = bld->type;
987 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
988
989 assert(type.floating);
990
991 return LLVMBuildSIToFP(builder, a, vec_type, "");
992 }
993
994
995
996 enum lp_build_round_sse41_mode
997 {
998 LP_BUILD_ROUND_SSE41_NEAREST = 0,
999 LP_BUILD_ROUND_SSE41_FLOOR = 1,
1000 LP_BUILD_ROUND_SSE41_CEIL = 2,
1001 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
1002 };
1003
1004
1005 /**
1006 * Helper for SSE4.1's ROUNDxx instructions.
1007 *
1008 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
1009 * result is the even value. That is, rounding 2.5 yields 2.0, not 3.0.
1010 */
1011 static INLINE LLVMValueRef
1012 lp_build_round_sse41(struct lp_build_context *bld,
1013 LLVMValueRef a,
1014 enum lp_build_round_sse41_mode mode)
1015 {
1016 LLVMBuilderRef builder = bld->gallivm->builder;
1017 const struct lp_type type = bld->type;
1018 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1019 const char *intrinsic;
1020 LLVMValueRef res;
1021
1022 assert(type.floating);
1023
1024 assert(lp_check_value(type, a));
1025 assert(util_cpu_caps.has_sse4_1);
1026
1027 if (type.length == 1) {
1028 LLVMTypeRef vec_type;
1029 LLVMValueRef undef;
1030 LLVMValueRef args[3];
1031 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1032
1033 switch(type.width) {
1034 case 32:
1035 intrinsic = "llvm.x86.sse41.round.ss";
1036 break;
1037 case 64:
1038 intrinsic = "llvm.x86.sse41.round.sd";
1039 break;
1040 default:
1041 assert(0);
1042 return bld->undef;
1043 }
1044
1045 vec_type = LLVMVectorType(bld->elem_type, 4);
1046
1047 undef = LLVMGetUndef(vec_type);
1048
1049 args[0] = undef;
1050 args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
1051 args[2] = LLVMConstInt(i32t, mode, 0);
1052
1053 res = lp_build_intrinsic(builder, intrinsic,
1054 vec_type, args, Elements(args));
1055
1056 res = LLVMBuildExtractElement(builder, res, index0, "");
1057 }
1058 else {
1059 assert(type.width*type.length == 128);
1060
1061 switch(type.width) {
1062 case 32:
1063 intrinsic = "llvm.x86.sse41.round.ps";
1064 break;
1065 case 64:
1066 intrinsic = "llvm.x86.sse41.round.pd";
1067 break;
1068 default:
1069 assert(0);
1070 return bld->undef;
1071 }
1072
1073 res = lp_build_intrinsic_binary(builder, intrinsic,
1074 bld->vec_type, a,
1075 LLVMConstInt(i32t, mode, 0));
1076 }
1077
1078 return res;
1079 }
1080
1081
1082 static INLINE LLVMValueRef
1083 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1084 LLVMValueRef a)
1085 {
1086 LLVMBuilderRef builder = bld->gallivm->builder;
1087 const struct lp_type type = bld->type;
1088 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1089 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1090 const char *intrinsic;
1091 LLVMValueRef res;
1092
1093 assert(type.floating);
1094 /* using the double precision conversions is a bit more complicated */
1095 assert(type.width == 32);
1096
1097 assert(lp_check_value(type, a));
1098 assert(util_cpu_caps.has_sse2);
1099
1100 /* This is relying on MXCSR rounding mode, which should always be nearest. */
1101 if (type.length == 1) {
1102 LLVMTypeRef vec_type;
1103 LLVMValueRef undef;
1104 LLVMValueRef arg;
1105 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1106
1107 vec_type = LLVMVectorType(bld->elem_type, 4);
1108
1109 intrinsic = "llvm.x86.sse.cvtss2si";
1110
1111 undef = LLVMGetUndef(vec_type);
1112
1113 arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1114
1115 res = lp_build_intrinsic_unary(builder, intrinsic,
1116 ret_type, arg);
1117 }
1118 else {
1119 assert(type.width*type.length == 128);
1120
1121 intrinsic = "llvm.x86.sse2.cvtps2dq";
1122
1123 res = lp_build_intrinsic_unary(builder, intrinsic,
1124 ret_type, a);
1125 }
1126
1127 return res;
1128 }
1129
1130
1131 /**
1132 * Return the integer part of a float (vector) value (== round toward zero).
1133 * The returned value is a float (vector).
1134 * Ex: trunc(-1.5) = -1.0
1135 */
1136 LLVMValueRef
1137 lp_build_trunc(struct lp_build_context *bld,
1138 LLVMValueRef a)
1139 {
1140 LLVMBuilderRef builder = bld->gallivm->builder;
1141 const struct lp_type type = bld->type;
1142
1143 assert(type.floating);
1144 assert(lp_check_value(type, a));
1145
1146 if (util_cpu_caps.has_sse4_1 &&
1147 (type.length == 1 || type.width*type.length == 128)) {
1148 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
1149 }
1150 else {
1151 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1152 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1153 LLVMValueRef res;
1154 res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
1155 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1156 return res;
1157 }
1158 }
1159
1160
1161 /**
1162 * Return float (vector) rounded to nearest integer (vector). The returned
1163 * value is a float (vector).
1164 * Ex: round(0.9) = 1.0
1165 * Ex: round(-1.5) = -2.0
1166 */
1167 LLVMValueRef
1168 lp_build_round(struct lp_build_context *bld,
1169 LLVMValueRef a)
1170 {
1171 LLVMBuilderRef builder = bld->gallivm->builder;
1172 const struct lp_type type = bld->type;
1173
1174 assert(type.floating);
1175 assert(lp_check_value(type, a));
1176
1177 if (util_cpu_caps.has_sse4_1 &&
1178 (type.length == 1 || type.width*type.length == 128)) {
1179 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1180 }
1181 else {
1182 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1183 LLVMValueRef res;
1184 res = lp_build_iround(bld, a);
1185 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1186 return res;
1187 }
1188 }
1189
1190
1191 /**
1192 * Return floor of float (vector), result is a float (vector)
1193 * Ex: floor(1.1) = 1.0
1194 * Ex: floor(-1.1) = -2.0
1195 */
1196 LLVMValueRef
1197 lp_build_floor(struct lp_build_context *bld,
1198 LLVMValueRef a)
1199 {
1200 LLVMBuilderRef builder = bld->gallivm->builder;
1201 const struct lp_type type = bld->type;
1202
1203 assert(type.floating);
1204 assert(lp_check_value(type, a));
1205
1206 if (util_cpu_caps.has_sse4_1 &&
1207 (type.length == 1 || type.width*type.length == 128)) {
1208 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1209 }
1210 else {
1211 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1212 LLVMValueRef res;
1213 res = lp_build_ifloor(bld, a);
1214 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1215 return res;
1216 }
1217 }
1218
1219
1220 /**
1221 * Return ceiling of float (vector), returning float (vector).
1222 * Ex: ceil( 1.1) = 2.0
1223 * Ex: ceil(-1.1) = -1.0
1224 */
1225 LLVMValueRef
1226 lp_build_ceil(struct lp_build_context *bld,
1227 LLVMValueRef a)
1228 {
1229 LLVMBuilderRef builder = bld->gallivm->builder;
1230 const struct lp_type type = bld->type;
1231
1232 assert(type.floating);
1233 assert(lp_check_value(type, a));
1234
1235 if (util_cpu_caps.has_sse4_1 &&
1236 (type.length == 1 || type.width*type.length == 128)) {
1237 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1238 }
1239 else {
1240 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1241 LLVMValueRef res;
1242 res = lp_build_iceil(bld, a);
1243 res = LLVMBuildSIToFP(builder, res, vec_type, "");
1244 return res;
1245 }
1246 }
1247
1248
1249 /**
1250 * Return fractional part of 'a' computed as a - floor(a)
1251 * Typically used in texture coord arithmetic.
1252 */
1253 LLVMValueRef
1254 lp_build_fract(struct lp_build_context *bld,
1255 LLVMValueRef a)
1256 {
1257 assert(bld->type.floating);
1258 return lp_build_sub(bld, a, lp_build_floor(bld, a));
1259 }
1260
1261
1262 /**
1263 * Return the integer part of a float (vector) value (== round toward zero).
1264 * The returned value is an integer (vector).
1265 * Ex: itrunc(-1.5) = -1
1266 */
1267 LLVMValueRef
1268 lp_build_itrunc(struct lp_build_context *bld,
1269 LLVMValueRef a)
1270 {
1271 LLVMBuilderRef builder = bld->gallivm->builder;
1272 const struct lp_type type = bld->type;
1273 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1274
1275 assert(type.floating);
1276 assert(lp_check_value(type, a));
1277
1278 return LLVMBuildFPToSI(builder, a, int_vec_type, "");
1279 }
1280
1281
1282 /**
1283 * Return float (vector) rounded to nearest integer (vector). The returned
1284 * value is an integer (vector).
1285 * Ex: iround(0.9) = 1
1286 * Ex: iround(-1.5) = -2
1287 */
1288 LLVMValueRef
1289 lp_build_iround(struct lp_build_context *bld,
1290 LLVMValueRef a)
1291 {
1292 LLVMBuilderRef builder = bld->gallivm->builder;
1293 const struct lp_type type = bld->type;
1294 LLVMTypeRef int_vec_type = bld->int_vec_type;
1295 LLVMValueRef res;
1296
1297 assert(type.floating);
1298
1299 assert(lp_check_value(type, a));
1300
1301 if (util_cpu_caps.has_sse2 &&
1302 ((type.width == 32) && (type.length == 1 || type.length == 4))) {
1303 return lp_build_iround_nearest_sse2(bld, a);
1304 }
1305 else if (util_cpu_caps.has_sse4_1 &&
1306 (type.length == 1 || type.width*type.length == 128)) {
1307 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
1308 }
1309 else {
1310 LLVMValueRef half;
1311
1312 half = lp_build_const_vec(bld->gallivm, type, 0.5);
1313
1314 if (type.sign) {
1315 LLVMTypeRef vec_type = bld->vec_type;
1316 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1317 (unsigned long long)1 << (type.width - 1));
1318 LLVMValueRef sign;
1319
1320 /* get sign bit */
1321 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1322 sign = LLVMBuildAnd(builder, sign, mask, "");
1323
1324 /* sign * 0.5 */
1325 half = LLVMBuildBitCast(builder, half, int_vec_type, "");
1326 half = LLVMBuildOr(builder, sign, half, "");
1327 half = LLVMBuildBitCast(builder, half, vec_type, "");
1328 }
1329
1330 res = LLVMBuildFAdd(builder, a, half, "");
1331 }
1332
1333 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
1334
1335 return res;
1336 }
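
/*
 * Scalar equivalent of the non-SSE fallback above (iround_ref is an
 * illustrative name): add 0.5 carrying the argument's sign, then let the
 * float-to-int conversion truncate toward zero.
 */
#if 0
static int
iround_ref(float a)
{
   return (int)(a + (a >= 0.0f ? 0.5f : -0.5f));
}
#endif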
1337
1338
1339 /**
1340 * Return floor of float (vector), result is an int (vector)
1341 * Ex: ifloor(1.1) = 1
1342 * Ex: ifloor(-1.1) = -2
1343 */
1344 LLVMValueRef
1345 lp_build_ifloor(struct lp_build_context *bld,
1346 LLVMValueRef a)
1347 {
1348 LLVMBuilderRef builder = bld->gallivm->builder;
1349 const struct lp_type type = bld->type;
1350 LLVMTypeRef int_vec_type = bld->int_vec_type;
1351 LLVMValueRef res;
1352
1353 assert(type.floating);
1354 assert(lp_check_value(type, a));
1355
1356 if (util_cpu_caps.has_sse4_1 &&
1357 (type.length == 1 || type.width*type.length == 128)) {
1358 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
1359 }
1360 else {
1361 res = a;
1362
1363 if (type.sign) {
1364 /* Extract and replicate the sign bit of a */
1365 LLVMTypeRef vec_type = bld->vec_type;
1366 unsigned mantissa = lp_mantissa(type);
1367 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1368 (unsigned long long)1 << (type.width - 1));
1369 LLVMValueRef sign;
1370 LLVMValueRef offset;
1371
1372 /* sign = a < 0 ? ~0 : 0 */
1373 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1374 sign = LLVMBuildAnd(builder, sign, mask, "");
1375 sign = LLVMBuildAShr(builder, sign,
1376 lp_build_const_int_vec(bld->gallivm, type,
1377 type.width - 1),
1378 "ifloor.sign");
1379
1380 /* offset = -0.99999(9)f */
1381 offset = lp_build_const_vec(bld->gallivm, type,
1382 -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1383 offset = LLVMConstBitCast(offset, int_vec_type);
1384
1385 /* offset = a < 0 ? offset : 0.0f */
1386 offset = LLVMBuildAnd(builder, offset, sign, "");
1387 offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");
1388
1389 res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
1390 }
1391 }
1392
1393 /* truncate (round toward zero) */
1394 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
1395
1396 return res;
1397 }
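
/*
 * Scalar sketch of the fallback path above (ifloor_ref is an illustrative
 * name): biasing negative inputs by an offset just below 1.0 makes the
 * final truncation toward zero behave like floor().
 */
#if 0
static int
ifloor_ref(float a)
{
   if (a < 0.0f)
      a -= 0.99999881f;   /* approximately (2^23 - 10) / 2^23 */
   return (int)a;
}
#endif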
1398
1399
1400 /**
1401 * Return ceiling of float (vector), returning int (vector).
1402 * Ex: iceil( 1.1) = 2
1403 * Ex: iceil(-1.1) = -1
1404 */
1405 LLVMValueRef
1406 lp_build_iceil(struct lp_build_context *bld,
1407 LLVMValueRef a)
1408 {
1409 LLVMBuilderRef builder = bld->gallivm->builder;
1410 const struct lp_type type = bld->type;
1411 LLVMTypeRef int_vec_type = bld->int_vec_type;
1412 LLVMValueRef res;
1413
1414 assert(type.floating);
1415 assert(lp_check_value(type, a));
1416
1417 if (util_cpu_caps.has_sse4_1 &&
1418 (type.length == 1 || type.width*type.length == 128)) {
1419 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
1420 }
1421 else {
1422 LLVMTypeRef vec_type = bld->vec_type;
1423 unsigned mantissa = lp_mantissa(type);
1424 LLVMValueRef offset;
1425
1426 /* offset = 0.99999(9)f */
1427 offset = lp_build_const_vec(bld->gallivm, type,
1428 (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
1429
1430 if (type.sign) {
1431 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1432 (unsigned long long)1 << (type.width - 1));
1433 LLVMValueRef sign;
1434
1435 /* sign = a < 0 ? 0 : ~0 */
1436 sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
1437 sign = LLVMBuildAnd(builder, sign, mask, "");
1438 sign = LLVMBuildAShr(builder, sign,
1439 lp_build_const_int_vec(bld->gallivm, type,
1440 type.width - 1),
1441 "iceil.sign");
1442 sign = LLVMBuildNot(builder, sign, "iceil.not");
1443
1444 /* offset = a < 0 ? 0.0 : offset */
1445 offset = LLVMConstBitCast(offset, int_vec_type);
1446 offset = LLVMBuildAnd(builder, offset, sign, "");
1447 offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
1448 }
1449
1450 res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
1451 }
1452
1453 /* truncate (round toward zero) */
1454 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
1455
1456 return res;
1457 }
1458
1459
1460 /**
1461 * Combined ifloor() & fract().
1462 *
1463 * Preferred to calling the functions separately, as it ensures that the
1464 * strategy (floor() vs ifloor()) that results in less redundant work is used.
1465 */
1466 void
1467 lp_build_ifloor_fract(struct lp_build_context *bld,
1468 LLVMValueRef a,
1469 LLVMValueRef *out_ipart,
1470 LLVMValueRef *out_fpart)
1471 {
1472 LLVMBuilderRef builder = bld->gallivm->builder;
1473 const struct lp_type type = bld->type;
1474 LLVMValueRef ipart;
1475
1476 assert(type.floating);
1477 assert(lp_check_value(type, a));
1478
1479 if (util_cpu_caps.has_sse4_1 &&
1480 (type.length == 1 || type.width*type.length == 128)) {
1481 /*
1482 * floor() is easier.
1483 */
1484
1485 ipart = lp_build_floor(bld, a);
1486 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1487 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
1488 }
1489 else {
1490 /*
1491 * ifloor() is easier.
1492 */
1493
1494 *out_ipart = lp_build_ifloor(bld, a);
1495 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
1496 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
1497 }
1498 }
1499
1500
1501 LLVMValueRef
1502 lp_build_sqrt(struct lp_build_context *bld,
1503 LLVMValueRef a)
1504 {
1505 LLVMBuilderRef builder = bld->gallivm->builder;
1506 const struct lp_type type = bld->type;
1507 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1508 char intrinsic[32];
1509
1510 assert(lp_check_value(type, a));
1511
1512 /* TODO: optimize the constant case */
1514
1515 assert(type.floating);
1516 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
1517
1518 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1519 }
1520
1521
1522 /**
1523 * Do one Newton-Raphson step to improve reciprocal precision:
1524 *
1525 * x_{i+1} = x_i * (2 - a * x_i)
1526 *
1527 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
1528 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
1529 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
1530 * halo. It would be necessary to clamp the argument to prevent this.
1531 *
1532 * See also:
1533 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
1534 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1535 */
1536 static INLINE LLVMValueRef
1537 lp_build_rcp_refine(struct lp_build_context *bld,
1538 LLVMValueRef a,
1539 LLVMValueRef rcp_a)
1540 {
1541 LLVMBuilderRef builder = bld->gallivm->builder;
1542 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
1543 LLVMValueRef res;
1544
1545 res = LLVMBuildFMul(builder, a, rcp_a, "");
1546 res = LLVMBuildFSub(builder, two, res, "");
1547 res = LLVMBuildFMul(builder, rcp_a, res, "");
1548
1549 return res;
1550 }
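
/*
 * Scalar form of the step above (rcp_refine_ref is an illustrative name).
 * Each Newton-Raphson step roughly doubles the number of correct bits, so
 * one step lifts the crude RCPPS estimate close to single precision.
 */
#if 0
static float
rcp_refine_ref(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);
}
#endif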
1551
1552
1553 LLVMValueRef
1554 lp_build_rcp(struct lp_build_context *bld,
1555 LLVMValueRef a)
1556 {
1557 LLVMBuilderRef builder = bld->gallivm->builder;
1558 const struct lp_type type = bld->type;
1559
1560 assert(lp_check_value(type, a));
1561
1562 if(a == bld->zero)
1563 return bld->undef;
1564 if(a == bld->one)
1565 return bld->one;
1566 if(a == bld->undef)
1567 return bld->undef;
1568
1569 assert(type.floating);
1570
1571 if(LLVMIsConstant(a))
1572 return LLVMConstFDiv(bld->one, a);
1573
1574 /*
1575 * We don't use RCPPS because:
1576 * - it only has 10 bits of precision
1577 * - it doesn't even get the reciprocal of 1.0 exactly
1578 * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
1579 * - for recent processors the benefit over DIVPS is marginal, and case
1580 * dependent
1581 *
1582 * We could still use it on certain processors if benchmarks show that
1583 * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
1584 * particular uses that require fewer workarounds.
1585 */
1586
1587 if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1588 const unsigned num_iterations = 0;
1589 LLVMValueRef res;
1590 unsigned i;
1591
1592 res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
1593
1594 for (i = 0; i < num_iterations; ++i) {
1595 res = lp_build_rcp_refine(bld, a, res);
1596 }
1597
1598 return res;
1599 }
1600
1601 return LLVMBuildFDiv(builder, bld->one, a, "");
1602 }
1603
1604
1605 /**
1606 * Do one Newton-Raphson step to improve rsqrt precision:
1607 *
1608 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
1609 *
1610 * See also:
1611 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
1612 */
1613 static INLINE LLVMValueRef
1614 lp_build_rsqrt_refine(struct lp_build_context *bld,
1615 LLVMValueRef a,
1616 LLVMValueRef rsqrt_a)
1617 {
1618 LLVMBuilderRef builder = bld->gallivm->builder;
1619 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
1620 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
1621 LLVMValueRef res;
1622
1623 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
1624 res = LLVMBuildFMul(builder, a, res, "");
1625 res = LLVMBuildFSub(builder, three, res, "");
1626 res = LLVMBuildFMul(builder, rsqrt_a, res, "");
1627 res = LLVMBuildFMul(builder, half, res, "");
1628
1629 return res;
1630 }
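
/*
 * Scalar form of the step above (rsqrt_refine_ref is an illustrative name),
 * the classic Newton-Raphson iteration for 1/sqrt(a).
 */
#if 0
static float
rsqrt_refine_ref(float a, float rsqrt_a)
{
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}
#endif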
1631
1632
1633 /**
1634 * Generate 1/sqrt(a)
1635 */
1636 LLVMValueRef
1637 lp_build_rsqrt(struct lp_build_context *bld,
1638 LLVMValueRef a)
1639 {
1640 LLVMBuilderRef builder = bld->gallivm->builder;
1641 const struct lp_type type = bld->type;
1642
1643 assert(lp_check_value(type, a));
1644
1645 assert(type.floating);
1646
1647 if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
1648 const unsigned num_iterations = 1;
1649 LLVMValueRef res;
1650 unsigned i;
1651
1652 res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
1653
1654 for (i = 0; i < num_iterations; ++i) {
1655 res = lp_build_rsqrt_refine(bld, a, res);
1656 }
1657
1658 return res;
1659 }
1660
1661 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
1662 }
1663
1664
1665 /**
1666 * Generate sin(a) using SSE2
1667 */
1668 LLVMValueRef
1669 lp_build_sin(struct lp_build_context *bld,
1670 LLVMValueRef a)
1671 {
1672 struct gallivm_state *gallivm = bld->gallivm;
1673 LLVMBuilderRef builder = gallivm->builder;
1674 struct lp_type int_type = lp_int_type(bld->type);
1675 LLVMBuilderRef b = builder;
1676
1677 /*
1678 * take the absolute value,
1679 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1680 */
1681
1682 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1683 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1684
1685 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1686 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1687
1688 /*
1689 * extract the sign bit (upper one)
1690 * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
1691 */
1692 LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
1693 LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
1694
1695 /*
1696 * scale by 4/Pi
1697 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1698 */
1699
1700 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1701 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1702
1703 /*
1704 * store the integer part of y in mm0
1705 * emm2 = _mm_cvttps_epi32(y);
1706 */
1707
1708 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1709
1710 /*
1711 * j=(j+1) & (~1) (see the cephes sources)
1712 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1713 */
1714
1715 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
1716 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
1717 /*
1718 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1719 */
1720 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1721 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1722
1723 /*
1724 * y = _mm_cvtepi32_ps(emm2);
1725 */
1726 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1727
1728 /* get the swap sign flag
1729 * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
1730 */
1731 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1732 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
1733
1734 /*
1735 * emm2 = _mm_slli_epi32(emm0, 29);
1736 */
1737 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1738 LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
1739
1740 /*
1741 * get the polynomial selection mask
1742 * there is one polynomial for 0 <= x <= Pi/4
1743 * and another one for Pi/4 < x <= Pi/2
1744 * Both branches will be computed.
1745 *
1746 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1747 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1748 */
1749
1750 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1751 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
1752 LLVMValueRef poly_mask = lp_build_compare(gallivm,
1753 int_type, PIPE_FUNC_EQUAL,
1754 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1755 /*
1756 * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
1757 */
1758 LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
1759
1760 /*
1761 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1762 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1763 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1764 */
1765 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1766 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1767 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1768
1769 /*
1770 * The magic pass: "Extended precision modular arithmetic"
1771 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1772 * xmm1 = _mm_mul_ps(y, xmm1);
1773 * xmm2 = _mm_mul_ps(y, xmm2);
1774 * xmm3 = _mm_mul_ps(y, xmm3);
1775 */
1776 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1777 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1778 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1779
1780 /*
1781 * x = _mm_add_ps(x, xmm1);
1782 * x = _mm_add_ps(x, xmm2);
1783 * x = _mm_add_ps(x, xmm3);
1784 */
1785
1786 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
1787 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
1788 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
1789
1790 /*
1791 * Evaluate the first polynomial (0 <= x <= Pi/4)
1792 *
1793 * z = _mm_mul_ps(x,x);
1794 */
1795 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
1796
1797 /*
1798 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
1799 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
1800 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
1801 */
1802 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
1803 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
1804 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
1805
1806 /*
1807 * y = *(v4sf*)_ps_coscof_p0;
1808 * y = _mm_mul_ps(y, z);
1809 */
1810 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
1811 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
1812 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
1813 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
1814 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
1815 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
1816
1817
1818 /*
1819 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
1820 * y = _mm_sub_ps(y, tmp);
1821 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
1822 */
1823 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
1824 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
1825 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
1826 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
1827 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
1828
1829 /*
1830 * _PS_CONST(sincof_p0, -1.9515295891E-4);
1831 * _PS_CONST(sincof_p1, 8.3321608736E-3);
1832 * _PS_CONST(sincof_p2, -1.6666654611E-1);
1833 */
1834 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
1835 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
1836 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
1837
1838 /*
1839 * Evaluate the second polynomial (Pi/4 <= x <= Pi/2)
1840 *
1841 * y2 = *(v4sf*)_ps_sincof_p0;
1842 * y2 = _mm_mul_ps(y2, z);
1843 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
1844 * y2 = _mm_mul_ps(y2, z);
1845 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
1846 * y2 = _mm_mul_ps(y2, z);
1847 * y2 = _mm_mul_ps(y2, x);
1848 * y2 = _mm_add_ps(y2, x);
1849 */
1850
1851 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
1852 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
1853 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
1854 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
1855 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
1856 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
1857 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
1858
1859 /*
1860 * select the correct result from the two polynomials
1861 * xmm3 = poly_mask;
1862 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
1863 * y = _mm_andnot_ps(xmm3, y);
1864 * y = _mm_add_ps(y,y2);
1865 */
1866 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
1867 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
1868 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
1869 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1870 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
1871 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
1872 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
1873
1874 /*
1875 * update the sign
1876 * y = _mm_xor_ps(y, sign_bit);
1877 */
1878 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
1879 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
1880 return y_result;
1881 }
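
/*
 * A self-contained scalar sketch of the cephes-style reduction implemented
 * above (sin_ref is an illustrative name, not part of the real code; the
 * constants are the same ones built in lp_build_sin(); assumes <math.h>).
 */
#if 0
static float
sin_ref(float x)
{
   int sign = x < 0.0f;
   float ax = fabsf(x);
   float y, xr, z, r;
   int j;

   /* scale by 4/Pi, then j = (j+1) & ~1 as in the cephes sources */
   j = (int)(ax * 1.27323954473516f);
   j = (j + 1) & ~1;
   y = (float)j;

   /* octants 4..7 flip the sign of the result */
   if (j & 4)
      sign = !sign;

   /* extended precision modular arithmetic: xr = ax - j*Pi/4 */
   xr = ((ax - y * 0.78515625f)
         - y * 2.4187564849853515625e-4f)
         - y * 3.77489497744594108e-8f;
   z = xr * xr;

   if (j & 2) {
      /* cosine polynomial, reduced argument in [0, Pi/4] */
      r = ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
           + 4.166664568298827e-2f) * z * z - 0.5f * z + 1.0f;
   }
   else {
      /* sine polynomial, reduced argument in [0, Pi/4] */
      r = ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z
           - 1.6666654611e-1f) * z * xr + xr;
   }

   return sign ? -r : r;
}
#endif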
1882
1883
1884 /**
1885 * Generate cos(a) using SSE2
1886 */
1887 LLVMValueRef
1888 lp_build_cos(struct lp_build_context *bld,
1889 LLVMValueRef a)
1890 {
1891 struct gallivm_state *gallivm = bld->gallivm;
1892 LLVMBuilderRef builder = gallivm->builder;
1893 struct lp_type int_type = lp_int_type(bld->type);
1894 LLVMBuilderRef b = builder;
1895
1896 /*
1897 * take the absolute value,
1898 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1899 */
1900
1901 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1902 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1903
1904 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1905 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1906
1907 /*
1908 * scale by 4/Pi
1909 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1910 */
1911
1912 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1913 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1914
1915 /*
1916 * store the integer part of y in mm0
1917 * emm2 = _mm_cvttps_epi32(y);
1918 */
1919
1920 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1921
1922 /*
1923 * j=(j+1) & (~1) (see the cephes sources)
1924 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1925 */
1926
1927 LLVMValueRef const_1 = lp_build_const_int_vec(gallivm, bld->type, 1);
1928 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, const_1, "emm2_add");
1929 /*
1930 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1931 */
1932 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1933 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1934
1935 /*
1936 * y = _mm_cvtepi32_ps(emm2);
1937 */
1938 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1939
1940
1941 /*
1942 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1943 */
1944 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1945 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
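/*
 * Compared with lp_build_sin(), the octant index is shifted down by 2
 * here (and the sign flag below is computed with an and-not): this is
 * the cos(x) == sin(x + Pi/2) phase shift, expressed on the index that
 * counts Pi/4-wide octants.
 */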
1946
1947
1948 /* get the swap sign flag
1949 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1950 */
1951 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1952 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1953 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1954 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1955
1956 /*
1957 * emm2 = _mm_slli_epi32(emm0, 29);
1958 */
1959 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1960 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
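/*
 * The value 4 sits in bit 2, so shifting left by 29 moves it to bit 31,
 * the sign bit of an IEEE-754 single precision float.
 */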
1961
1962 /*
1963 * get the polynomial selection mask
1964 * there is one polynomial for 0 <= x <= Pi/4
1965 * and another one for Pi/4 < x <= Pi/2
1966 * Both branches will be computed.
1967 *
1968 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1969 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1970 */
1971
1972 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1973 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1974 LLVMValueRef poly_mask = lp_build_compare(gallivm,
1975 int_type, PIPE_FUNC_EQUAL,
1976 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
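/*
 * lp_build_compare() returns a per-lane mask of all ones where the
 * comparison holds and all zeroes elsewhere, regardless of the vector
 * width, so the polynomial selection below is not tied to 4-wide
 * vectors.
 */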
1977
1978 /*
1979 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1980 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1981 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1982 */
1983 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1984 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1985 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1986
1987 /*
1988 * The magic pass: "Extended precision modular arithmetic"
1989 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1990 * xmm1 = _mm_mul_ps(y, xmm1);
1991 * xmm2 = _mm_mul_ps(y, xmm2);
1992 * xmm3 = _mm_mul_ps(y, xmm3);
1993 */
1994 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1995 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1996 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1997
1998 /*
1999 * x = _mm_add_ps(x, xmm1);
2000 * x = _mm_add_ps(x, xmm2);
2001 * x = _mm_add_ps(x, xmm3);
2002 */
2003
2004 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2005 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2006 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
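/*
 * DP1 + DP2 + DP3 is -Pi/4 split into three constants of decreasing
 * magnitude (Cody-Waite style argument reduction), so each y * DPn
 * product and the running subtraction keep extra effective precision.
 */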
2007
2008 /*
2009 * Evaluate the first polynomial (0 <= x <= Pi/4)
2010 *
2011 * z = _mm_mul_ps(x,x);
2012 */
2013 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2014
2015 /*
2016 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2017 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2018 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2019 */
2020 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2021 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2022 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2023
2024 /*
2025 * y = *(v4sf*)_ps_coscof_p0;
2026 * y = _mm_mul_ps(y, z);
 * y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 * y = _mm_mul_ps(y, z);
 * y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 * y = _mm_mul_ps(y, z);
 * y = _mm_mul_ps(y, z);
2027 */
2028 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2029 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2030 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2031 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2032 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2033 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2034
2035
2036 /*
2037 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2038 * y = _mm_sub_ps(y, tmp);
2039 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2040 */
2041 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2042 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2043 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2044 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2045 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2046
2047 /*
2048 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2049 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2050 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2051 */
2052 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2053 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2054 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2055
2056 /*
2057 * Evaluate the second polynomial (0 <= x <= Pi/4)
2058 *
2059 * y2 = *(v4sf*)_ps_sincof_p0;
2060 * y2 = _mm_mul_ps(y2, z);
2061 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2062 * y2 = _mm_mul_ps(y2, z);
2063 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2064 * y2 = _mm_mul_ps(y2, z);
2065 * y2 = _mm_mul_ps(y2, x);
2066 * y2 = _mm_add_ps(y2, x);
2067 */
2068
2069 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2070 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2071 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2072 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2073 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2074 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2075 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2076
2077 /*
2078 * select the correct result from the two polynomials
2079 * xmm3 = poly_mask;
2080 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2081 * y = _mm_andnot_ps(xmm3, y);
2082 * y = _mm_add_ps(y,y2);
2083 */
2084 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2085 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2086 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2087 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2088 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2089 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2090
2091 /*
2092 * update the sign
2093 * y = _mm_xor_ps(y, sign_bit);
2094 */
2095 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_cos");
2096 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2097 return y_result;
2098 }
2099
2100
2101 /**
2102 * Generate pow(x, y)
2103 */
2104 LLVMValueRef
2105 lp_build_pow(struct lp_build_context *bld,
2106 LLVMValueRef x,
2107 LLVMValueRef y)
2108 {
2109 /* TODO: optimize the constant case */
2110 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2111 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2112 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2113 __FUNCTION__);
2114 }
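/*
 * The identity used below is pow(x, y) == exp2(y * log2(x)), which is
 * only meaningful for x > 0; as with GLSL's pow(), other inputs give
 * undefined results.  A hypothetical caller computing x^2.2 (e.g. for
 * gamma correction) might do simply:
 *
 *    LLVMValueRef gamma = lp_build_const_vec(bld->gallivm, bld->type, 2.2);
 *    LLVMValueRef res = lp_build_pow(bld, color, gamma);
 *
 * where `color` stands for any value vector of the matching type (the
 * names are illustrative, not part of this file).
 */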
2115
2116 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2117 }
2118
2119
2120 /**
2121 * Generate exp(x)
2122 */
2123 LLVMValueRef
2124 lp_build_exp(struct lp_build_context *bld,
2125 LLVMValueRef x)
2126 {
2127 /* log2(e) = 1/log(2) */
2128 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2129 1.4426950408889634);
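/* exp(x) == 2^(x * log2(e)), so we can defer to lp_build_exp2() */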
2130
2131 assert(lp_check_value(bld->type, x));
2132
2133 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2134 }
2135
2136
2137 /**
2138 * Generate log(x)
2139 */
2140 LLVMValueRef
2141 lp_build_log(struct lp_build_context *bld,
2142 LLVMValueRef x)
2143 {
2144 /* log(2) */
2145 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2146 0.69314718055994529);
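/* log(x) == log2(x) * log(2), so we can defer to lp_build_log2() */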
2147
2148 assert(lp_check_value(bld->type, x));
2149
2150 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2151 }
2152
2153
2154 /**
2155 * Generate polynomial.
2156 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2157 */
2158 static LLVMValueRef
2159 lp_build_polynomial(struct lp_build_context *bld,
2160 LLVMValueRef x,
2161 const double *coeffs,
2162 unsigned num_coeffs)
2163 {
2164 const struct lp_type type = bld->type;
2165 LLVMValueRef res = NULL;
2166 unsigned i;
2167
2168 assert(lp_check_value(bld->type, x));
2169
2170 /* TODO: optimize the constant case */
2171 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2172 LLVMIsConstant(x)) {
2173 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2174 __FUNCTION__);
2175 }
2176
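/*
 * Evaluate in Horner form, highest coefficient first:
 * c0 + x*(c1 + x*(c2 + ...)), one multiply and one add per coefficient.
 */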
2177 for (i = num_coeffs; i--; ) {
2178 LLVMValueRef coeff;
2179
2180 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2181
2182 if(res)
2183 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
2184 else
2185 res = coeff;
2186 }
2187
2188 if(res)
2189 return res;
2190 else
2191 return bld->undef;
2192 }
2193
2194
2195 /**
2196 * Minimax polynomial fit of 2**x, in range [0, 1[
2197 */
2198 const double lp_build_exp2_polynomial[] = {
2199 #if EXP_POLY_DEGREE == 5
2200 0.999999925063526176901,
2201 0.693153073200168932794,
2202 0.240153617044375388211,
2203 0.0558263180532956664775,
2204 0.00898934009049466391101,
2205 0.00187757667519147912699
2206 #elif EXP_POLY_DEGREE == 4
2207 1.00000259337069434683,
2208 0.693003834469974940458,
2209 0.24144275689150793076,
2210 0.0520114606103070150235,
2211 0.0135341679161270268764
2212 #elif EXP_POLY_DEGREE == 3
2213 0.999925218562710312959,
2214 0.695833540494823811697,
2215 0.226067155427249155588,
2216 0.0780245226406372992967
2217 #elif EXP_POLY_DEGREE == 2
2218 1.00172476321474503578,
2219 0.657636275736077639316,
2220 0.33718943461968720704
2221 #else
2222 #error
2223 #endif
2224 };
2225
2226
2227 void
2228 lp_build_exp2_approx(struct lp_build_context *bld,
2229 LLVMValueRef x,
2230 LLVMValueRef *p_exp2_int_part,
2231 LLVMValueRef *p_frac_part,
2232 LLVMValueRef *p_exp2)
2233 {
2234 LLVMBuilderRef builder = bld->gallivm->builder;
2235 const struct lp_type type = bld->type;
2236 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2237 LLVMValueRef ipart = NULL;
2238 LLVMValueRef fpart = NULL;
2239 LLVMValueRef expipart = NULL;
2240 LLVMValueRef expfpart = NULL;
2241 LLVMValueRef res = NULL;
2242
2243 assert(lp_check_value(bld->type, x));
2244
2245 if(p_exp2_int_part || p_frac_part || p_exp2) {
2246 /* TODO: optimize the constant case */
2247 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2248 LLVMIsConstant(x)) {
2249 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2250 __FUNCTION__);
2251 }
2252
2253 assert(type.floating && type.width == 32);
2254
2255 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
2256 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
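/*
 * The clamp above restricts x to roughly the exponent range
 * representable in single precision, so the (ipart + 127) << 23 bit
 * pattern built below stays meaningful.
 */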
2257
2258 /* ipart = floor(x) */
2259 /* fpart = x - ipart */
2260 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2261 }
2262
2263 if(p_exp2_int_part || p_exp2) {
2264 /* expipart = (float) (1 << ipart) */
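/*
 * i.e. construct 2^ipart directly as an IEEE-754 bit pattern: the
 * biased exponent (ipart + 127) goes into bits 30..23 and the mantissa
 * is left zero.
 */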
2265 expipart = LLVMBuildAdd(builder, ipart,
2266 lp_build_const_int_vec(bld->gallivm, type, 127), "");
2267 expipart = LLVMBuildShl(builder, expipart,
2268 lp_build_const_int_vec(bld->gallivm, type, 23), "");
2269 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2270 }
2271
2272 if(p_exp2) {
2273 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2274 Elements(lp_build_exp2_polynomial));
2275
2276 res = LLVMBuildFMul(builder, expipart, expfpart, "");
2277 }
2278
2279 if(p_exp2_int_part)
2280 *p_exp2_int_part = expipart;
2281
2282 if(p_frac_part)
2283 *p_frac_part = fpart;
2284
2285 if(p_exp2)
2286 *p_exp2 = res;
2287 }
2288
2289
2290 LLVMValueRef
2291 lp_build_exp2(struct lp_build_context *bld,
2292 LLVMValueRef x)
2293 {
2294 LLVMValueRef res;
2295 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2296 return res;
2297 }
2298
2299
2300 /**
2301 * Extract the exponent of an IEEE-754 floating point value.
2302 *
2303 * Optionally apply an integer bias.
2304 *
2305 * Result is an integer value with
2306 *
2307 * ifloor(log2(x)) + bias
2308 */
2309 LLVMValueRef
2310 lp_build_extract_exponent(struct lp_build_context *bld,
2311 LLVMValueRef x,
2312 int bias)
2313 {
2314 LLVMBuilderRef builder = bld->gallivm->builder;
2315 const struct lp_type type = bld->type;
2316 unsigned mantissa = lp_mantissa(type);
2317 LLVMValueRef res;
2318
2319 assert(type.floating);
2320
2321 assert(lp_check_value(bld->type, x));
2322
2323 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2324
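/*
 * Illustrative example: x = 8.0f is 0x41000000, the shifted exponent
 * field is 130 and 130 - 127 = 3 == ifloor(log2(8)).  Note the 255 mask
 * and 127 bias hardcode the single precision layout.
 */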
2325 res = LLVMBuildLShr(builder, x,
2326 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2327 res = LLVMBuildAnd(builder, res,
2328 lp_build_const_int_vec(bld->gallivm, type, 255), "");
2329 res = LLVMBuildSub(builder, res,
2330 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2331
2332 return res;
2333 }
2334
2335
2336 /**
2337 * Extract the mantissa of a floating point value.
2338 *
2339 * Result is a floating point value in [1, 2) with
2340 *
2341 * x / 2**ifloor(log2(x))
2342 */
2343 LLVMValueRef
2344 lp_build_extract_mantissa(struct lp_build_context *bld,
2345 LLVMValueRef x)
2346 {
2347 LLVMBuilderRef builder = bld->gallivm->builder;
2348 const struct lp_type type = bld->type;
2349 unsigned mantissa = lp_mantissa(type);
2350 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2351 (1ULL << mantissa) - 1);
2352 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2353 LLVMValueRef res;
2354
2355 assert(lp_check_value(bld->type, x));
2356
2357 assert(type.floating);
2358
2359 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2360
2361 /* res = x / 2**ipart */
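/*
 * Keep the mantissa bits of x and splice in the exponent bits of 1.0,
 * rescaling x into the [1, 2) interval.
 */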
2362 res = LLVMBuildAnd(builder, x, mantmask, "");
2363 res = LLVMBuildOr(builder, res, one, "");
2364 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2365
2366 return res;
2367 }
2368
2369
2370
2371 /**
2372 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
2373 * These coefficients can be generated with
2374 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2375 */
2376 const double lp_build_log2_polynomial[] = {
2377 #if LOG_POLY_DEGREE == 6
2378 3.11578814719469302614,
2379 -3.32419399085241980044,
2380 2.59883907202499966007,
2381 -1.23152682416275988241,
2382 0.318212422185251071475,
2383 -0.0344359067839062357313
2384 #elif LOG_POLY_DEGREE == 5
2385 2.8882704548164776201,
2386 -2.52074962577807006663,
2387 1.48116647521213171641,
2388 -0.465725644288844778798,
2389 0.0596515482674574969533
2390 #elif LOG_POLY_DEGREE == 4
2391 2.61761038894603480148,
2392 -1.75647175389045657003,
2393 0.688243882994381274313,
2394 -0.107254423828329604454
2395 #elif LOG_POLY_DEGREE == 3
2396 2.28330284476918490682,
2397 -1.04913055217340124191,
2398 0.204446009836232697516
2399 #else
2400 #error
2401 #endif
2402 };
2403
2404
2405 /**
2406 * See http://www.devmaster.net/forums/showthread.php?p=43580
2407 */
2408 void
2409 lp_build_log2_approx(struct lp_build_context *bld,
2410 LLVMValueRef x,
2411 LLVMValueRef *p_exp,
2412 LLVMValueRef *p_floor_log2,
2413 LLVMValueRef *p_log2)
2414 {
2415 LLVMBuilderRef builder = bld->gallivm->builder;
2416 const struct lp_type type = bld->type;
2417 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2418 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2419
2420 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2421 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2422 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2423
2424 LLVMValueRef i = NULL;
2425 LLVMValueRef exp = NULL;
2426 LLVMValueRef mant = NULL;
2427 LLVMValueRef logexp = NULL;
2428 LLVMValueRef logmant = NULL;
2429 LLVMValueRef res = NULL;
2430
2431 assert(lp_check_value(bld->type, x));
2432
2433 if(p_exp || p_floor_log2 || p_log2) {
2434 /* TODO: optimize the constant case */
2435 if (gallivm_debug & GALLIVM_DEBUG_PERF &&
2436 LLVMIsConstant(x)) {
2437 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2438 __FUNCTION__);
2439 }
2440
2441 assert(type.floating && type.width == 32);
2442
2443 /*
2444 * We don't explicitly handle denormalized numbers. They will yield a
2445 * result in the neighbourhood of -127, which appears to be
2446 * adequate.
2447 */
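/*
 * Decompose x into 2^logexp * mant with mant in [1, 2), so that
 * log2(x) = logexp + log2(mant); the exponent comes straight from the
 * bits and only log2(mant) needs a polynomial.
 */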
2448
2449 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2450
2451 /* exp = (float) exponent(x) */
2452 exp = LLVMBuildAnd(builder, i, expmask, "");
2453 }
2454
2455 if(p_floor_log2 || p_log2) {
2456 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2457 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2458 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2459 }
2460
2461 if(p_log2) {
2462 /* mant = (float) mantissa(x) */
2463 mant = LLVMBuildAnd(builder, i, mantmask, "");
2464 mant = LLVMBuildOr(builder, mant, one, "");
2465 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2466
2467 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
2468 Elements(lp_build_log2_polynomial));
2469
2470 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0. */
2471 logmant = LLVMBuildFMul(builder, logmant, LLVMBuildFSub(builder, mant, bld->one, ""), "");
2472
2473 res = LLVMBuildFAdd(builder, logmant, logexp, "");
2474 }
2475
2476 if(p_exp) {
2477 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2478 *p_exp = exp;
2479 }
2480
2481 if(p_floor_log2)
2482 *p_floor_log2 = logexp;
2483
2484 if(p_log2)
2485 *p_log2 = res;
2486 }
2487
2488
2489 LLVMValueRef
2490 lp_build_log2(struct lp_build_context *bld,
2491 LLVMValueRef x)
2492 {
2493 LLVMValueRef res;
2494 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2495 return res;
2496 }
2497
2498
2499 /**
2500 * Faster (and less accurate) log2.
2501 *
2502 * log2(x) ~= floor(log2(x)) - 1 + x / 2**floor(log2(x))
2503 *
2504 * Piece-wise linear approximation, with exact results when x is a
2505 * power of two.
2506 *
2507 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2508 */
2509 LLVMValueRef
2510 lp_build_fast_log2(struct lp_build_context *bld,
2511 LLVMValueRef x)
2512 {
2513 LLVMBuilderRef builder = bld->gallivm->builder;
2514 LLVMValueRef ipart;
2515 LLVMValueRef fpart;
2516
2517 assert(lp_check_value(bld->type, x));
2518
2519 assert(bld->type.floating);
2520
2521 /* ipart = floor(log2(x)) - 1 */
2522 ipart = lp_build_extract_exponent(bld, x, -1);
2523 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2524
2525 /* fpart = x / 2**ipart */
2526 fpart = lp_build_extract_mantissa(bld, x);
2527
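/*
 * Illustrative example: x = 8.0 gives ipart = 3 - 1 = 2 and
 * fpart = 1.0, i.e. exactly log2(8) = 3.0; between powers of two, fpart
 * interpolates linearly in x.
 */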
2528 /* ipart + fpart */
2529 return LLVMBuildFAdd(builder, ipart, fpart, "");
2530 }
2531
2532
2533 /**
2534 * Fast implementation of iround(log2(x)).
2535 *
2536 * Not an approximation -- it should give accurate results all the time.
2537 */
2538 LLVMValueRef
2539 lp_build_ilog2(struct lp_build_context *bld,
2540 LLVMValueRef x)
2541 {
2542 LLVMBuilderRef builder = bld->gallivm->builder;
2543 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2544 LLVMValueRef ipart;
2545
2546 assert(bld->type.floating);
2547
2548 assert(lp_check_value(bld->type, x));
2549
2550 /* x * 2^0.5, i.e. add 0.5 to log2(x) */
2551 x = LLVMBuildFMul(builder, x, sqrt2, "");
2552
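/*
 * Illustrative example: x = 3.0 becomes ~4.243, whose exponent is 2,
 * matching round(log2(3)) = round(1.585) = 2.
 */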
2553 /* ipart = floor(log2(x) + 0.5) */
2554 ipart = lp_build_extract_exponent(bld, x, 0);
2555
2556 return ipart;
2557 }