/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all the basic arithmetic operations we care about
 * (most notably min/max and saturated operations), and it is often necessary
 * to resort to machine-specific intrinsics directly. The functions here hide
 * all these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_arit.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special-case values of a or b (e.g. 0 or 1) are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.min.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.min.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pminu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmins.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pminsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for special-case values of a or b (e.g. 0 or 1) are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if(type.width * type.length == 128) {
      if(type.floating) {
         if(type.width == 32 && util_cpu_caps.has_sse)
            intrinsic = "llvm.x86.sse.max.ps";
         if(type.width == 64 && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.max.pd";
      }
      else {
         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxu.b";
         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsb";
         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxuw";
         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
            intrinsic = "llvm.x86.sse2.pmaxs.w";
         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxud";
         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
            intrinsic = "llvm.x86.sse41.pmaxsd";
      }
   }

   if(intrinsic)
      return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
         return bld->one;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a */
LLVMValueRef
lp_build_sum_vector(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, a, index, "");

   for (i = 1; i < type.length; i++) {
      index = lp_build_const_int32(bld->gallivm, i);
      if (type.floating)
         res = LLVMBuildFAdd(builder, res,
                             LLVMBuildExtractElement(builder,
                                                     a, index, ""),
                             "");
      else
         res = LLVMBuildAdd(builder, res,
                            LLVMBuildExtractElement(builder,
                                                    a, index, ""),
                            "");
   }

   return res;
}


/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
         return bld->zero;

      if(util_cpu_caps.has_sse2 &&
         type.width * type.length == 128 &&
         !type.floating && !type.fixed) {
         if(type.width == 8)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
         if(type.width == 16)
            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}


/**
 * Normalized 8bit multiplication.
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL criteria
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ...
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 yields 254, so the special case b = 255 must be accounted for,
 *     or roundoff must be used
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving the exact results
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_u8n(struct gallivm_state *gallivm,
                 struct lp_type i16_type,
                 LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef c8;
   LLVMValueRef ab;

   assert(!i16_type.floating);
   assert(lp_check_value(i16_type, a));
   assert(lp_check_value(i16_type, b));

   c8 = lp_build_const_int_vec(gallivm, i16_type, 8);

#if 0

   /* a*b/255 ~= (a*(b + 1)) >> 8 */
   b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, i16_type, 1), "");
   ab = LLVMBuildMul(builder, a, b, "");

#else

   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
   ab = LLVMBuildAdd(builder, ab, lp_build_const_int_vec(gallivm, i16_type, 0x80), "");

#endif

   ab = LLVMBuildLShr(builder, ab, c8, "");

   return ab;
}
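

/*
 * Illustrative scalar model of the approximations documented above -- not
 * part of the gallivm API and never compiled (hence the #if 0). A minimal
 * sketch, assuming plain C99; the helper names mul_u8n_sree and
 * mul_u8n_blinn are hypothetical.
 */
#if 0
#include <assert.h>
#include <stdint.h>

/* "alpha plus one": a*b/255 ~= (a*(b + 1)) >> 8 */
static uint8_t
mul_u8n_sree(uint8_t a, uint8_t b)
{
   return (uint8_t)(((unsigned)a * (b + 1u)) >> 8);
}

/* geometric series plus rounding: t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
static uint8_t
mul_u8n_blinn(uint8_t a, uint8_t b)
{
   unsigned t = (unsigned)a * b;
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
}

static void
mul_u8n_check(void)
{
   /* both variants meet the OpenGL endpoint criteria... */
   assert(mul_u8n_sree(0, 0) == 0 && mul_u8n_sree(255, 255) == 255);
   /* ...and the rounding variant keeps 255*255 == 255 without the b+1 bias */
   assert(mul_u8n_blinn(0, 0) == 0 && mul_u8n_blinn(255, 255) == 255);
}
#endif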


/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(!type.floating && !type.fixed && type.norm) {
      if(type.width == 8) {
         struct lp_type i16_type = lp_wider_type(type);
         LLVMValueRef al, ah, bl, bh, abl, abh, ab;

         lp_build_unpack2(bld->gallivm, type, i16_type, a, &al, &ah);
         lp_build_unpack2(bld->gallivm, type, i16_type, b, &bl, &bh);

         /* PMULLW, PSRLW, PADDW */
         abl = lp_build_mul_u8n(bld->gallivm, i16_type, al, bl);
         abh = lp_build_mul_u8n(bld->gallivm, i16_type, ah, bh);

         ab = lp_build_pack2(bld->gallivm, i16_type, type, abl, abh);

         return ab;
      }

      /* FIXME */
      assert(0);
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}


/**
 * Small vector x scale multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster, it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
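

/*
 * Illustrative scalar model of the disabled power-of-two trick above -- not
 * part of the gallivm API (hence the #if 0). A minimal sketch assuming C99
 * and IEEE-754 single precision: adding (shift << 23) to the bit pattern
 * bumps the exponent field, i.e. multiplies by 2^shift, as long as the
 * input is a normal, finite, non-zero number.
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
mul_pow2_bits(float x, unsigned shift)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);      /* reinterpret float as int */
   bits += (uint32_t)shift << 23;       /* 23 = IEEE-754 single mantissa width */
   memcpy(&x, &bits, sizeof bits);      /* reinterpret back */
   return x;                            /* wrong for 0, Inf and NaN inputs */
}
#endif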


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4 &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation -- without any checks.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   res = lp_build_mul(bld, x, delta);

   res = lp_build_add(bld, v0, res);

   if (bld->type.fixed) {
      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
       * but it will be wrong for other uses. Basically we need a more
       * powerful lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << bld->type.width/2) - 1), "");
   }

   return res;
}


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
      LLVMValueRef shift;

      assert(type.length >= 2);
      assert(!type.sign);

      /*
       * Create a wider type, enough to hold the intermediate result of the
       * multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.fixed = TRUE;
      wide_type.width = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x, &xl, &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Scale x from [0, 255] to [0, 256]
       */

      shift = lp_build_const_int_vec(bld->gallivm, wide_type, type.width - 1);

      xl = lp_build_add(&wide_bld, xl,
                        LLVMBuildAShr(builder, xl, shift, ""));
      xh = lp_build_add(&wide_bld, xh,
                        LLVMBuildAShr(builder, xh, shift, ""));

      /*
       * Lerp both halves.
       */

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1);
   }

   return res;
}
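

/*
 * Illustrative scalar model of the normalized lerp above -- not part of the
 * gallivm API (hence the #if 0). A minimal sketch assuming C99 and 8-bit
 * normalized values: mapping the weight from [0, 255] to [0, 256] makes
 * x == 255 select v1 exactly, which a straight x*(v1 - v0) >> 8 would miss.
 * The weighted-sum form below is algebraically the same as the
 * v0 + x*(v1 - v0) form used by lp_build_lerp_simple().
 */
#if 0
#include <stdint.h>

static uint8_t
lerp_u8n(uint8_t x, uint8_t v0, uint8_t v1)
{
   unsigned w = (unsigned)x + (x >> 7);              /* [0,255] -> [0,256] */
   return (uint8_t)(((256 - w) * v0 + w * v1) >> 8); /* exact at both ends */
}
#endif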


LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
   return lp_build_lerp(bld, y, v0, v1);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->zero || b == bld->zero)
         return bld->zero;
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if(a == bld->zero)
         return b;
      if(b == bld->zero)
         return a;
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and add it to 1 constant */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      LLVMValueRef minus_one = lp_build_const_int_vec(bld->gallivm, type, -1);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}



enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,
   LP_BUILD_ROUND_SSE41_FLOOR = 1,
   LP_BUILD_ROUND_SSE41_CEIL = 2,
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
};


/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value. That is, rounding 2.5 yields 2.0, not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      assert(type.width*type.length == 128);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ps";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.pd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


static INLINE LLVMValueRef
lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);
   /* using the double precision conversions is a bit more complicated */
   assert(type.width == 32);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse2);

   /* This relies on the MXCSR rounding mode, which should always be nearest. */
   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef arg;
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      vec_type = LLVMVectorType(bld->elem_type, 4);

      intrinsic = "llvm.x86.sse.cvtss2si";

      undef = LLVMGetUndef(vec_type);

      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, arg);
   }
   else {
      assert(type.width*type.length == 128);

      intrinsic = "llvm.x86.sse2.cvtps2dq";

      res = lp_build_intrinsic_unary(builder, intrinsic,
                                     ret_type, a);
   }

   return res;
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is a float (vector).
 * Ex: trunc(-1.5) = -1.0
 */
LLVMValueRef
lp_build_trunc(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = LLVMBuildFPToSI(builder, a, int_vec_type, "");
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is a float (vector).
 * Ex: round(0.9) = 1.0
 * Ex: round(-1.5) = -2.0
 */
LLVMValueRef
lp_build_round(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iround(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return floor of float (vector), result is a float (vector)
 * Ex: floor(1.1) = 1.0
 * Ex: floor(-1.1) = -2.0
 */
LLVMValueRef
lp_build_floor(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_ifloor(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return ceiling of float (vector), returning float (vector).
 * Ex: ceil( 1.1) = 2.0
 * Ex: ceil(-1.1) = -1.0
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(builder, res, vec_type, "");
      return res;
   }
}


/**
 * Return fractional part of 'a' computed as a - floor(a)
 * Typically used in texture coord arithmetic.
 */
LLVMValueRef
lp_build_fract(struct lp_build_context *bld,
               LLVMValueRef a)
{
   assert(bld->type.floating);
   return lp_build_sub(bld, a, lp_build_floor(bld, a));
}


/**
 * Return the integer part of a float (vector) value (== round toward zero).
 * The returned value is an integer (vector).
 * Ex: itrunc(-1.5) = -1
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
}


/**
 * Return float (vector) rounded to nearest integer (vector). The returned
 * value is an integer (vector).
 * Ex: iround(0.9) = 1
 * Ex: iround(-1.5) = -2
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse2 &&
       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
      return lp_build_iround_nearest_sse2(bld, a);
   }
   else if (util_cpu_caps.has_sse4_1 &&
            (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMValueRef half;

      half = lp_build_const_vec(bld->gallivm, type, 0.5);

      if (type.sign) {
         LLVMTypeRef vec_type = bld->vec_type;
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* get sign bit */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");

         /* sign * 0.5 */
         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
         half = LLVMBuildOr(builder, sign, half, "");
         half = LLVMBuildBitCast(builder, half, vec_type, "");
      }

      res = LLVMBuildFAdd(builder, a, half, "");
   }

   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

   return res;
}


/**
 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      res = a;

      if (type.sign) {
         /* Take the sign bit and add it to 1 constant */
         LLVMTypeRef vec_type = bld->vec_type;
         unsigned mantissa = lp_mantissa(type);
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;
         LLVMValueRef offset;

         /* sign = a < 0 ? ~0 : 0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "ifloor.sign");

         /* offset = -0.99999(9)f */
         offset = lp_build_const_vec(bld->gallivm, type,
                                     -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
         offset = LLVMConstBitCast(offset, int_vec_type);

         /* offset = a < 0 ? offset : 0.0f */
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "ifloor.offset");

         res = LLVMBuildFAdd(builder, res, offset, "ifloor.res");
      }
   }

   /* truncate (round toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");

   return res;
}
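

/*
 * Illustrative scalar model of the non-SSE4.1 path above -- not part of the
 * gallivm API (hence the #if 0). A minimal sketch assuming C99 floats:
 * truncation (the only conversion SSE2 offers) rounds toward zero, so for
 * negative non-integers it lands one too high; biasing negative inputs by
 * nearly -1 before truncating compensates, without disturbing negative
 * integers such as -2.0. (Illustrative only; inputs so large that the bias
 * is absorbed by rounding are not handled.)
 */
#if 0
static int
ifloor_ref(float a)
{
   if (a < 0.0f)
      a += -0.99999994f;   /* just under 1 in magnitude */
   return (int)a;          /* truncate toward zero */
}
#endif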


/**
 * Return ceiling of float (vector), returning int (vector).
 * Ex: iceil( 1.1) = 2
 * Ex: iceil(-1.1) = -1
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = bld->int_vec_type;
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      LLVMTypeRef vec_type = bld->vec_type;
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef offset;

      /* offset = 0.99999(9)f */
      offset = lp_build_const_vec(bld->gallivm, type,
                                  (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));

      if (type.sign) {
         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                                    (unsigned long long)1 << (type.width - 1));
         LLVMValueRef sign;

         /* sign = a < 0 ? 0 : ~0 */
         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
         sign = LLVMBuildAnd(builder, sign, mask, "");
         sign = LLVMBuildAShr(builder, sign,
                              lp_build_const_int_vec(bld->gallivm, type,
                                                     type.width - 1),
                              "iceil.sign");
         sign = LLVMBuildNot(builder, sign, "iceil.not");

         /* offset = a < 0 ? 0.0 : offset */
         offset = LLVMConstBitCast(offset, int_vec_type);
         offset = LLVMBuildAnd(builder, offset, sign, "");
         offset = LLVMBuildBitCast(builder, offset, vec_type, "iceil.offset");
      }

      res = LLVMBuildFAdd(builder, a, offset, "iceil.res");
   }

   /* truncate (round toward zero) */
   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");

   return res;
}


/**
 * Combined ifloor() & fract().
 *
 * Preferred to calling the functions separately, as it will ensure that the
 * strategy (floor() vs ifloor()) that results in less redundant work is used.
 */
void
lp_build_ifloor_fract(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef *out_ipart,
                      LLVMValueRef *out_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef ipart;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if (util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) {
      /*
       * floor() is easier.
       */

      ipart = lp_build_floor(bld, a);
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
   }
   else {
      /*
       * ifloor() is easier.
       */

      *out_ipart = lp_build_ifloor(bld, a);
      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
   }
}


LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   char intrinsic[32];

   assert(lp_check_value(type, a));

   /* TODO: optimize the constant case */

   assert(type.floating);
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
}


/**
 * Do one Newton-Raphson step to improve reciprocal precision:
 *
 *   x_{i+1} = x_i * (2 - a * x_i)
 *
 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
 * +/-Inf, giving NaN instead. Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
 * halo. It would be necessary to clamp the argument to prevent this.
 *
 * See also:
 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rcp_refine(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef rcp_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, a, rcp_a, "");
   res = LLVMBuildFSub(builder, two, res, "");
   res = LLVMBuildFMul(builder, rcp_a, res, "");

   return res;
}
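

/*
 * Illustrative scalar model of the Newton-Raphson refinement above -- not
 * part of the gallivm API (hence the #if 0). A minimal sketch assuming C99:
 * starting from a rough estimate such as RCPPS's ~12 bits, each step roughly
 * doubles the number of correct bits.
 */
#if 0
static float
rcp_refine_ref(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);   /* x' = x * (2 - a*x) */
}

/* e.g. a = 3.0f, x0 = 0.33f (estimate) -> x1 = 0.3333, x2 ~= 0.33333333 */
#endif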


LLVMValueRef
lp_build_rcp(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->zero)
      return bld->undef;
   if(a == bld->one)
      return bld->one;
   if(a == bld->undef)
      return bld->undef;

   assert(type.floating);

   if(LLVMIsConstant(a))
      return LLVMConstFDiv(bld->one, a);

   /*
    * We don't use RCPPS because:
    * - it only has 10bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that the
    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
    */

   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 0;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rcp_refine(bld, a, res);
      }

      return res;
   }

   return LLVMBuildFDiv(builder, bld->one, a, "");
}


/**
 * Do one Newton-Raphson step to improve rsqrt precision:
 *
 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
 *
 * See also:
 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
 */
static INLINE LLVMValueRef
lp_build_rsqrt_refine(struct lp_build_context *bld,
                      LLVMValueRef a,
                      LLVMValueRef rsqrt_a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
   LLVMValueRef res;

   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
   res = LLVMBuildFMul(builder, a, res, "");
   res = LLVMBuildFSub(builder, three, res, "");
   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
   res = LLVMBuildFMul(builder, half, res, "");

   return res;
}
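

/*
 * Illustrative scalar model of the rsqrt refinement above -- not part of the
 * gallivm API (hence the #if 0). A minimal sketch assuming C99: one step is
 * enough to turn RSQRTPS's ~12-bit estimate into nearly full single
 * precision, which is why lp_build_rsqrt() below uses num_iterations = 1.
 */
#if 0
static float
rsqrt_refine_ref(float a, float rsqrt_a)
{
   /* x' = 0.5 * x * (3 - a*x*x) */
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}

/* e.g. a = 4.0f, x0 = 0.49f (estimate) -> x1 ~= 0.4997 (exact: 0.5) */
#endif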


/**
 * Generate 1/sqrt(a)
 */
LLVMValueRef
lp_build_rsqrt(struct lp_build_context *bld,
               LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   assert(type.floating);

   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
      const unsigned num_iterations = 1;
      LLVMValueRef res;
      unsigned i;

      res = lp_build_intrinsic_unary(builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);

      for (i = 0; i < num_iterations; ++i) {
         res = lp_build_rsqrt_refine(bld, a, res);
      }

      return res;
   }

   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) using SSE2
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMBuilderRef b = builder;

   /*
    * take the absolute value,
    * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    */

   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");

   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");

   /*
    * extract the sign bit (upper one)
    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
    */
   LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");

   /*
    * scale by 4/Pi
    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
    */

   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");

   /*
    * store the integer part of y in mm0
    * emm2 = _mm_cvttps_epi32(y);
    */

   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");

   /*
    * j=(j+1) & (~1) (see the cephes sources)
    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    */

   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
   LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
   /*
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    */
   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
   LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");

   /*
    * y = _mm_cvtepi32_ps(emm2);
    */
   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");

   /* get the swap sign flag
    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    */
   LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
   LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");

   /*
    * emm2 = _mm_slli_epi32(emm0, 29);
    */
   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");

   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
    * Both branches will be computed.
    *
    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    */

   LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
   LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
   LLVMValueRef poly_mask = lp_build_compare(gallivm,
                                             int_type, PIPE_FUNC_EQUAL,
                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
   /*
    * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
    */
   LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");

   /*
    * _PS_CONST(minus_cephes_DP1, -0.78515625);
    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    */
   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);

   /*
    * The magic pass: "Extended precision modular arithmetic"
    * x = ((x - y * DP1) - y * DP2) - y * DP3;
    * xmm1 = _mm_mul_ps(y, xmm1);
    * xmm2 = _mm_mul_ps(y, xmm2);
    * xmm3 = _mm_mul_ps(y, xmm3);
    */
   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");

   /*
    * x = _mm_add_ps(x, xmm1);
    * x = _mm_add_ps(x, xmm2);
    * x = _mm_add_ps(x, xmm3);
    */

   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");

   /*
    * Evaluate the first polynomial (0 <= x <= Pi/4)
    *
    * z = _mm_mul_ps(x,x);
    */
   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");

   /*
    * _PS_CONST(coscof_p0, 2.443315711809948E-005);
    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
    * _PS_CONST(coscof_p2, 4.166664568298827E-002);
    */
   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);

   /*
    * y = *(v4sf*)_ps_coscof_p0;
    * y = _mm_mul_ps(y, z);
    */
   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");


   /*
    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    * y = _mm_sub_ps(y, tmp);
    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
    */
   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");

   /*
    * _PS_CONST(sincof_p0, -1.9515295891E-4);
    * _PS_CONST(sincof_p1, 8.3321608736E-3);
    * _PS_CONST(sincof_p2, -1.6666654611E-1);
    */
   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);

   /*
    * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
    *
    * y2 = *(v4sf*)_ps_sincof_p0;
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    * y2 = _mm_mul_ps(y2, z);
    * y2 = _mm_mul_ps(y2, x);
    * y2 = _mm_add_ps(y2, x);
    */

   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");

   /*
    * select the correct result from the two polynomials
    * xmm3 = poly_mask;
    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    * y = _mm_andnot_ps(xmm3, y);
    * y = _mm_add_ps(y,y2);
    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
   return y_result;
}
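

/*
 * Illustrative scalar model of the sin() code above -- not part of the
 * gallivm API (hence the #if 0). A minimal sketch assuming C99, mirroring
 * the cephes/sse_mathfun structure: reduce x to [0, Pi/4], pick one of the
 * two polynomials from the quadrant bits, and restore the sign at the end.
 */
#if 0
#include <math.h>

static float
sin_ref(float x)
{
   float sign = x < 0.0f ? -1.0f : 1.0f;
   x = fabsf(x);

   /* scale by 4/Pi and round to an even quadrant: j = (j + 1) & ~1 */
   int j = ((int)(x * 1.27323954473516f) + 1) & ~1;
   float y = (float)j;

   /* extended precision modular arithmetic: x = ((x - y*DP1) - y*DP2) - y*DP3 */
   x = ((x - y * 0.78515625f)
          - y * 2.4187564849853515625e-4f)
          - y * 3.77489497744594108e-8f;

   /* j & 4 flips the sign; j & 2 selects the cosine polynomial */
   if (j & 4)
      sign = -sign;

   float z = x * x;
   float r;
   if (j & 2) {
      /* cosine polynomial, used for Pi/4 < |x| <= Pi/2 */
      r = ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
           + 4.166664568298827e-2f) * z * z - 0.5f * z + 1.0f;
   }
   else {
      /* sine polynomial, used for 0 <= |x| <= Pi/4 */
      r = ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z
           - 1.6666654611e-1f) * z * x + x;
   }
   return sign * r;
}
#endif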
1883
1884
1885 /**
1886 * Generate cos(a) using SSE2
1887 */
1888 LLVMValueRef
1889 lp_build_cos(struct lp_build_context *bld,
1890 LLVMValueRef a)
1891 {
1892 struct gallivm_state *gallivm = bld->gallivm;
1893 LLVMBuilderRef builder = gallivm->builder;
1894 struct lp_type int_type = lp_int_type(bld->type);
1895 LLVMBuilderRef b = builder;
1896
1897 /*
1898 * take the absolute value,
1899 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
1900 */
1901
1902 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
1903 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
1904
1905 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
1906 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
1907
1908 /*
1909 * scale by 4/Pi
1910 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
1911 */
1912
1913 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
1914 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
1915
1916 /*
1917 * store the integer part of y in mm0
1918 * emm2 = _mm_cvttps_epi32(y);
1919 */
1920
1921 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
1922
1923 /*
1924 * j=(j+1) & (~1) (see the cephes sources)
1925 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
1926 */
1927
1928 LLVMValueRef const_1 = lp_build_const_int_vec(gallivm, bld->type, 1);
1929 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, const_1, "emm2_add");
1930 /*
1931 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
1932 */
1933 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
1934 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
1935
1936 /*
1937 * y = _mm_cvtepi32_ps(emm2);
1938 */
1939 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
1940
1941
1942 /*
1943 * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
1944 */
1945 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1946 LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
1947
1948
1949 /* get the swap sign flag
1950 * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
1951 */
1952 LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
1953 LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
1954 LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
1955 LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
1956
1957 /*
1958 * emm2 = _mm_slli_epi32(emm0, 29);
1959 */
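/* moves flag bit 2 up to bit 31, the IEEE-754 sign bit (2 + 29 == 31) */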
1960 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
1961 LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
1962
1963 /*
1964 * get the polynomial selection mask
1965 * there is one polynomial for 0 <= x <= Pi/4
1966 * and another one for Pi/4 < x <= Pi/2
1967 * Both branches will be computed.
1968 *
1969 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
1970 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
1971 */
1972
1973 LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
1974 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
1975 LLVMValueRef poly_mask = lp_build_compare(gallivm,
1976 int_type, PIPE_FUNC_EQUAL,
1977 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
1978
1979 /*
1980 * _PS_CONST(minus_cephes_DP1, -0.78515625);
1981 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
1982 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
1983 */
1984 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
1985 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
1986 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
1987
1988 /*
1989 * The magic pass: "Extended precision modular arithmetic"
1990 * x = ((x - y * DP1) - y * DP2) - y * DP3;
1991 * xmm1 = _mm_mul_ps(y, xmm1);
1992 * xmm2 = _mm_mul_ps(y, xmm2);
1993 * xmm3 = _mm_mul_ps(y, xmm3);
1994 */
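/*
 * DP1 + DP2 + DP3 is -Pi/4 split into pieces of decreasing magnitude
 * (Cody & Waite style), so the three subtractions below cancel the
 * quadrant multiple almost exactly instead of in one lossy step.
 */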
1995 LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
1996 LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
1997 LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
1998
1999 /*
2000 * x = _mm_add_ps(x, xmm1);
2001 * x = _mm_add_ps(x, xmm2);
2002 * x = _mm_add_ps(x, xmm3);
2003 */
2004
2005 LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
2006 LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
2007 LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
2008
2009 /*
2010 * Evaluate the first polynomial (0 <= x <= Pi/4)
2011 *
2012 * z = _mm_mul_ps(x,x);
2013 */
2014 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2015
2016 /*
2017 * _PS_CONST(coscof_p0, 2.443315711809948E-005);
2018 * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2019 * _PS_CONST(coscof_p2, 4.166664568298827E-002);
2020 */
2021 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2022 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2023 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2024
2025 /*
2026 * y = *(v4sf*)_ps_coscof_p0;
2027 * y = _mm_mul_ps(y, z);
2028 */
2029 LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
2030 LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
2031 LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
2032 LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
2033 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2034 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2035
2036
2037 /*
2038 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2039 * y = _mm_sub_ps(y, tmp);
2040 * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2041 */
2042 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2043 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2044 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
2045 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2046 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
2047
2048 /*
2049 * _PS_CONST(sincof_p0, -1.9515295891E-4);
2050 * _PS_CONST(sincof_p1, 8.3321608736E-3);
2051 * _PS_CONST(sincof_p2, -1.6666654611E-1);
2052 */
2053 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2054 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2055 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2056
2057 /*
2058 * Evaluate the second polynomial (0 <= x <= Pi/4)
2059 *
2060 * y2 = *(v4sf*)_ps_sincof_p0;
2061 * y2 = _mm_mul_ps(y2, z);
2062 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2063 * y2 = _mm_mul_ps(y2, z);
2064 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2065 * y2 = _mm_mul_ps(y2, z);
2066 * y2 = _mm_mul_ps(y2, x);
2067 * y2 = _mm_add_ps(y2, x);
2068 */
2069
2070 LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
2071 LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
2072 LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
2073 LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
2074 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2075 LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
2076 LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
2077
2078 /*
2079 * select the correct result from the two polynomials
2080 * xmm3 = poly_mask;
2081 * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
2082 * y = _mm_andnot_ps(xmm3, y);
2083 * y = _mm_add_ps(y,y2);
2084 */
2085 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
2086 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
2087 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
2088 LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
2089 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
2090 LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
2091
2092 /*
2093 * update the sign
2094 * y = _mm_xor_ps(y, sign_bit);
2095 */
2096 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_cos");
2097 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
2098 return y_result;
2099 }
2100
2101
2102 /**
2103 * Generate pow(x, y), computed as exp2(log2(x) * y); meaningful for x > 0 only.
2104 */
2105 LLVMValueRef
2106 lp_build_pow(struct lp_build_context *bld,
2107 LLVMValueRef x,
2108 LLVMValueRef y)
2109 {
2110 /* TODO: optimize the constant case */
2111 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
2112 LLVMIsConstant(x) && LLVMIsConstant(y)) {
2113 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2114 __FUNCTION__);
2115 }
2116
2117 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
2118 }
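
/*
 * A scalar sketch of the identity used above, for illustration only
 * (assumes x > 0, as log2 is undefined otherwise):
 *
 *    float ref_pow(float x, float y) { return exp2f(y * log2f(x)); }
 */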
2119
2120
2121 /**
2122 * Generate exp(x)
2123 */
2124 LLVMValueRef
2125 lp_build_exp(struct lp_build_context *bld,
2126 LLVMValueRef x)
2127 {
2128 /* log2(e) = 1/log(2) */
2129 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
2130 1.4426950408889634);
2131
2132 assert(lp_check_value(bld->type, x));
2133
2134 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
2135 }
2136
2137
2138 /**
2139 * Generate log(x), the natural logarithm.
2140 */
2141 LLVMValueRef
2142 lp_build_log(struct lp_build_context *bld,
2143 LLVMValueRef x)
2144 {
2145 /* log(2) */
2146 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
2147 0.69314718055994529);
2148
2149 assert(lp_check_value(bld->type, x));
2150
2151 return lp_build_mul(bld, log2, lp_build_log2(bld, x));
2152 }
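
/*
 * The constants in lp_build_exp() and lp_build_log() come from the
 * change-of-base identities:
 *
 *    e**x  = 2**(x * log2(e)),  log2(e) = 1/ln(2) ~= 1.4426950408889634
 *    ln(x) = log2(x) * ln(2),   ln(2) ~= 0.69314718055994529
 */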
2153
2154
2155 /**
2156 * Generate a polynomial evaluation.
2157 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
2158 */
2159 static LLVMValueRef
2160 lp_build_polynomial(struct lp_build_context *bld,
2161 LLVMValueRef x,
2162 const double *coeffs,
2163 unsigned num_coeffs)
2164 {
2165 const struct lp_type type = bld->type;
2166 LLVMValueRef even = NULL, odd = NULL;
2167 LLVMValueRef x2;
2168 unsigned i;
2169
2170 assert(lp_check_value(bld->type, x));
2171
2172 /* TODO: optimize the constant case */
2173 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
2174 LLVMIsConstant(x)) {
2175 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2176 __FUNCTION__);
2177 }
2178
2179 /*
2180 * Calculate odd and even terms separately to decrease data dependency
2181 * Ex:
2182 * c[0] + x^2 * c[2] + x^4 * c[4] ...
2183 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
2184 */
2185 x2 = lp_build_mul(bld, x, x);
2186
2187 for (i = num_coeffs; i--; ) {
2188 LLVMValueRef coeff;
2189
2190 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
2191
2192 if (i % 2 == 0) {
2193 if (even)
2194 even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
2195 else
2196 even = coeff;
2197 } else {
2198 if (odd)
2199 odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
2200 else
2201 odd = coeff;
2202 }
2203 }
2204
2205 if (odd)
2206 return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
2207 else if (even)
2208 return even;
2209 else
2210 return bld->undef;
2211 }
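
/*
 * A scalar sketch of the even/odd split above, for illustration only.
 * For five coefficients c[0..4] the loop computes, with x2 = x*x,
 *
 *    even = c[0] + x2*(c[2] + x2*c[4])
 *    odd  = c[1] + x2*c[3]
 *    p(x) = odd*x + even
 *
 * i.e. two shorter Horner chains whose steps can issue in parallel,
 * instead of one serial chain over all the coefficients.
 */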
2212
2213
2214 /**
2215 * Minimax polynomial fit of 2**x, in the range [0, 1[
2216 */
2217 const double lp_build_exp2_polynomial[] = {
2218 #if EXP_POLY_DEGREE == 5
2219 0.999999925063526176901,
2220 0.693153073200168932794,
2221 0.240153617044375388211,
2222 0.0558263180532956664775,
2223 0.00898934009049466391101,
2224 0.00187757667519147912699
2225 #elif EXP_POLY_DEGREE == 4
2226 1.00000259337069434683,
2227 0.693003834469974940458,
2228 0.24144275689150793076,
2229 0.0520114606103070150235,
2230 0.0135341679161270268764
2231 #elif EXP_POLY_DEGREE == 3
2232 0.999925218562710312959,
2233 0.695833540494823811697,
2234 0.226067155427249155588,
2235 0.0780245226406372992967
2236 #elif EXP_POLY_DEGREE == 2
2237 1.00172476321474503578,
2238 0.657636275736077639316,
2239 0.33718943461968720704
2240 #else
2241 #error
2242 #endif
2243 };
2244
2245
2246 void
2247 lp_build_exp2_approx(struct lp_build_context *bld,
2248 LLVMValueRef x,
2249 LLVMValueRef *p_exp2_int_part,
2250 LLVMValueRef *p_frac_part,
2251 LLVMValueRef *p_exp2)
2252 {
2253 LLVMBuilderRef builder = bld->gallivm->builder;
2254 const struct lp_type type = bld->type;
2255 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2256 LLVMValueRef ipart = NULL;
2257 LLVMValueRef fpart = NULL;
2258 LLVMValueRef expipart = NULL;
2259 LLVMValueRef expfpart = NULL;
2260 LLVMValueRef res = NULL;
2261
2262 assert(lp_check_value(bld->type, x));
2263
2264 if(p_exp2_int_part || p_frac_part || p_exp2) {
2265 /* TODO: optimize the constant case */
2266 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
2267 LLVMIsConstant(x)) {
2268 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2269 __FUNCTION__);
2270 }
2271
2272 assert(type.floating && type.width == 32);
2273
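/*
 * Clamp x first: outside roughly [-127, 128] the result over/underflows
 * float32 anyway, and the exponent-bias trick below needs a bounded ipart.
 */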
2274 x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type, 129.0));
2275 x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));
2276
2277 /* ipart = floor(x) */
2278 /* fpart = x - ipart */
2279 lp_build_ifloor_fract(bld, x, &ipart, &fpart);
2280 }
2281
2282 if(p_exp2_int_part || p_exp2) {
2283 /* expipart = (float) (1 << ipart) */
2284 expipart = LLVMBuildAdd(builder, ipart,
2285 lp_build_const_int_vec(bld->gallivm, type, 127), "");
2286 expipart = LLVMBuildShl(builder, expipart,
2287 lp_build_const_int_vec(bld->gallivm, type, 23), "");
2288 expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
2289 }
2290
2291 if(p_exp2) {
2292 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
2293 Elements(lp_build_exp2_polynomial));
2294
2295 res = LLVMBuildFMul(builder, expipart, expfpart, "");
2296 }
2297
2298 if(p_exp2_int_part)
2299 *p_exp2_int_part = expipart;
2300
2301 if(p_frac_part)
2302 *p_frac_part = fpart;
2303
2304 if(p_exp2)
2305 *p_exp2 = res;
2306 }
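
/*
 * A scalar sketch of the decomposition above, for illustration only
 * (assumes IEEE-754 binary32, as the asserts require):
 *
 *    int   i = (int)floorf(x);                // ipart
 *    float f = x - (float)i;                  // fpart in [0, 1)
 *    union { int32_t i; float f; } u;
 *    u.i = (i + 127) << 23;                   // exact 2**ipart from bits
 *    return u.f * poly(f);                    // poly(f) ~= 2**f on [0, 1)
 */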
2307
2308
2309 LLVMValueRef
2310 lp_build_exp2(struct lp_build_context *bld,
2311 LLVMValueRef x)
2312 {
2313 LLVMValueRef res;
2314 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
2315 return res;
2316 }
2317
2318
2319 /**
2320 * Extract the exponent of an IEEE-754 floating point value.
2321 *
2322 * Optionally apply an integer bias.
2323 *
2324 * Result is an integer value with
2325 *
2326 * ifloor(log2(x)) + bias
2327 */
2328 LLVMValueRef
2329 lp_build_extract_exponent(struct lp_build_context *bld,
2330 LLVMValueRef x,
2331 int bias)
2332 {
2333 LLVMBuilderRef builder = bld->gallivm->builder;
2334 const struct lp_type type = bld->type;
2335 unsigned mantissa = lp_mantissa(type);
2336 LLVMValueRef res;
2337
2338 assert(type.floating);
2339
2340 assert(lp_check_value(bld->type, x));
2341
2342 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2343
2344 res = LLVMBuildLShr(builder, x,
2345 lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
2346 res = LLVMBuildAnd(builder, res,
2347 lp_build_const_int_vec(bld->gallivm, type, 255), "");
2348 res = LLVMBuildSub(builder, res,
2349 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
2350
2351 return res;
2352 }
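
/*
 * Scalar equivalent of the above, for illustration only (float32): shift
 * the mantissa out, mask the 8 exponent bits, remove the IEEE bias:
 *
 *    union { float f; uint32_t i; } u = { x };
 *    return (int)((u.i >> 23) & 0xff) - (127 - bias);
 */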
2353
2354
2355 /**
2356 * Extract the mantissa of a floating point value.
2357 *
2358 * Result is a floating point value with
2359 *
2360 * x / 2**floor(log2(x))
2361 */
2362 LLVMValueRef
2363 lp_build_extract_mantissa(struct lp_build_context *bld,
2364 LLVMValueRef x)
2365 {
2366 LLVMBuilderRef builder = bld->gallivm->builder;
2367 const struct lp_type type = bld->type;
2368 unsigned mantissa = lp_mantissa(type);
2369 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
2370 (1ULL << mantissa) - 1);
2371 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
2372 LLVMValueRef res;
2373
2374 assert(lp_check_value(bld->type, x));
2375
2376 assert(type.floating);
2377
2378 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
2379
2380 /* res = x / 2**ipart */
2381 res = LLVMBuildAnd(builder, x, mantmask, "");
2382 res = LLVMBuildOr(builder, res, one, "");
2383 res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
2384
2385 return res;
2386 }
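
/*
 * Scalar equivalent of the above, for illustration only (float32): keep
 * the mantissa bits and splice in the exponent of 1.0, giving [1, 2):
 *
 *    union { float f; uint32_t i; } u = { x };
 *    u.i = (u.i & 0x007fffff) | 0x3f800000;   // 0x3f800000 == 1.0f
 *    return u.f;
 */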
2387
2388
2389
2390 /**
2391 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
2392 * These coefficients can be generated with
2393 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
2394 */
2395 const double lp_build_log2_polynomial[] = {
2396 #if LOG_POLY_DEGREE == 5
2397 2.88539008148777786488L,
2398 0.961796878841293367824L,
2399 0.577058946784739859012L,
2400 0.412914355135828735411L,
2401 0.308591899232910175289L,
2402 0.352376952300281371868L,
2403 #elif LOG_POLY_DEGREE == 4
2404 2.88539009343309178325L,
2405 0.961791550404184197881L,
2406 0.577440339438736392009L,
2407 0.403343858251329912514L,
2408 0.406718052498846252698L,
2409 #elif LOG_POLY_DEGREE == 3
2410 2.88538959748872753838L,
2411 0.961932915889597772928L,
2412 0.571118517972136195241L,
2413 0.493997535084709500285L,
2414 #else
2415 #error
2416 #endif
2417 };
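
/*
 * Why this particular fit: for mant in [1, 2) and y = (mant - 1)/(mant + 1)
 * we have y in [0, 1/3), i.e. y*y in [0, 1/9), and
 *
 *    log2(mant) = log2((1 + y)/(1 - y)) = y * P(y*y)
 *
 * which is the series 2*atanh(y)/ln(2); note that P's leading coefficient
 * is 2/ln(2) ~= 2.885390.  lp_build_log2_approx() below evaluates exactly
 * this.
 */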
2418
2419 /**
2420 * See http://www.devmaster.net/forums/showthread.php?p=43580
2421 * http://en.wikipedia.org/wiki/Logarithm#Calculation
2422 * http://www.nezumi.demon.co.uk/consult/logx.htm
2423 */
2424 void
2425 lp_build_log2_approx(struct lp_build_context *bld,
2426 LLVMValueRef x,
2427 LLVMValueRef *p_exp,
2428 LLVMValueRef *p_floor_log2,
2429 LLVMValueRef *p_log2)
2430 {
2431 LLVMBuilderRef builder = bld->gallivm->builder;
2432 const struct lp_type type = bld->type;
2433 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2434 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2435
2436 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
2437 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
2438 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
2439
2440 LLVMValueRef i = NULL;
2441 LLVMValueRef y = NULL;
2442 LLVMValueRef z = NULL;
2443 LLVMValueRef exp = NULL;
2444 LLVMValueRef mant = NULL;
2445 LLVMValueRef logexp = NULL;
2446 LLVMValueRef logmant = NULL;
2447 LLVMValueRef res = NULL;
2448
2449 assert(lp_check_value(bld->type, x));
2450
2451 if(p_exp || p_floor_log2 || p_log2) {
2452 /* TODO: optimize the constant case */
2453 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
2454 LLVMIsConstant(x)) {
2455 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
2456 __FUNCTION__);
2457 }
2458
2459 assert(type.floating && type.width == 32);
2460
2461 /*
2462 * We don't explicitly handle denormalized numbers. They will yield a
2463 * result in the neighbourhood of -127, which appears to be adequate
2464 * enough.
2465 */
2466
2467 i = LLVMBuildBitCast(builder, x, int_vec_type, "");
2468
2469 /* exp = the exponent bits of x (bitcast to float at the end, this is 2**floor(log2(x))) */
2470 exp = LLVMBuildAnd(builder, i, expmask, "");
2471 }
2472
2473 if(p_floor_log2 || p_log2) {
2474 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
2475 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
2476 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
2477 }
2478
2479 if(p_log2) {
2480 /* mant = 1 + (float) mantissa(x) */
2481 mant = LLVMBuildAnd(builder, i, mantmask, "");
2482 mant = LLVMBuildOr(builder, mant, one, "");
2483 mant = LLVMBuildBitCast(builder, mant, vec_type, "");
2484
2485 /* y = (mant - 1) / (mant + 1) */
2486 y = lp_build_div(bld,
2487 lp_build_sub(bld, mant, bld->one),
2488 lp_build_add(bld, mant, bld->one)
2489 );
2490
2491 /* z = y^2 */
2492 z = lp_build_mul(bld, y, y);
2493
2494 /* compute P(z) */
2495 logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
2496 Elements(lp_build_log2_polynomial));
2497
2498 /* logmant = y * P(z) */
2499 logmant = lp_build_mul(bld, y, logmant);
2500
2501 res = lp_build_add(bld, logmant, logexp);
2502 }
2503
2504 if(p_exp) {
2505 exp = LLVMBuildBitCast(builder, exp, vec_type, "");
2506 *p_exp = exp;
2507 }
2508
2509 if(p_floor_log2)
2510 *p_floor_log2 = logexp;
2511
2512 if(p_log2)
2513 *p_log2 = res;
2514 }
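
/*
 * A scalar sketch of lp_build_log2_approx(), for illustration only
 * (float32, normalized inputs):
 *
 *    union { float f; uint32_t i; } u = { x };
 *    float logexp = (float)(int)((u.i >> 23) & 0xff) - 127.0f;
 *    u.i = (u.i & 0x007fffff) | 0x3f800000;   // mant in [1, 2)
 *    float y = (u.f - 1.0f) / (u.f + 1.0f);
 *    return y * P(y*y) + logexp;              // P = the table above
 */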
2515
2516
2517 LLVMValueRef
2518 lp_build_log2(struct lp_build_context *bld,
2519 LLVMValueRef x)
2520 {
2521 LLVMValueRef res;
2522 lp_build_log2_approx(bld, x, NULL, NULL, &res);
2523 return res;
2524 }
2525
2526
2527 /**
2528 * Faster (and less accurate) log2.
2529 *
2530 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
2531 *
2532 * Piecewise linear approximation, with exact results when x is a
2533 * power of two.
2534 *
2535 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
2536 */
2537 LLVMValueRef
2538 lp_build_fast_log2(struct lp_build_context *bld,
2539 LLVMValueRef x)
2540 {
2541 LLVMBuilderRef builder = bld->gallivm->builder;
2542 LLVMValueRef ipart;
2543 LLVMValueRef fpart;
2544
2545 assert(lp_check_value(bld->type, x));
2546
2547 assert(bld->type.floating);
2548
2549 /* ipart = floor(log2(x)) - 1 */
2550 ipart = lp_build_extract_exponent(bld, x, -1);
2551 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
2552
2553 /* fpart = x / 2**ipart */
2554 fpart = lp_build_extract_mantissa(bld, x);
2555
2556 /* ipart + fpart */
2557 return LLVMBuildFAdd(builder, ipart, fpart, "");
2558 }
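
/*
 * Worked example: x = 8.0 gives ipart = floor(log2(8)) - 1 = 2 and
 * fpart = 8/2**3 = 1.0, so the sum is exactly 3.0.  Between powers of
 * two the mantissa term interpolates linearly, e.g. x = 1.5 yields
 * 0.5 vs. the true log2(1.5) ~= 0.585.
 */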
2559
2560
2561 /**
2562 * Fast implementation of iround(log2(x)).
2563 *
2564 * Not an approximation -- it should give accurate results all the time.
2565 */
2566 LLVMValueRef
2567 lp_build_ilog2(struct lp_build_context *bld,
2568 LLVMValueRef x)
2569 {
2570 LLVMBuilderRef builder = bld->gallivm->builder;
2571 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
2572 LLVMValueRef ipart;
2573
2574 assert(bld->type.floating);
2575
2576 assert(lp_check_value(bld->type, x));
2577
2578 /* x * 2^0.5, i.e. add 0.5 to log2(x), so extracting the exponent rounds to nearest */
2579 x = LLVMBuildFMul(builder, x, sqrt2, "");
2580
2581 /* ipart = floor(log2(x) + 0.5) */
2582 ipart = lp_build_extract_exponent(bld, x, 0);
2583
2584 return ipart;
2585 }
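
/*
 * Worked example: log2(5) ~= 2.32 and log2(6) ~= 2.58; after the sqrt(2)
 * pre-scale the extracted exponents are 2 and 3 respectively, i.e. the
 * result is log2 rounded to the nearest integer.
 */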
2586
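/**
 * Generate x % y.
 *
 * Note that LLVM's frem/srem (like C's fmod and %) return a remainder
 * with the sign of the dividend, which differs from a floor-based mod
 * for negative operands.
 */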
2587 LLVMValueRef
2588 lp_build_mod(struct lp_build_context *bld,
2589 LLVMValueRef x,
2590 LLVMValueRef y)
2591 {
2592 LLVMBuilderRef builder = bld->gallivm->builder;
2593 LLVMValueRef res;
2594 const struct lp_type type = bld->type;
2595
2596 assert(lp_check_value(type, x));
2597 assert(lp_check_value(type, y));
2598
2599 if (type.floating)
2600 res = LLVMBuildFRem(builder, x, y, "");
2601 else if (type.sign)
2602 res = LLVMBuildSRem(builder, x, y, "");
2603 else
2604 res = LLVMBuildURem(builder, x, y, "");
2605 return res;
2606 }