src/gallium/drivers/llvmpipe/lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy, given that we have all the necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - we often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_string.h"
51 #include "util/u_cpu_detect.h"
52
53 #include "lp_bld_type.h"
54 #include "lp_bld_const.h"
55 #include "lp_bld_intr.h"
56 #include "lp_bld_logic.h"
57 #include "lp_bld_pack.h"
58 #include "lp_bld_debug.h"
59 #include "lp_bld_arit.h"
60
61
62 /**
63 * Generate min(a, b)
64 * No checks for the special-case values of a or b (0 or 1) are done.
65 */
66 static LLVMValueRef
67 lp_build_min_simple(struct lp_build_context *bld,
68 LLVMValueRef a,
69 LLVMValueRef b)
70 {
71 const struct lp_type type = bld->type;
72 const char *intrinsic = NULL;
73 LLVMValueRef cond;
74
75 /* TODO: optimize the constant case */
76
77 if(type.width * type.length == 128) {
78 if(type.floating) {
79 if(type.width == 32 && util_cpu_caps.has_sse)
80 intrinsic = "llvm.x86.sse.min.ps";
81 if(type.width == 64 && util_cpu_caps.has_sse2)
82 intrinsic = "llvm.x86.sse2.min.pd";
83 }
84 else {
85 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
86 intrinsic = "llvm.x86.sse2.pminu.b";
87 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
88 intrinsic = "llvm.x86.sse41.pminsb";
89 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
90 intrinsic = "llvm.x86.sse41.pminuw";
91 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
92 intrinsic = "llvm.x86.sse2.pmins.w";
93 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
94 intrinsic = "llvm.x86.sse41.pminud";
95 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
96 intrinsic = "llvm.x86.sse41.pminsd";
97 }
98 }
99
100 if(intrinsic)
101 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
102
103 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
104 return lp_build_select(bld, cond, a, b);
105 }
106
107
108 /**
109 * Generate max(a, b)
110 * No checks for the special-case values of a or b (0 or 1) are done.
111 */
112 static LLVMValueRef
113 lp_build_max_simple(struct lp_build_context *bld,
114 LLVMValueRef a,
115 LLVMValueRef b)
116 {
117 const struct lp_type type = bld->type;
118 const char *intrinsic = NULL;
119 LLVMValueRef cond;
120
121 /* TODO: optimize the constant case */
122
123 if(type.width * type.length == 128) {
124 if(type.floating) {
125 if(type.width == 32 && util_cpu_caps.has_sse)
126 intrinsic = "llvm.x86.sse.max.ps";
127 if(type.width == 64 && util_cpu_caps.has_sse2)
128 intrinsic = "llvm.x86.sse2.max.pd";
129 }
130 else {
131 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
132 intrinsic = "llvm.x86.sse2.pmaxu.b";
133 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
134 intrinsic = "llvm.x86.sse41.pmaxsb";
135 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
136 intrinsic = "llvm.x86.sse41.pmaxuw";
137 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
138 intrinsic = "llvm.x86.sse2.pmaxs.w";
139 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
140 intrinsic = "llvm.x86.sse41.pmaxud";
141 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
142 intrinsic = "llvm.x86.sse41.pmaxsd";
143 }
144 }
145
146 if(intrinsic)
147 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
148
149 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
150 return lp_build_select(bld, cond, a, b);
151 }
152
153
154 /**
155 * Generate 1 - a, or ~a depending on bld->type.
156 */
157 LLVMValueRef
158 lp_build_comp(struct lp_build_context *bld,
159 LLVMValueRef a)
160 {
161 const struct lp_type type = bld->type;
162
163 if(a == bld->one)
164 return bld->zero;
165 if(a == bld->zero)
166 return bld->one;
167
168 if(type.norm && !type.floating && !type.fixed && !type.sign) {
169 if(LLVMIsConstant(a))
170 return LLVMConstNot(a);
171 else
172 return LLVMBuildNot(bld->builder, a, "");
173 }
174
175 if(LLVMIsConstant(a))
176 return LLVMConstSub(bld->one, a);
177 else
178 return LLVMBuildSub(bld->builder, bld->one, a, "");
179 }
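
/*
 * Illustration of the unorm complement above: for an n-bit unsigned
 * normalized type, 1.0 is the all-ones bit pattern, so 1.0 - a is exactly
 * bitwise NOT. A minimal scalar sketch for 8-bit unorm (hypothetical helper,
 * for illustration only):
 *
 *    uint8_t comp_u8n(uint8_t a)
 *    {
 *       return 0xff - a;   // == (uint8_t)~a, since 0xff represents 1.0
 *    }
 */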
180
181
182 /**
183 * Generate a + b
184 */
185 LLVMValueRef
186 lp_build_add(struct lp_build_context *bld,
187 LLVMValueRef a,
188 LLVMValueRef b)
189 {
190 const struct lp_type type = bld->type;
191 LLVMValueRef res;
192
193 if(a == bld->zero)
194 return b;
195 if(b == bld->zero)
196 return a;
197 if(a == bld->undef || b == bld->undef)
198 return bld->undef;
199
200 if(bld->type.norm) {
201 const char *intrinsic = NULL;
202
203 if(a == bld->one || b == bld->one)
204 return bld->one;
205
206 if(util_cpu_caps.has_sse2 &&
207 type.width * type.length == 128 &&
208 !type.floating && !type.fixed) {
209 if(type.width == 8)
210 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
211 if(type.width == 16)
212 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
213 }
214
215 if(intrinsic)
216 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
217 }
218
219 if(LLVMIsConstant(a) && LLVMIsConstant(b))
220 res = LLVMConstAdd(a, b);
221 else
222 res = LLVMBuildAdd(bld->builder, a, b, "");
223
224 /* clamp to ceiling of 1.0 */
225 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
226 res = lp_build_min_simple(bld, res, bld->one);
227
228 /* XXX clamp to floor of -1 or 0??? */
229
230 return res;
231 }
232
233
234 /**
235 * Generate a - b
236 */
237 LLVMValueRef
238 lp_build_sub(struct lp_build_context *bld,
239 LLVMValueRef a,
240 LLVMValueRef b)
241 {
242 const struct lp_type type = bld->type;
243 LLVMValueRef res;
244
245 if(b == bld->zero)
246 return a;
247 if(a == bld->undef || b == bld->undef)
248 return bld->undef;
249 if(a == b)
250 return bld->zero;
251
252 if(bld->type.norm) {
253 const char *intrinsic = NULL;
254
255 if(b == bld->one)
256 return bld->zero;
257
258 if(util_cpu_caps.has_sse2 &&
259 type.width * type.length == 128 &&
260 !type.floating && !type.fixed) {
261 if(type.width == 8)
262 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
263 if(type.width == 16)
264 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
265 }
266
267 if(intrinsic)
268 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
269 }
270
271 if(LLVMIsConstant(a) && LLVMIsConstant(b))
272 res = LLVMConstSub(a, b);
273 else
274 res = LLVMBuildSub(bld->builder, a, b, "");
275
276 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
277 res = lp_build_max_simple(bld, res, bld->zero);
278
279 return res;
280 }
281
282
283 /**
284 * Normalized 8bit multiplication.
285 *
286 * - alpha plus one
287 *
288 * makes the following approximation to the division (Sree)
289 *
290 * a*b/255 ~= (a*(b + 1)) >> 8
291 *
292 * which is the fastest method that satisfies the following OpenGL criteria
293 *
294 * 0*0 = 0 and 255*255 = 255
295 *
296 * - geometric series
297 *
298 * takes the geometric series approximation to the division
299 *
300 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
301 *
302 * in this case just the first two terms to fit in 16bit arithmetic
303 *
304 * t/255 ~= (t + (t >> 8)) >> 8
305 *
306 * note that just by itself it doesn't satisfy the OpenGL criteria, as it
307 * yields 255*255 = 254, so either the special case b = 255 must be accounted
308 * for, or roundoff must be used
309 *
310 * - geometric series plus rounding
311 *
312 * when using a geometric series division instead of truncating the result
313 * use roundoff in the approximation (Jim Blinn)
314 *
315 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
316 *
317 * achieving exact results
318 *
319 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
320 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
321 * @sa Michael Herf, The "double blend trick", May 2000,
322 * http://www.stereopsis.com/doubleblend.html
323 */
324 static LLVMValueRef
325 lp_build_mul_u8n(LLVMBuilderRef builder,
326 struct lp_type i16_type,
327 LLVMValueRef a, LLVMValueRef b)
328 {
329 LLVMValueRef c8;
330 LLVMValueRef ab;
331
332 c8 = lp_build_int_const_scalar(i16_type, 8);
333
334 #if 0
335
336 /* a*b/255 ~= (a*(b + 1)) >> 8 */
337 b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
338 ab = LLVMBuildMul(builder, a, b, "");
339
340 #else
341
342 /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
343 ab = LLVMBuildMul(builder, a, b, "");
344 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
345 ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
346
347 #endif
348
349 ab = LLVMBuildLShr(builder, ab, c8, "");
350
351 return ab;
352 }
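
/*
 * A scalar sketch of the rounding variant used above (hypothetical helper,
 * for illustration only; assumes <stdint.h>):
 *
 *    static uint8_t mul_u8n_ref(uint8_t a, uint8_t b)
 *    {
 *       uint32_t t = (uint32_t)a * b;
 *       return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
 *    }
 *
 * This meets the endpoints mul_u8n_ref(0, 0) == 0 and
 * mul_u8n_ref(255, 255) == 255, and stays within one unit of
 * round(a*b/255.0) for all inputs.
 */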
353
354
355 /**
356 * Generate a * b
357 */
358 LLVMValueRef
359 lp_build_mul(struct lp_build_context *bld,
360 LLVMValueRef a,
361 LLVMValueRef b)
362 {
363 const struct lp_type type = bld->type;
364
365 if(a == bld->zero)
366 return bld->zero;
367 if(a == bld->one)
368 return b;
369 if(b == bld->zero)
370 return bld->zero;
371 if(b == bld->one)
372 return a;
373 if(a == bld->undef || b == bld->undef)
374 return bld->undef;
375
376 if(!type.floating && !type.fixed && type.norm) {
377 if(type.width == 8) {
378 struct lp_type i16_type = lp_wider_type(type);
379 LLVMValueRef al, ah, bl, bh, abl, abh, ab;
380
381 lp_build_unpack2(bld->builder, type, i16_type, a, &al, &ah);
382 lp_build_unpack2(bld->builder, type, i16_type, b, &bl, &bh);
383
384 /* PMULLW, PSRLW, PADDW */
385 abl = lp_build_mul_u8n(bld->builder, i16_type, al, bl);
386 abh = lp_build_mul_u8n(bld->builder, i16_type, ah, bh);
387
388 ab = lp_build_pack2(bld->builder, i16_type, type, abl, abh);
389
390 return ab;
391 }
392
393 /* FIXME */
394 assert(0);
395 }
396
397 if(LLVMIsConstant(a) && LLVMIsConstant(b))
398 return LLVMConstMul(a, b);
399
400 return LLVMBuildMul(bld->builder, a, b, "");
401 }
402
403
404 /**
405 * Generate a / b
406 */
407 LLVMValueRef
408 lp_build_div(struct lp_build_context *bld,
409 LLVMValueRef a,
410 LLVMValueRef b)
411 {
412 const struct lp_type type = bld->type;
413
414 if(a == bld->zero)
415 return bld->zero;
416 if(a == bld->one)
417 return lp_build_rcp(bld, b);
418 if(b == bld->zero)
419 return bld->undef;
420 if(b == bld->one)
421 return a;
422 if(a == bld->undef || b == bld->undef)
423 return bld->undef;
424
425 if(LLVMIsConstant(a) && LLVMIsConstant(b))
426 return LLVMConstFDiv(a, b);
427
428 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
429 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
430
431 return LLVMBuildFDiv(bld->builder, a, b, "");
432 }
433
434
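/**
 * Linear interpolation: v0 + x * (v1 - v0).
 *
 * E.g. with x = 0.25, v0 = 8.0, v1 = 16.0 this yields 8.0 + 0.25 * 8.0 = 10.0.
 */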
435 LLVMValueRef
436 lp_build_lerp(struct lp_build_context *bld,
437 LLVMValueRef x,
438 LLVMValueRef v0,
439 LLVMValueRef v1)
440 {
441 return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
442 }
443
444
445 LLVMValueRef
446 lp_build_lerp_2d(struct lp_build_context *bld,
447 LLVMValueRef x,
448 LLVMValueRef y,
449 LLVMValueRef v00,
450 LLVMValueRef v01,
451 LLVMValueRef v10,
452 LLVMValueRef v11)
453 {
454 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
455 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
456 return lp_build_lerp(bld, y, v0, v1);
457 }
458
459
460 /**
461 * Generate min(a, b)
462 * Do checks for special cases.
463 */
464 LLVMValueRef
465 lp_build_min(struct lp_build_context *bld,
466 LLVMValueRef a,
467 LLVMValueRef b)
468 {
469 if(a == bld->undef || b == bld->undef)
470 return bld->undef;
471
472 if(a == b)
473 return a;
474
475 if(bld->type.norm) {
476 if(a == bld->zero || b == bld->zero)
477 return bld->zero;
478 if(a == bld->one)
479 return b;
480 if(b == bld->one)
481 return a;
482 }
483
484 return lp_build_min_simple(bld, a, b);
485 }
486
487
488 /**
489 * Generate max(a, b)
490 * Do checks for special cases.
491 */
492 LLVMValueRef
493 lp_build_max(struct lp_build_context *bld,
494 LLVMValueRef a,
495 LLVMValueRef b)
496 {
497 if(a == bld->undef || b == bld->undef)
498 return bld->undef;
499
500 if(a == b)
501 return a;
502
503 if(bld->type.norm) {
504 if(a == bld->one || b == bld->one)
505 return bld->one;
506 if(a == bld->zero)
507 return b;
508 if(b == bld->zero)
509 return a;
510 }
511
512 return lp_build_max_simple(bld, a, b);
513 }
514
515
516 /**
517 * Generate abs(a)
518 */
519 LLVMValueRef
520 lp_build_abs(struct lp_build_context *bld,
521 LLVMValueRef a)
522 {
523 const struct lp_type type = bld->type;
524 LLVMTypeRef vec_type = lp_build_vec_type(type);
525
526 if(!type.sign)
527 return a;
528
529 if(type.floating) {
530 /* Mask out the sign bit */
531 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
532       LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << (type.width - 1)) - 1);
533 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
534 a = LLVMBuildAnd(bld->builder, a, mask, "");
535 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
536 return a;
537 }
538
539 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
540 switch(type.width) {
541 case 8:
542 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
543 case 16:
544 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
545 case 32:
546 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
547 }
548 }
549
550 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
551 }
552
553
554 LLVMValueRef
555 lp_build_sgn(struct lp_build_context *bld,
556 LLVMValueRef a)
557 {
558 const struct lp_type type = bld->type;
559 LLVMTypeRef vec_type = lp_build_vec_type(type);
560 LLVMValueRef cond;
561 LLVMValueRef res;
562
563 /* Handle non-zero case */
564 if(!type.sign) {
565 /* if not zero then sign must be positive */
566 res = bld->one;
567 }
568 else if(type.floating) {
569       /* Take the sign bit of a and OR it into the bits of the constant 1.0 */
570 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
571 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
572 LLVMValueRef sign;
573 LLVMValueRef one;
574 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
575 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
576 one = LLVMConstBitCast(bld->one, int_vec_type);
577 res = LLVMBuildOr(bld->builder, sign, one, "");
578 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
579 }
580 else
581 {
582 LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
583 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
584 res = lp_build_select(bld, cond, bld->one, minus_one);
585 }
586
587 /* Handle zero */
588 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
589    res = lp_build_select(bld, cond, bld->zero, res);
590
591 return res;
592 }
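
/*
 * Worked example for the floating-point branch above: the sign bit of a is
 * OR'd into the bits of the constant 1.0, so a = -3.5 gives -1.0 and
 * a = 3.5 gives 1.0; the final select then maps a == 0 to 0.
 */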
593
594
595 enum lp_build_round_sse41_mode
596 {
597 LP_BUILD_ROUND_SSE41_NEAREST = 0,
598 LP_BUILD_ROUND_SSE41_FLOOR = 1,
599 LP_BUILD_ROUND_SSE41_CEIL = 2,
600 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
601 };
602
603
604 static INLINE LLVMValueRef
605 lp_build_round_sse41(struct lp_build_context *bld,
606 LLVMValueRef a,
607 enum lp_build_round_sse41_mode mode)
608 {
609 const struct lp_type type = bld->type;
610 LLVMTypeRef vec_type = lp_build_vec_type(type);
611 const char *intrinsic;
612
613 assert(type.floating);
614 assert(type.width*type.length == 128);
615 assert(lp_check_value(type, a));
616 assert(util_cpu_caps.has_sse4_1);
617
618 switch(type.width) {
619 case 32:
620 intrinsic = "llvm.x86.sse41.round.ps";
621 break;
622 case 64:
623 intrinsic = "llvm.x86.sse41.round.pd";
624 break;
625 default:
626 assert(0);
627 return bld->undef;
628 }
629
630 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
631 LLVMConstInt(LLVMInt32Type(), mode, 0));
632 }
633
634
635 LLVMValueRef
636 lp_build_trunc(struct lp_build_context *bld,
637 LLVMValueRef a)
638 {
639 const struct lp_type type = bld->type;
640
641 assert(type.floating);
642 assert(lp_check_value(type, a));
643
644 if(util_cpu_caps.has_sse4_1)
645 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
646 else {
647 LLVMTypeRef vec_type = lp_build_vec_type(type);
648 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
649 LLVMValueRef res;
650 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
651 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
652 return res;
653 }
654 }
655
656
657 LLVMValueRef
658 lp_build_round(struct lp_build_context *bld,
659 LLVMValueRef a)
660 {
661 const struct lp_type type = bld->type;
662
663 assert(type.floating);
664 assert(lp_check_value(type, a));
665
666 if(util_cpu_caps.has_sse4_1)
667 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
668 else {
669 LLVMTypeRef vec_type = lp_build_vec_type(type);
670 LLVMValueRef res;
671 res = lp_build_iround(bld, a);
672 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
673 return res;
674 }
675 }
676
677
678 LLVMValueRef
679 lp_build_floor(struct lp_build_context *bld,
680 LLVMValueRef a)
681 {
682 const struct lp_type type = bld->type;
683
684 assert(type.floating);
685
686 if(util_cpu_caps.has_sse4_1)
687 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
688 else {
689 LLVMTypeRef vec_type = lp_build_vec_type(type);
690 LLVMValueRef res;
691 res = lp_build_ifloor(bld, a);
692 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
693 return res;
694 }
695 }
696
697
698 LLVMValueRef
699 lp_build_ceil(struct lp_build_context *bld,
700 LLVMValueRef a)
701 {
702 const struct lp_type type = bld->type;
703
704 assert(type.floating);
705 assert(lp_check_value(type, a));
706
707 if(util_cpu_caps.has_sse4_1)
708 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
709 else {
710 LLVMTypeRef vec_type = lp_build_vec_type(type);
711 LLVMValueRef res;
712 res = lp_build_iceil(bld, a);
713 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
714 return res;
715 }
716 }
717
718
719 /**
720 * Convert to integer using whichever rounding method is fastest,
721 * typically truncating toward zero.
722 */
723 LLVMValueRef
724 lp_build_itrunc(struct lp_build_context *bld,
725 LLVMValueRef a)
726 {
727 const struct lp_type type = bld->type;
728 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
729
730 assert(type.floating);
731 assert(lp_check_value(type, a));
732
733 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
734 }
735
736
737 LLVMValueRef
738 lp_build_iround(struct lp_build_context *bld,
739 LLVMValueRef a)
740 {
741 const struct lp_type type = bld->type;
742 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
743 LLVMValueRef res;
744
745 assert(type.floating);
746 assert(lp_check_value(type, a));
747
748 if(util_cpu_caps.has_sse4_1) {
749 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
750 }
751 else {
752 LLVMTypeRef vec_type = lp_build_vec_type(type);
753 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
754 LLVMValueRef sign;
755 LLVMValueRef half;
756
757 /* get sign bit */
758 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
759 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
760
761 /* sign * 0.5 */
762 half = lp_build_const_scalar(type, 0.5);
763 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
764 half = LLVMBuildOr(bld->builder, sign, half, "");
765 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
766
767 res = LLVMBuildAdd(bld->builder, a, half, "");
768 }
769
770 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
771
772 return res;
773 }
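
/*
 * Worked example for the non-SSE4.1 path above (round half away from zero):
 * for a = -2.6 the sign bit makes half = -0.5, so a + half = -3.1, which the
 * final truncation turns into -3; for a = 2.6, 2.6 + 0.5 = 3.1 truncates to 3.
 */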
774
775
776 LLVMValueRef
777 lp_build_ifloor(struct lp_build_context *bld,
778 LLVMValueRef a)
779 {
780 const struct lp_type type = bld->type;
781 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
782 LLVMValueRef res;
783
784 assert(type.floating);
785 assert(lp_check_value(type, a));
786
787 if(util_cpu_caps.has_sse4_1) {
788 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
789 }
790 else {
791 /* Take the sign bit and add it to 1 constant */
792 LLVMTypeRef vec_type = lp_build_vec_type(type);
793 unsigned mantissa = lp_mantissa(type);
794 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
795 LLVMValueRef sign;
796 LLVMValueRef offset;
797
798 /* sign = a < 0 ? ~0 : 0 */
799 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
800 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
801 sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
802
803 /* offset = -0.99999(9)f */
804 offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
805 offset = LLVMConstBitCast(offset, int_vec_type);
806
807 /* offset = a < 0 ? -0.99999(9)f : 0.0f */
808 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
809 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
810
811 res = LLVMBuildAdd(bld->builder, a, offset, "");
812 }
813
814 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
815
816 return res;
817 }
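
/*
 * Worked example for the non-SSE4.1 path above: for a = -1.3 the sign mask is
 * all ones, so offset = -0.99999(9) and a + offset = -2.29999..., which
 * truncates toward zero to -2 == floor(-1.3); for a >= 0 the offset is masked
 * to 0.0 and plain truncation already equals floor.
 */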
818
819
820 LLVMValueRef
821 lp_build_iceil(struct lp_build_context *bld,
822 LLVMValueRef a)
823 {
824 const struct lp_type type = bld->type;
825 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
826 LLVMValueRef res;
827
828 assert(type.floating);
829 assert(lp_check_value(type, a));
830
831 if(util_cpu_caps.has_sse4_1) {
832 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
833 }
834 else {
835 assert(0);
836 res = bld->undef;
837 }
838
839 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
840
841 return res;
842 }
843
844
845 LLVMValueRef
846 lp_build_sqrt(struct lp_build_context *bld,
847 LLVMValueRef a)
848 {
849 const struct lp_type type = bld->type;
850 LLVMTypeRef vec_type = lp_build_vec_type(type);
851 char intrinsic[32];
852
853    /* TODO: optimize the constant case */
855
856 assert(type.floating);
857 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
858
859 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
860 }
861
862
863 LLVMValueRef
864 lp_build_rcp(struct lp_build_context *bld,
865 LLVMValueRef a)
866 {
867 const struct lp_type type = bld->type;
868
869 if(a == bld->zero)
870 return bld->undef;
871 if(a == bld->one)
872 return bld->one;
873 if(a == bld->undef)
874 return bld->undef;
875
876 assert(type.floating);
877
878 if(LLVMIsConstant(a))
879 return LLVMConstFDiv(bld->one, a);
880
881 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
882 /* FIXME: improve precision */
883 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
884
885 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
886 }
887
888
889 /**
890 * Generate 1/sqrt(a)
891 */
892 LLVMValueRef
893 lp_build_rsqrt(struct lp_build_context *bld,
894 LLVMValueRef a)
895 {
896 const struct lp_type type = bld->type;
897
898 assert(type.floating);
899
900 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
901 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
902
903 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
904 }
905
906
907 /**
908 * Generate cos(a)
909 */
910 LLVMValueRef
911 lp_build_cos(struct lp_build_context *bld,
912 LLVMValueRef a)
913 {
914 const struct lp_type type = bld->type;
915 LLVMTypeRef vec_type = lp_build_vec_type(type);
916 char intrinsic[32];
917
918 /* TODO: optimize the constant case */
919
920 assert(type.floating);
921 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
922
923 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
924 }
925
926
927 /**
928 * Generate sin(a)
929 */
930 LLVMValueRef
931 lp_build_sin(struct lp_build_context *bld,
932 LLVMValueRef a)
933 {
934 const struct lp_type type = bld->type;
935 LLVMTypeRef vec_type = lp_build_vec_type(type);
936 char intrinsic[32];
937
938 /* TODO: optimize the constant case */
939
940 assert(type.floating);
941 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
942
943 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
944 }
945
946
947 /**
948 * Generate pow(x, y)
949 */
950 LLVMValueRef
951 lp_build_pow(struct lp_build_context *bld,
952 LLVMValueRef x,
953 LLVMValueRef y)
954 {
955 /* TODO: optimize the constant case */
956 if(LLVMIsConstant(x) && LLVMIsConstant(y))
957 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
958 __FUNCTION__);
959
960 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
961 }
962
963
964 /**
965 * Generate exp(x)
966 */
967 LLVMValueRef
968 lp_build_exp(struct lp_build_context *bld,
969 LLVMValueRef x)
970 {
971    /* log2(e) = 1/log(2); exp(x) == exp2(log2(e) * x) */
972    LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
973
974    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
975 }
976
977
978 /**
979 * Generate log(x)
980 */
981 LLVMValueRef
982 lp_build_log(struct lp_build_context *bld,
983 LLVMValueRef x)
984 {
985    /* log(2); log(x) == log(2) * log2(x) */
986    LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
987
988    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
989 }
990
991
992 #define EXP_POLY_DEGREE 3
993 #define LOG_POLY_DEGREE 5
994
995
996 /**
997 * Generate polynomial.
998 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2] + ... (coeffs[0] is the constant term).
999 */
1000 static LLVMValueRef
1001 lp_build_polynomial(struct lp_build_context *bld,
1002 LLVMValueRef x,
1003 const double *coeffs,
1004 unsigned num_coeffs)
1005 {
1006 const struct lp_type type = bld->type;
1007 LLVMValueRef res = NULL;
1008 unsigned i;
1009
1010 /* TODO: optimize the constant case */
1011 if(LLVMIsConstant(x))
1012 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1013 __FUNCTION__);
1014
1015 for (i = num_coeffs; i--; ) {
1016 LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
1017 if(res)
1018 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1019 else
1020 res = coeff;
1021 }
1022
1023 if(res)
1024 return res;
1025 else
1026 return bld->undef;
1027 }
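
/*
 * The loop above is Horner's scheme. A scalar sketch of the same evaluation
 * (hypothetical helper, for illustration only):
 *
 *    static double poly_ref(double x, const double *coeffs, unsigned num_coeffs)
 *    {
 *       double res = 0.0;
 *       unsigned i;
 *       for (i = num_coeffs; i--; )
 *          res = coeffs[i] + x * res;
 *       return res;
 *    }
 *
 * which computes coeffs[0] + x*coeffs[1] + x^2*coeffs[2] + ... with one
 * multiply-add per coefficient.
 */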
1028
1029
1030 /**
1031 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
1032 */
1033 const double lp_build_exp2_polynomial[] = {
1034 #if EXP_POLY_DEGREE == 5
1035 9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
1036 #elif EXP_POLY_DEGREE == 4
1037 1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
1038 #elif EXP_POLY_DEGREE == 3
1039 9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
1040 #elif EXP_POLY_DEGREE == 2
1041 1.0017247, 6.5763628e-1, 3.3718944e-1
1042 #else
1043 #error
1044 #endif
1045 };
1046
1047
1048 void
1049 lp_build_exp2_approx(struct lp_build_context *bld,
1050 LLVMValueRef x,
1051 LLVMValueRef *p_exp2_int_part,
1052 LLVMValueRef *p_frac_part,
1053 LLVMValueRef *p_exp2)
1054 {
1055 const struct lp_type type = bld->type;
1056 LLVMTypeRef vec_type = lp_build_vec_type(type);
1057 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1058 LLVMValueRef ipart = NULL;
1059 LLVMValueRef fpart = NULL;
1060 LLVMValueRef expipart = NULL;
1061 LLVMValueRef expfpart = NULL;
1062 LLVMValueRef res = NULL;
1063
1064 if(p_exp2_int_part || p_frac_part || p_exp2) {
1065 /* TODO: optimize the constant case */
1066 if(LLVMIsConstant(x))
1067 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1068 __FUNCTION__);
1069
1070 assert(type.floating && type.width == 32);
1071
1072 x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
1073 x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
1074
1075 /* ipart = int(x - 0.5) */
1076 ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
1077 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1078
1079 /* fpart = x - ipart */
1080 fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
1081 fpart = LLVMBuildSub(bld->builder, x, fpart, "");
1082 }
1083
1084 if(p_exp2_int_part || p_exp2) {
1085 /* expipart = (float) (1 << ipart) */
1086 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
1087 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
1088 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1089 }
1090
1091 if(p_exp2) {
1092 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1093 Elements(lp_build_exp2_polynomial));
1094
1095 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1096 }
1097
1098 if(p_exp2_int_part)
1099 *p_exp2_int_part = expipart;
1100
1101 if(p_frac_part)
1102 *p_frac_part = fpart;
1103
1104 if(p_exp2)
1105 *p_exp2 = res;
1106 }
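
/*
 * The expipart computation above is the usual IEEE-754 exponent trick. A
 * scalar sketch (hypothetical helper, for illustration only; assumes 32-bit
 * floats, <stdint.h>, <string.h>, and -126 <= i <= 127):
 *
 *    static float exp2i_ref(int i)
 *    {
 *       uint32_t bits = (uint32_t)(i + 127) << 23;   // biased exponent field
 *       float f;
 *       memcpy(&f, &bits, sizeof f);                 // bitcast, not conversion
 *       return f;                                    // == 2^i
 *    }
 *
 * The fractional part is then covered by the minimax polynomial, giving
 * 2^x == exp2i_ref(ipart) * poly(fpart).
 */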
1107
1108
1109 LLVMValueRef
1110 lp_build_exp2(struct lp_build_context *bld,
1111 LLVMValueRef x)
1112 {
1113 LLVMValueRef res;
1114 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1115 return res;
1116 }
1117
1118
1119 /**
1120 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1121 * These coefficients can be generated with
1122 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1123 */
1124 const double lp_build_log2_polynomial[] = {
1125 #if LOG_POLY_DEGREE == 6
1126 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
1127 #elif LOG_POLY_DEGREE == 5
1128 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
1129 #elif LOG_POLY_DEGREE == 4
1130 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
1131 #elif LOG_POLY_DEGREE == 3
1132 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
1133 #else
1134 #error
1135 #endif
1136 };
1137
1138
1139 /**
1140 * See http://www.devmaster.net/forums/showthread.php?p=43580
1141 */
1142 void
1143 lp_build_log2_approx(struct lp_build_context *bld,
1144 LLVMValueRef x,
1145 LLVMValueRef *p_exp,
1146 LLVMValueRef *p_floor_log2,
1147 LLVMValueRef *p_log2)
1148 {
1149 const struct lp_type type = bld->type;
1150 LLVMTypeRef vec_type = lp_build_vec_type(type);
1151 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1152
1153 LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
1154 LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
1155 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1156
1157 LLVMValueRef i = NULL;
1158 LLVMValueRef exp = NULL;
1159 LLVMValueRef mant = NULL;
1160 LLVMValueRef logexp = NULL;
1161 LLVMValueRef logmant = NULL;
1162 LLVMValueRef res = NULL;
1163
1164 if(p_exp || p_floor_log2 || p_log2) {
1165 /* TODO: optimize the constant case */
1166 if(LLVMIsConstant(x))
1167 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1168 __FUNCTION__);
1169
1170 assert(type.floating && type.width == 32);
1171
1172 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1173
1174 /* exp = (float) exponent(x) */
1175 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1176 }
1177
1178 if(p_floor_log2 || p_log2) {
1179 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
1180 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
1181 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1182 }
1183
1184 if(p_log2) {
1185 /* mant = (float) mantissa(x) */
1186 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1187 mant = LLVMBuildOr(bld->builder, mant, one, "");
1188       mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
1189
1190 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1191 Elements(lp_build_log2_polynomial));
1192
1193       /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
1194       logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
1195
1196 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1197 }
1198
1199 if(p_exp)
1200 *p_exp = exp;
1201
1202 if(p_floor_log2)
1203 *p_floor_log2 = logexp;
1204
1205 if(p_log2)
1206 *p_log2 = res;
1207 }
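
/*
 * A scalar sketch of the decomposition used above (hypothetical helper, for
 * illustration only; assumes 32-bit floats, x > 0, <math.h>, <stdint.h> and
 * <string.h>):
 *
 *    static float log2_ref(float x)
 *    {
 *       uint32_t i;
 *       float mant;
 *       int logexp;
 *       memcpy(&i, &x, sizeof i);                   // bitcast
 *       logexp = (int)((i >> 23) & 0xff) - 127;     // unbiased exponent
 *       i = (i & 0x007fffff) | 0x3f800000;          // mantissa scaled into [1, 2)
 *       memcpy(&mant, &i, sizeof mant);
 *       return logexp + log2f(mant);                // the code above instead uses
 *    }                                              // poly(mant) * (mant - 1)
 *
 * so log2(x) == exponent(x) + log2(mantissa(x)).
 */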
1208
1209
1210 LLVMValueRef
1211 lp_build_log2(struct lp_build_context *bld,
1212 LLVMValueRef x)
1213 {
1214 LLVMValueRef res;
1215 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1216 return res;
1217 }