llvmpipe: remove some old sampler support structs
[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_arit.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_string.h"
51 #include "util/u_cpu_detect.h"
52
53 #include "lp_bld_type.h"
54 #include "lp_bld_const.h"
55 #include "lp_bld_intr.h"
56 #include "lp_bld_logic.h"
57 #include "lp_bld_debug.h"
58 #include "lp_bld_arit.h"
59
60
61 /**
62 * Generate min(a, b)
63 * No checks for special case values of a or b = 1 or 0 are done.
64 */
65 static LLVMValueRef
66 lp_build_min_simple(struct lp_build_context *bld,
67 LLVMValueRef a,
68 LLVMValueRef b)
69 {
70 const struct lp_type type = bld->type;
71 const char *intrinsic = NULL;
72 LLVMValueRef cond;
73
74 /* TODO: optimize the constant case */
75
76 if(type.width * type.length == 128) {
77 if(type.floating) {
78 if(type.width == 32 && util_cpu_caps.has_sse)
79 intrinsic = "llvm.x86.sse.min.ps";
80 if(type.width == 64 && util_cpu_caps.has_sse2)
81 intrinsic = "llvm.x86.sse2.min.pd";
82 }
83 else {
84 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
85 intrinsic = "llvm.x86.sse2.pminu.b";
86 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
87 intrinsic = "llvm.x86.sse41.pminsb";
88 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
89 intrinsic = "llvm.x86.sse41.pminuw";
90 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
91 intrinsic = "llvm.x86.sse2.pmins.w";
92 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
93 intrinsic = "llvm.x86.sse41.pminud";
94 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
95 intrinsic = "llvm.x86.sse41.pminsd";
96 }
97 }
98
99 if(intrinsic)
100 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
101
102 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
103 return lp_build_select(bld, cond, a, b);
104 }
105
106
107 /**
108 * Generate max(a, b)
109 * No checks for special case values of a or b = 1 or 0 are done.
110 */
111 static LLVMValueRef
112 lp_build_max_simple(struct lp_build_context *bld,
113 LLVMValueRef a,
114 LLVMValueRef b)
115 {
116 const struct lp_type type = bld->type;
117 const char *intrinsic = NULL;
118 LLVMValueRef cond;
119
120 /* TODO: optimize the constant case */
121
122 if(type.width * type.length == 128) {
123 if(type.floating) {
124 if(type.width == 32 && util_cpu_caps.has_sse)
125 intrinsic = "llvm.x86.sse.max.ps";
126 if(type.width == 64 && util_cpu_caps.has_sse2)
127 intrinsic = "llvm.x86.sse2.max.pd";
128 }
129 else {
130 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
131 intrinsic = "llvm.x86.sse2.pmaxu.b";
132 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
133 intrinsic = "llvm.x86.sse41.pmaxsb";
134 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
135 intrinsic = "llvm.x86.sse41.pmaxuw";
136 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
137 intrinsic = "llvm.x86.sse2.pmaxs.w";
138 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
139 intrinsic = "llvm.x86.sse41.pmaxud";
140 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
141 intrinsic = "llvm.x86.sse41.pmaxsd";
142 }
143 }
144
145 if(intrinsic)
146 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
147
148 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
149 return lp_build_select(bld, cond, a, b);
150 }
151
152
153 /**
154 * Generate 1 - a, or ~a depending on bld->type.
155 */
156 LLVMValueRef
157 lp_build_comp(struct lp_build_context *bld,
158 LLVMValueRef a)
159 {
160 const struct lp_type type = bld->type;
161
162 if(a == bld->one)
163 return bld->zero;
164 if(a == bld->zero)
165 return bld->one;
166
167 if(type.norm && !type.floating && !type.fixed && !type.sign) {
168 if(LLVMIsConstant(a))
169 return LLVMConstNot(a);
170 else
171 return LLVMBuildNot(bld->builder, a, "");
172 }
173
174 if(LLVMIsConstant(a))
175 return LLVMConstSub(bld->one, a);
176 else
177 return LLVMBuildSub(bld->builder, bld->one, a, "");
178 }
179
180
181 /**
182 * Generate a + b
183 */
184 LLVMValueRef
185 lp_build_add(struct lp_build_context *bld,
186 LLVMValueRef a,
187 LLVMValueRef b)
188 {
189 const struct lp_type type = bld->type;
190 LLVMValueRef res;
191
192 if(a == bld->zero)
193 return b;
194 if(b == bld->zero)
195 return a;
196 if(a == bld->undef || b == bld->undef)
197 return bld->undef;
198
199 if(bld->type.norm) {
200 const char *intrinsic = NULL;
201
202 if(a == bld->one || b == bld->one)
203 return bld->one;
204
205 if(util_cpu_caps.has_sse2 &&
206 type.width * type.length == 128 &&
207 !type.floating && !type.fixed) {
208 if(type.width == 8)
209 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
210 if(type.width == 16)
211 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
212 }
213
214 if(intrinsic)
215 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
216 }
217
218 if(LLVMIsConstant(a) && LLVMIsConstant(b))
219 res = LLVMConstAdd(a, b);
220 else
221 res = LLVMBuildAdd(bld->builder, a, b, "");
222
223 /* clamp to ceiling of 1.0 */
224 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
225 res = lp_build_min_simple(bld, res, bld->one);
226
227 /* XXX clamp to floor of -1 or 0??? */
228
229 return res;
230 }
231
232
233 /**
234 * Generate a - b
235 */
236 LLVMValueRef
237 lp_build_sub(struct lp_build_context *bld,
238 LLVMValueRef a,
239 LLVMValueRef b)
240 {
241 const struct lp_type type = bld->type;
242 LLVMValueRef res;
243
244 if(b == bld->zero)
245 return a;
246 if(a == bld->undef || b == bld->undef)
247 return bld->undef;
248 if(a == b)
249 return bld->zero;
250
251 if(bld->type.norm) {
252 const char *intrinsic = NULL;
253
254 if(b == bld->one)
255 return bld->zero;
256
257 if(util_cpu_caps.has_sse2 &&
258 type.width * type.length == 128 &&
259 !type.floating && !type.fixed) {
260 if(type.width == 8)
261 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
262 if(type.width == 16)
263 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
264 }
265
266 if(intrinsic)
267 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
268 }
269
270 if(LLVMIsConstant(a) && LLVMIsConstant(b))
271 res = LLVMConstSub(a, b);
272 else
273 res = LLVMBuildSub(bld->builder, a, b, "");
274
275 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
276 res = lp_build_max_simple(bld, res, bld->zero);
277
278 return res;
279 }
280
281
282 /**
283 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
284 */
285 static LLVMValueRef
286 lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
287 {
288 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
289 unsigned i, j;
290
291 assert(n <= LP_MAX_VECTOR_LENGTH);
292 assert(lo_hi < 2);
293
294 for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
295 elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
296 elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
297 }
298
299 return LLVMConstVector(elems, n);
300 }
301
302
303 /**
304 * Build constant int vector of width 'n' and value 'c'.
305 */
306 static LLVMValueRef
307 lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
308 {
309 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
310 unsigned i;
311
312 assert(n <= LP_MAX_VECTOR_LENGTH);
313
314 for(i = 0; i < n; ++i)
315 elems[i] = LLVMConstInt(type, c, 0);
316
317 return LLVMConstVector(elems, n);
318 }
319
320
321 /**
322 * Normalized 8bit multiplication.
323 *
324 * - alpha plus one
325 *
326 * makes the following approximation to the division (Sree)
327 *
328 * a*b/255 ~= (a*(b + 1)) >> 256
329 *
330 * which is the fastest method that satisfies the following OpenGL criteria
331 *
332 * 0*0 = 0 and 255*255 = 255
333 *
334 * - geometric series
335 *
336 * takes the geometric series approximation to the division
337 *
338 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
339 *
340 * in this case just the first two terms to fit in 16bit arithmetic
341 *
342 * t/255 ~= (t + (t >> 8)) >> 8
343 *
344 * note that just by itself it doesn't satisfies the OpenGL criteria, as
345 * 255*255 = 254, so the special case b = 255 must be accounted or roundoff
346 * must be used
347 *
348 * - geometric series plus rounding
349 *
350 * when using a geometric series division instead of truncating the result
351 * use roundoff in the approximation (Jim Blinn)
352 *
353 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
354 *
355 * achieving the exact results
356 *
357 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
358 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
359 * @sa Michael Herf, The "double blend trick", May 2000,
360 * http://www.stereopsis.com/doubleblend.html
361 */
362 static LLVMValueRef
363 lp_build_mul_u8n(LLVMBuilderRef builder,
364 LLVMValueRef a, LLVMValueRef b)
365 {
366 static LLVMValueRef c01 = NULL;
367 static LLVMValueRef c08 = NULL;
368 static LLVMValueRef c80 = NULL;
369 LLVMValueRef ab;
370
371 if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
372 if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
373 if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
374
375 #if 0
376
377 /* a*b/255 ~= (a*(b + 1)) >> 256 */
378 b = LLVMBuildAdd(builder, b, c01, "");
379 ab = LLVMBuildMul(builder, a, b, "");
380
381 #else
382
383 /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
384 ab = LLVMBuildMul(builder, a, b, "");
385 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
386 ab = LLVMBuildAdd(builder, ab, c80, "");
387
388 #endif
389
390 ab = LLVMBuildLShr(builder, ab, c08, "");
391
392 return ab;
393 }
394
395
396 /**
397 * Generate a * b
398 */
399 LLVMValueRef
400 lp_build_mul(struct lp_build_context *bld,
401 LLVMValueRef a,
402 LLVMValueRef b)
403 {
404 const struct lp_type type = bld->type;
405
406 if(a == bld->zero)
407 return bld->zero;
408 if(a == bld->one)
409 return b;
410 if(b == bld->zero)
411 return bld->zero;
412 if(b == bld->one)
413 return a;
414 if(a == bld->undef || b == bld->undef)
415 return bld->undef;
416
417 if(!type.floating && !type.fixed && type.norm) {
418 if(util_cpu_caps.has_sse2 && type.width == 8 && type.length == 16) {
419 LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
420 LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
421 static LLVMValueRef ml = NULL;
422 static LLVMValueRef mh = NULL;
423 LLVMValueRef al, ah, bl, bh;
424 LLVMValueRef abl, abh;
425 LLVMValueRef ab;
426
427 if(!ml) ml = lp_build_unpack_shuffle(16, 0);
428 if(!mh) mh = lp_build_unpack_shuffle(16, 1);
429
430 /* PUNPCKLBW, PUNPCKHBW */
431 al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
432 bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
433 ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
434 bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
435
436 /* NOP */
437 al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
438 bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
439 ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
440 bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
441
442 /* PMULLW, PSRLW, PADDW */
443 abl = lp_build_mul_u8n(bld->builder, al, bl);
444 abh = lp_build_mul_u8n(bld->builder, ah, bh);
445
446 /* PACKUSWB */
447 ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , i16x8, abl, abh);
448
449 /* NOP */
450 ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
451
452 return ab;
453 }
454
455 /* FIXME */
456 assert(0);
457 }
458
459 if(LLVMIsConstant(a) && LLVMIsConstant(b))
460 return LLVMConstMul(a, b);
461
462 return LLVMBuildMul(bld->builder, a, b, "");
463 }
464
465
466 /**
467 * Generate a / b
468 */
469 LLVMValueRef
470 lp_build_div(struct lp_build_context *bld,
471 LLVMValueRef a,
472 LLVMValueRef b)
473 {
474 const struct lp_type type = bld->type;
475
476 if(a == bld->zero)
477 return bld->zero;
478 if(a == bld->one)
479 return lp_build_rcp(bld, b);
480 if(b == bld->zero)
481 return bld->undef;
482 if(b == bld->one)
483 return a;
484 if(a == bld->undef || b == bld->undef)
485 return bld->undef;
486
487 if(LLVMIsConstant(a) && LLVMIsConstant(b))
488 return LLVMConstFDiv(a, b);
489
490 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
491 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
492
493 return LLVMBuildFDiv(bld->builder, a, b, "");
494 }
495
496
497 LLVMValueRef
498 lp_build_lerp(struct lp_build_context *bld,
499 LLVMValueRef x,
500 LLVMValueRef v0,
501 LLVMValueRef v1)
502 {
503 return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
504 }
505
506
507 LLVMValueRef
508 lp_build_lerp_2d(struct lp_build_context *bld,
509 LLVMValueRef x,
510 LLVMValueRef y,
511 LLVMValueRef v00,
512 LLVMValueRef v01,
513 LLVMValueRef v10,
514 LLVMValueRef v11)
515 {
516 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
517 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
518 return lp_build_lerp(bld, y, v0, v1);
519 }
520
521
522 /**
523 * Generate min(a, b)
524 * Do checks for special cases.
525 */
526 LLVMValueRef
527 lp_build_min(struct lp_build_context *bld,
528 LLVMValueRef a,
529 LLVMValueRef b)
530 {
531 if(a == bld->undef || b == bld->undef)
532 return bld->undef;
533
534 if(a == b)
535 return a;
536
537 if(bld->type.norm) {
538 if(a == bld->zero || b == bld->zero)
539 return bld->zero;
540 if(a == bld->one)
541 return b;
542 if(b == bld->one)
543 return a;
544 }
545
546 return lp_build_min_simple(bld, a, b);
547 }
548
549
550 /**
551 * Generate max(a, b)
552 * Do checks for special cases.
553 */
554 LLVMValueRef
555 lp_build_max(struct lp_build_context *bld,
556 LLVMValueRef a,
557 LLVMValueRef b)
558 {
559 if(a == bld->undef || b == bld->undef)
560 return bld->undef;
561
562 if(a == b)
563 return a;
564
565 if(bld->type.norm) {
566 if(a == bld->one || b == bld->one)
567 return bld->one;
568 if(a == bld->zero)
569 return b;
570 if(b == bld->zero)
571 return a;
572 }
573
574 return lp_build_max_simple(bld, a, b);
575 }
576
577
578 /**
579 * Generate abs(a)
580 */
581 LLVMValueRef
582 lp_build_abs(struct lp_build_context *bld,
583 LLVMValueRef a)
584 {
585 const struct lp_type type = bld->type;
586 LLVMTypeRef vec_type = lp_build_vec_type(type);
587
588 if(!type.sign)
589 return a;
590
591 if(type.floating) {
592 /* Mask out the sign bit */
593 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
594 LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << type.width) - 1);
595 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
596 a = LLVMBuildAnd(bld->builder, a, mask, "");
597 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
598 return a;
599 }
600
601 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
602 switch(type.width) {
603 case 8:
604 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
605 case 16:
606 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
607 case 32:
608 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
609 }
610 }
611
612 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
613 }
614
615
616 LLVMValueRef
617 lp_build_sgn(struct lp_build_context *bld,
618 LLVMValueRef a)
619 {
620 const struct lp_type type = bld->type;
621 LLVMTypeRef vec_type = lp_build_vec_type(type);
622 LLVMValueRef cond;
623 LLVMValueRef res;
624
625 /* Handle non-zero case */
626 if(!type.sign) {
627 /* if not zero then sign must be positive */
628 res = bld->one;
629 }
630 else if(type.floating) {
631 /* Take the sign bit and add it to 1 constant */
632 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
633 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
634 LLVMValueRef sign;
635 LLVMValueRef one;
636 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
637 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
638 one = LLVMConstBitCast(bld->one, int_vec_type);
639 res = LLVMBuildOr(bld->builder, sign, one, "");
640 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
641 }
642 else
643 {
644 LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
645 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
646 res = lp_build_select(bld, cond, bld->one, minus_one);
647 }
648
649 /* Handle zero */
650 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
651 res = lp_build_select(bld, cond, bld->zero, bld->one);
652
653 return res;
654 }
655
656
/**
 * Rounding modes passed as the immediate operand of the SSE4.1
 * llvm.x86.sse41.round.* intrinsics (see lp_build_round_sse41).
 */
enum lp_build_round_sse41_mode
{
   LP_BUILD_ROUND_SSE41_NEAREST = 0,   /* round to nearest */
   LP_BUILD_ROUND_SSE41_FLOOR = 1,     /* round toward -inf */
   LP_BUILD_ROUND_SSE41_CEIL = 2,      /* round toward +inf */
   LP_BUILD_ROUND_SSE41_TRUNCATE = 3   /* round toward zero */
};
664
665
/**
 * Round a 128-bit float vector with the SSE4.1 ROUNDPS/ROUNDPD
 * instruction, in the given rounding mode.
 *
 * Caller must guarantee a floating 128-bit type and SSE4.1 support
 * (asserted below).
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   const char *intrinsic;

   assert(type.floating);
   assert(type.width*type.length == 128);
   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   switch(type.width) {
   case 32:
      intrinsic = "llvm.x86.sse41.round.ps";
      break;
   case 64:
      intrinsic = "llvm.x86.sse41.round.pd";
      break;
   default:
      assert(0);
      return bld->undef;
   }

   /* The mode is the instruction's immediate operand. */
   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
}
695
696
697 LLVMValueRef
698 lp_build_trunc(struct lp_build_context *bld,
699 LLVMValueRef a)
700 {
701 const struct lp_type type = bld->type;
702
703 assert(type.floating);
704 assert(lp_check_value(type, a));
705
706 if(util_cpu_caps.has_sse4_1)
707 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
708 else {
709 LLVMTypeRef vec_type = lp_build_vec_type(type);
710 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
711 LLVMValueRef res;
712 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
713 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
714 return res;
715 }
716 }
717
718
719 LLVMValueRef
720 lp_build_round(struct lp_build_context *bld,
721 LLVMValueRef a)
722 {
723 const struct lp_type type = bld->type;
724
725 assert(type.floating);
726 assert(lp_check_value(type, a));
727
728 if(util_cpu_caps.has_sse4_1)
729 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
730 else {
731 LLVMTypeRef vec_type = lp_build_vec_type(type);
732 LLVMValueRef res;
733 res = lp_build_iround(bld, a);
734 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
735 return res;
736 }
737 }
738
739
740 LLVMValueRef
741 lp_build_floor(struct lp_build_context *bld,
742 LLVMValueRef a)
743 {
744 const struct lp_type type = bld->type;
745
746 assert(type.floating);
747
748 if(util_cpu_caps.has_sse4_1)
749 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
750 else {
751 LLVMTypeRef vec_type = lp_build_vec_type(type);
752 LLVMValueRef res;
753 res = lp_build_ifloor(bld, a);
754 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
755 return res;
756 }
757 }
758
759
760 LLVMValueRef
761 lp_build_ceil(struct lp_build_context *bld,
762 LLVMValueRef a)
763 {
764 const struct lp_type type = bld->type;
765
766 assert(type.floating);
767 assert(lp_check_value(type, a));
768
769 if(util_cpu_caps.has_sse4_1)
770 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
771 else {
772 LLVMTypeRef vec_type = lp_build_vec_type(type);
773 LLVMValueRef res;
774 res = lp_build_iceil(bld, a);
775 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
776 return res;
777 }
778 }
779
780
781 /**
782 * Convert to integer, through whichever rounding method that's fastest,
783 * typically truncating to zero.
784 */
785 LLVMValueRef
786 lp_build_itrunc(struct lp_build_context *bld,
787 LLVMValueRef a)
788 {
789 const struct lp_type type = bld->type;
790 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
791
792 assert(type.floating);
793 assert(lp_check_value(type, a));
794
795 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
796 }
797
798
/**
 * Round to nearest integer, returning an integer vector.
 *
 * Without SSE4.1 this adds +/-0.5 (with the sign copied from 'a') before
 * truncating, so that truncation toward zero becomes round-to-nearest.
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      /* Mask selecting only the sign bit of each lane. */
      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef half;

      /* get sign bit */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");

      /* half = copysign(0.5, a), built by OR-ing the sign bit into 0.5 */
      half = lp_build_const_scalar(type, 0.5);
      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
      half = LLVMBuildOr(bld->builder, sign, half, "");
      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");

      res = LLVMBuildAdd(bld->builder, a, half, "");
   }

   /* Truncate toward zero; combined with the bias above this rounds to
    * nearest on the non-SSE4.1 path. */
   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}
836
837
/**
 * Round toward -inf, returning an integer vector.
 *
 * Without SSE4.1: negative lanes get -0.99999(9) added before truncating
 * toward zero, which turns the truncation into a floor.
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      /* Bias negative lanes so truncation toward zero floors them. */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      unsigned mantissa = lp_mantissa(type);
      /* Mask selecting only the sign bit of each lane. */
      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef offset;

      /* sign = a < 0 ? ~0 : 0  (arithmetic shift replicates the sign bit) */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");

      /* offset = -0.99999(9)f, the largest value below 1 the mantissa holds */
      offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
      offset = LLVMConstBitCast(offset, int_vec_type);

      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");

      res = LLVMBuildAdd(bld->builder, a, offset, "");
   }

   /* Truncate toward zero; with the bias above this is a floor. */
   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}
880
881
882 LLVMValueRef
883 lp_build_iceil(struct lp_build_context *bld,
884 LLVMValueRef a)
885 {
886 const struct lp_type type = bld->type;
887 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
888 LLVMValueRef res;
889
890 assert(type.floating);
891 assert(lp_check_value(type, a));
892
893 if(util_cpu_caps.has_sse4_1) {
894 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
895 }
896 else {
897 assert(0);
898 res = bld->undef;
899 }
900
901 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
902
903 return res;
904 }
905
906
/**
 * Generate sqrt(a) via the generic llvm.sqrt.* intrinsic for this
 * vector type.
 */
LLVMValueRef
lp_build_sqrt(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   char intrinsic[32];

   /* TODO: optimize the constant case */

   assert(type.floating);
   /* Intrinsic name encodes the vector shape, e.g. llvm.sqrt.v4f32. */
   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);

   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
}
923
924
925 LLVMValueRef
926 lp_build_rcp(struct lp_build_context *bld,
927 LLVMValueRef a)
928 {
929 const struct lp_type type = bld->type;
930
931 if(a == bld->zero)
932 return bld->undef;
933 if(a == bld->one)
934 return bld->one;
935 if(a == bld->undef)
936 return bld->undef;
937
938 assert(type.floating);
939
940 if(LLVMIsConstant(a))
941 return LLVMConstFDiv(bld->one, a);
942
943 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
944 /* FIXME: improve precision */
945 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
946
947 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
948 }
949
950
951 /**
952 * Generate 1/sqrt(a)
953 */
954 LLVMValueRef
955 lp_build_rsqrt(struct lp_build_context *bld,
956 LLVMValueRef a)
957 {
958 const struct lp_type type = bld->type;
959
960 assert(type.floating);
961
962 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
963 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
964
965 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
966 }
967
968
969 /**
970 * Generate cos(a)
971 */
972 LLVMValueRef
973 lp_build_cos(struct lp_build_context *bld,
974 LLVMValueRef a)
975 {
976 const struct lp_type type = bld->type;
977 LLVMTypeRef vec_type = lp_build_vec_type(type);
978 char intrinsic[32];
979
980 /* TODO: optimize the constant case */
981
982 assert(type.floating);
983 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
984
985 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
986 }
987
988
989 /**
990 * Generate sin(a)
991 */
992 LLVMValueRef
993 lp_build_sin(struct lp_build_context *bld,
994 LLVMValueRef a)
995 {
996 const struct lp_type type = bld->type;
997 LLVMTypeRef vec_type = lp_build_vec_type(type);
998 char intrinsic[32];
999
1000 /* TODO: optimize the constant case */
1001
1002 assert(type.floating);
1003 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
1004
1005 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1006 }
1007
1008
1009 /**
1010 * Generate pow(x, y)
1011 */
1012 LLVMValueRef
1013 lp_build_pow(struct lp_build_context *bld,
1014 LLVMValueRef x,
1015 LLVMValueRef y)
1016 {
1017 /* TODO: optimize the constant case */
1018 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1019 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1020 __FUNCTION__);
1021
1022 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1023 }
1024
1025
1026 /**
1027 * Generate exp(x)
1028 */
1029 LLVMValueRef
1030 lp_build_exp(struct lp_build_context *bld,
1031 LLVMValueRef x)
1032 {
1033 /* log2(e) = 1/log(2) */
1034 LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
1035
1036 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1037 }
1038
1039
1040 /**
1041 * Generate log(x)
1042 */
1043 LLVMValueRef
1044 lp_build_log(struct lp_build_context *bld,
1045 LLVMValueRef x)
1046 {
1047 /* log(2) */
1048 LLVMValueRef log2 = lp_build_const_scalar(bld->type, 1.4426950408889634);
1049
1050 return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1051 }
1052
1053
1054 #define EXP_POLY_DEGREE 3
1055 #define LOG_POLY_DEGREE 5
1056
1057
1058 /**
1059 * Generate polynomial.
1060 * Ex: x^2 * coeffs[0] + x * coeffs[1] + coeffs[2].
1061 */
1062 static LLVMValueRef
1063 lp_build_polynomial(struct lp_build_context *bld,
1064 LLVMValueRef x,
1065 const double *coeffs,
1066 unsigned num_coeffs)
1067 {
1068 const struct lp_type type = bld->type;
1069 LLVMValueRef res = NULL;
1070 unsigned i;
1071
1072 /* TODO: optimize the constant case */
1073 if(LLVMIsConstant(x))
1074 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1075 __FUNCTION__);
1076
1077 for (i = num_coeffs; i--; ) {
1078 LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
1079 if(res)
1080 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1081 else
1082 res = coeff;
1083 }
1084
1085 if(res)
1086 return res;
1087 else
1088 return bld->undef;
1089 }
1090
1091
1092 /**
1093 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
1094 */
1095 const double lp_build_exp2_polynomial[] = {
1096 #if EXP_POLY_DEGREE == 5
1097 9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
1098 #elif EXP_POLY_DEGREE == 4
1099 1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
1100 #elif EXP_POLY_DEGREE == 3
1101 9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
1102 #elif EXP_POLY_DEGREE == 2
1103 1.0017247, 6.5763628e-1, 3.3718944e-1
1104 #else
1105 #error
1106 #endif
1107 };
1108
1109
/**
 * Approximate 2^x for 4 x float vectors.
 *
 * Splits x into integer and fractional parts; 2^ipart is built directly
 * as an IEEE single-precision bit pattern (exponent field), and 2^fpart
 * comes from a minimax polynomial. Any of the three outputs may be NULL
 * to skip the corresponding computation.
 *
 * @param p_exp2_int_part  if non-NULL, receives 2^ipart
 * @param p_frac_part      if non-NULL, receives fpart = x - ipart
 * @param p_exp2           if non-NULL, receives the full 2^x approximation
 */
void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if(LLVMIsConstant(x))
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);

      /* The bit-pattern construction below assumes IEEE single precision
       * (8-bit exponent, bias 127, 23-bit mantissa). */
      assert(type.floating && type.width == 32);

      /* Clamp so the biased exponent (ipart + 127) stays representable. */
      x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
      x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));

      /* ipart = int(x - 0.5) */
      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");

      /* fpart = x - ipart */
      fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
      fpart = LLVMBuildSub(bld->builder, x, fpart, "");
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart), built by placing the biased
       * exponent (ipart + 127) into the float's exponent field (bits 23+). */
      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      /* 2^fpart via minimax polynomial, fpart in [-0.5, 0.5[ */
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      /* 2^x = 2^ipart * 2^fpart */
      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
1169
1170
1171 LLVMValueRef
1172 lp_build_exp2(struct lp_build_context *bld,
1173 LLVMValueRef x)
1174 {
1175 LLVMValueRef res;
1176 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1177 return res;
1178 }
1179
1180
1181 /**
1182 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1183 * These coefficients can be generate with
1184 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1185 */
1186 const double lp_build_log2_polynomial[] = {
1187 #if LOG_POLY_DEGREE == 6
1188 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
1189 #elif LOG_POLY_DEGREE == 5
1190 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
1191 #elif LOG_POLY_DEGREE == 4
1192 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
1193 #elif LOG_POLY_DEGREE == 3
1194 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
1195 #else
1196 #error
1197 #endif
1198 };
1199
1200
1201 /**
1202 * See http://www.devmaster.net/forums/showthread.php?p=43580
1203 */
1204 void
1205 lp_build_log2_approx(struct lp_build_context *bld,
1206 LLVMValueRef x,
1207 LLVMValueRef *p_exp,
1208 LLVMValueRef *p_floor_log2,
1209 LLVMValueRef *p_log2)
1210 {
1211 const struct lp_type type = bld->type;
1212 LLVMTypeRef vec_type = lp_build_vec_type(type);
1213 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1214
1215 LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
1216 LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
1217 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1218
1219 LLVMValueRef i = NULL;
1220 LLVMValueRef exp = NULL;
1221 LLVMValueRef mant = NULL;
1222 LLVMValueRef logexp = NULL;
1223 LLVMValueRef logmant = NULL;
1224 LLVMValueRef res = NULL;
1225
1226 if(p_exp || p_floor_log2 || p_log2) {
1227 /* TODO: optimize the constant case */
1228 if(LLVMIsConstant(x))
1229 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1230 __FUNCTION__);
1231
1232 assert(type.floating && type.width == 32);
1233
1234 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1235
1236 /* exp = (float) exponent(x) */
1237 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1238 }
1239
1240 if(p_floor_log2 || p_log2) {
1241 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
1242 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
1243 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1244 }
1245
1246 if(p_log2) {
1247 /* mant = (float) mantissa(x) */
1248 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1249 mant = LLVMBuildOr(bld->builder, mant, one, "");
1250 mant = LLVMBuildSIToFP(bld->builder, mant, vec_type, "");
1251
1252 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1253 Elements(lp_build_log2_polynomial));
1254
1255 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1256 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildMul(bld->builder, mant, bld->one, ""), "");
1257
1258 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1259 }
1260
1261 if(p_exp)
1262 *p_exp = exp;
1263
1264 if(p_floor_log2)
1265 *p_floor_log2 = logexp;
1266
1267 if(p_log2)
1268 *p_log2 = res;
1269 }
1270
1271
1272 LLVMValueRef
1273 lp_build_log2(struct lp_build_context *bld,
1274 LLVMValueRef x)
1275 {
1276 LLVMValueRef res;
1277 lp_build_log2_approx(bld, x, NULL, NULL, &res);
1278 return res;
1279 }