1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_debug.h"
49 #include "util/u_math.h"
51 #include "lp_bld_type.h"
52 #include "lp_bld_const.h"
53 #include "lp_bld_intr.h"
54 #include "lp_bld_arit.h"
55 #include "lp_bld_conv.h"
59 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder
,
60 union lp_type src_type
,
64 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(src_type
);
68 unsigned long long ubound
;
69 unsigned long long mask
;
73 assert(src_type
.floating
);
75 mantissa
= lp_mantissa(src_type
);
77 /* We cannot carry more bits than the mantissa */
78 n
= MIN2(mantissa
, dst_width
);
80 /* These magic coefficients will make the desired result appear in the
81 * least significant bits of the mantissa.
83 ubound
= ((unsigned long long)1 << n
);
85 scale
= (double)mask
/ubound
;
86 bias
= (double)((unsigned long long)1 << (mantissa
- n
));
88 res
= LLVMBuildMul(builder
, src
, lp_build_const_uni(src_type
, scale
), "");
89 res
= LLVMBuildAdd(builder
, res
, lp_build_const_uni(src_type
, bias
), "");
90 res
= LLVMBuildBitCast(builder
, res
, int_vec_type
, "");
93 int shift
= dst_width
- n
;
94 res
= LLVMBuildShl(builder
, res
, lp_build_int_const_uni(src_type
, shift
), "");
96 /* Fill in the empty lower bits for added precision? */
100 msb
= LLVMBuildLShr(builder
, res
, lp_build_int_const_uni(src_type
, dst_width
- 1), "");
101 msb
= LLVMBuildShl(builder
, msb
, lp_build_int_const_uni(src_type
, shift
), "");
102 msb
= LLVMBuildSub(builder
, msb
, lp_build_int_const_uni(src_type
, 1), "");
103 res
= LLVMBuildOr(builder
, res
, msb
, "");
107 res
= LLVMBuildOr(builder
, res
, LLVMBuildLShr(builder
, res
, lp_build_int_const_uni(src_type
, n
), ""), "");
114 res
= LLVMBuildAnd(builder
, res
, lp_build_int_const_uni(src_type
, mask
), "");
121 * Inverse of lp_build_clamped_float_to_unsigned_norm.
124 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder
,
126 union lp_type dst_type
,
129 LLVMTypeRef vec_type
= lp_build_vec_type(dst_type
);
130 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(dst_type
);
135 unsigned long long ubound
;
136 unsigned long long mask
;
140 mantissa
= lp_mantissa(dst_type
);
142 /* We cannot carry more bits than the mantissa */
143 n
= MIN2(mantissa
, src_width
);
145 ubound
= ((unsigned long long)1 << n
);
147 scale
= (double)ubound
/mask
;
148 bias
= (double)((unsigned long long)1 << (mantissa
- n
));
152 if(src_width
> mantissa
) {
153 int shift
= src_width
- mantissa
;
154 res
= LLVMBuildLShr(builder
, res
, lp_build_int_const_uni(dst_type
, shift
), "");
157 bias_
= lp_build_const_uni(dst_type
, bias
);
159 res
= LLVMBuildOr(builder
,
161 LLVMBuildBitCast(builder
, bias_
, int_vec_type
, ""), "");
163 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
165 res
= LLVMBuildSub(builder
, res
, bias_
, "");
166 res
= LLVMBuildMul(builder
, res
, lp_build_const_uni(dst_type
, scale
), "");
173 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
176 lp_build_const_unpack_shuffle(unsigned n
, unsigned lo_hi
)
178 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
181 assert(n
<= LP_MAX_VECTOR_LENGTH
);
184 /* TODO: cache results in a static table */
186 for(i
= 0, j
= lo_hi
*n
/2; i
< n
; i
+= 2, ++j
) {
187 elems
[i
+ 0] = LLVMConstInt(LLVMInt32Type(), 0 + j
, 0);
188 elems
[i
+ 1] = LLVMConstInt(LLVMInt32Type(), n
+ j
, 0);
191 return LLVMConstVector(elems
, n
);
196 * Build shuffle vectors that match PACKxx instructions.
199 lp_build_const_pack_shuffle(unsigned n
)
201 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
204 assert(n
<= LP_MAX_VECTOR_LENGTH
);
206 /* TODO: cache results in a static table */
208 for(i
= 0; i
< n
; ++i
)
209 elems
[i
] = LLVMConstInt(LLVMInt32Type(), 2*i
, 0);
211 return LLVMConstVector(elems
, n
);
216 lp_build_expand(LLVMBuilderRef builder
,
217 union lp_type src_type
,
218 union lp_type dst_type
,
220 LLVMValueRef
*dst
, unsigned num_dsts
)
225 /* Register width must remain constant */
226 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
228 /* We must not lose or gain channels. Only precision */
229 assert(src_type
.length
== dst_type
.length
* num_dsts
);
234 while(src_type
.width
< dst_type
.width
) {
235 union lp_type new_type
= src_type
;
236 LLVMTypeRef new_vec_type
;
239 new_type
.length
/= 2;
240 new_vec_type
= lp_build_vec_type(new_type
);
242 for(i
= num_tmps
; i
--; ) {
244 LLVMValueRef shuffle_lo
;
245 LLVMValueRef shuffle_hi
;
249 zero
= lp_build_zero(src_type
);
250 shuffle_lo
= lp_build_const_unpack_shuffle(src_type
.length
, 0);
251 shuffle_hi
= lp_build_const_unpack_shuffle(src_type
.length
, 1);
253 /* PUNPCKLBW, PUNPCKHBW */
254 lo
= LLVMBuildShuffleVector(builder
, dst
[i
], zero
, shuffle_lo
, "");
255 hi
= LLVMBuildShuffleVector(builder
, dst
[i
], zero
, shuffle_hi
, "");
257 dst
[2*i
+ 0] = LLVMBuildBitCast(builder
, lo
, new_vec_type
, "");
258 dst
[2*i
+ 1] = LLVMBuildBitCast(builder
, hi
, new_vec_type
, "");
266 assert(num_tmps
== num_dsts
);
271 * Non-interleaved pack.
273 * lo = __ l0 __ l1 __ l2 __.. __ ln
274 * hi = __ h0 __ h1 __ h2 __.. __ hn
275 * res = l0 l1 l2 .. ln h0 h1 h2 .. hn
278 lp_build_pack2(LLVMBuilderRef builder
,
279 union lp_type src_type
,
280 union lp_type dst_type
,
285 LLVMTypeRef src_vec_type
= lp_build_vec_type(src_type
);
286 LLVMTypeRef dst_vec_type
= lp_build_vec_type(dst_type
);
287 LLVMValueRef shuffle
;
290 /* Register width must remain constant */
291 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
293 /* We must not lose or gain channels. Only precision */
294 assert(src_type
.length
* 2 == dst_type
.length
);
296 assert(!src_type
.floating
);
297 assert(!dst_type
.floating
);
299 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
300 if(src_type
.width
* src_type
.length
== 128) {
301 /* All X86 non-interleaved pack instructions take signed inputs and
302 * saturate them, so saturate beforehand. */
303 if(!src_type
.sign
&& !clamped
) {
304 struct lp_build_context bld
;
305 unsigned dst_bits
= dst_type
.sign
? dst_type
.width
- 1 : dst_type
.width
;
306 LLVMValueRef dst_max
= lp_build_int_const_uni(src_type
, ((unsigned long long)1 << dst_bits
) - 1);
307 lp_build_context_init(&bld
, builder
, src_type
);
308 lo
= lp_build_min(&bld
, lo
, dst_max
);
309 hi
= lp_build_min(&bld
, hi
, dst_max
);
312 switch(src_type
.width
) {
315 res
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packssdw.128", src_vec_type
, lo
, hi
);
317 /* PACKUSDW is the only intrinsic with a consistent signature */
318 return lp_build_intrinsic_binary(builder
, "llvm.x86.sse41.packusdw", dst_vec_type
, lo
, hi
);
323 res
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packsswb.128", src_vec_type
, lo
, hi
);
325 res
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packuswb.128", src_vec_type
, lo
, hi
);
330 return LLVMGetUndef(dst_vec_type
);
334 res
= LLVMBuildBitCast(builder
, res
, dst_vec_type
, "");
339 lo
= LLVMBuildBitCast(builder
, lo
, dst_vec_type
, "");
340 hi
= LLVMBuildBitCast(builder
, hi
, dst_vec_type
, "");
342 shuffle
= lp_build_const_pack_shuffle(dst_type
.length
);
344 res
= LLVMBuildShuffleVector(builder
, lo
, hi
, shuffle
, "");
351 lp_build_trunc(LLVMBuilderRef builder
,
352 union lp_type src_type
,
353 union lp_type dst_type
,
355 const LLVMValueRef
*src
, unsigned num_srcs
)
357 LLVMValueRef tmp
[LP_MAX_VECTOR_LENGTH
];
360 /* Register width must remain constant */
361 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
363 /* We must not lose or gain channels. Only precision */
364 assert(src_type
.length
* num_srcs
== dst_type
.length
);
366 for(i
= 0; i
< num_srcs
; ++i
)
369 while(src_type
.width
> dst_type
.width
) {
370 union lp_type new_type
= src_type
;
373 new_type
.length
*= 2;
375 /* Take the sign change into consideration only in the last step */
376 if(new_type
.width
== dst_type
.width
)
377 new_type
.sign
= dst_type
.sign
;
381 for(i
= 0; i
< num_srcs
; ++i
)
382 tmp
[i
] = lp_build_pack2(builder
, src_type
, new_type
, clamped
,
383 tmp
[2*i
+ 0], tmp
[2*i
+ 1]);
388 assert(num_srcs
== 1);
395 * Convert between two SIMD types.
397 * Converting between SIMD types of different element width poses a problem:
398 * SIMD registers have a fixed number of bits, so different element widths
399 * imply different vector lengths. Therefore we must multiplex the multiple
400 * incoming sources into a single destination vector, or demux a single incoming
401 * vector into multiple vectors.
404 lp_build_conv(LLVMBuilderRef builder
,
405 union lp_type src_type
,
406 union lp_type dst_type
,
407 const LLVMValueRef
*src
, unsigned num_srcs
,
408 LLVMValueRef
*dst
, unsigned num_dsts
)
410 union lp_type tmp_type
;
411 LLVMValueRef tmp
[LP_MAX_VECTOR_LENGTH
];
415 /* Register width must remain constant */
416 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
418 /* We must not lose or gain channels. Only precision */
419 assert(src_type
.length
* num_srcs
== dst_type
.length
* num_dsts
);
421 assert(src_type
.length
<= LP_MAX_VECTOR_LENGTH
);
422 assert(dst_type
.length
<= LP_MAX_VECTOR_LENGTH
);
425 for(i
= 0; i
< num_srcs
; ++i
)
433 if(src_type
.value
!= dst_type
.value
) {
434 struct lp_build_context bld
;
435 double src_min
= lp_const_min(src_type
);
436 double dst_min
= lp_const_min(dst_type
);
437 double src_max
= lp_const_max(src_type
);
438 double dst_max
= lp_const_max(dst_type
);
441 lp_build_context_init(&bld
, builder
, tmp_type
);
443 if(src_min
< dst_min
) {
447 thres
= lp_build_const_uni(src_type
, dst_min
);
448 for(i
= 0; i
< num_tmps
; ++i
)
449 tmp
[i
] = lp_build_max(&bld
, tmp
[i
], thres
);
452 if(src_max
> dst_max
) {
456 thres
= lp_build_const_uni(src_type
, dst_max
);
457 for(i
= 0; i
< num_tmps
; ++i
)
458 tmp
[i
] = lp_build_min(&bld
, tmp
[i
], thres
);
463 * Scale to the narrowest range
466 if(dst_type
.floating
) {
469 else if(tmp_type
.floating
) {
470 if(!dst_type
.fixed
&& !dst_type
.sign
&& dst_type
.norm
) {
471 for(i
= 0; i
< num_tmps
; ++i
) {
472 tmp
[i
] = lp_build_clamped_float_to_unsigned_norm(builder
,
477 tmp_type
.floating
= FALSE
;
480 double dst_scale
= lp_const_scale(dst_type
);
481 LLVMTypeRef tmp_vec_type
;
483 if (dst_scale
!= 1.0) {
484 LLVMValueRef scale
= lp_build_const_uni(tmp_type
, dst_scale
);
485 for(i
= 0; i
< num_tmps
; ++i
)
486 tmp
[i
] = LLVMBuildMul(builder
, tmp
[i
], scale
, "");
489 /* Use an equally sized integer for intermediate computations */
490 tmp_type
.floating
= FALSE
;
491 tmp_vec_type
= lp_build_vec_type(tmp_type
);
492 for(i
= 0; i
< num_tmps
; ++i
) {
495 tmp
[i
] = LLVMBuildFPToSI(builder
, tmp
[i
], tmp_vec_type
, "");
497 tmp
[i
] = LLVMBuildFPToUI(builder
, tmp
[i
], tmp_vec_type
, "");
499 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
500 tmp
[i
] = LLVMBuildFPToSI(builder
, tmp
[i
], tmp_vec_type
, "");
506 unsigned src_shift
= lp_const_shift(src_type
);
507 unsigned dst_shift
= lp_const_shift(dst_type
);
509 /* FIXME: compensate different offsets too */
510 if(src_shift
> dst_shift
) {
511 LLVMValueRef shift
= lp_build_int_const_uni(tmp_type
, src_shift
- dst_shift
);
512 for(i
= 0; i
< num_tmps
; ++i
)
514 tmp
[i
] = LLVMBuildAShr(builder
, tmp
[i
], shift
, "");
516 tmp
[i
] = LLVMBuildLShr(builder
, tmp
[i
], shift
, "");
521 * Truncate or expand bit width
524 assert(!tmp_type
.floating
|| tmp_type
.width
== dst_type
.width
);
526 if(tmp_type
.width
> dst_type
.width
) {
527 assert(num_dsts
== 1);
528 tmp
[0] = lp_build_trunc(builder
, tmp_type
, dst_type
, TRUE
, tmp
, num_tmps
);
529 tmp_type
.width
= dst_type
.width
;
530 tmp_type
.length
= dst_type
.length
;
534 if(tmp_type
.width
< dst_type
.width
) {
535 assert(num_tmps
== 1);
536 lp_build_expand(builder
, tmp_type
, dst_type
, tmp
[0], tmp
, num_dsts
);
537 tmp_type
.width
= dst_type
.width
;
538 tmp_type
.length
= dst_type
.length
;
542 assert(tmp_type
.width
== dst_type
.width
);
543 assert(tmp_type
.length
== dst_type
.length
);
544 assert(num_tmps
== num_dsts
);
547 * Scale to the widest range
550 if(src_type
.floating
) {
553 else if(!src_type
.floating
&& dst_type
.floating
) {
554 if(!src_type
.fixed
&& !src_type
.sign
&& src_type
.norm
) {
555 for(i
= 0; i
< num_tmps
; ++i
) {
556 tmp
[i
] = lp_build_unsigned_norm_to_float(builder
,
561 tmp_type
.floating
= TRUE
;
564 double src_scale
= lp_const_scale(src_type
);
565 LLVMTypeRef tmp_vec_type
;
567 /* Use an equally sized float for intermediate computations */
568 tmp_type
.floating
= TRUE
;
569 tmp_type
.sign
= TRUE
;
570 tmp_vec_type
= lp_build_vec_type(tmp_type
);
571 for(i
= 0; i
< num_tmps
; ++i
) {
574 tmp
[i
] = LLVMBuildSIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
576 tmp
[i
] = LLVMBuildUIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
578 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
579 tmp
[i
] = LLVMBuildSIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
583 if (src_scale
!= 1.0) {
584 LLVMValueRef scale
= lp_build_const_uni(tmp_type
, 1.0/src_scale
);
585 for(i
= 0; i
< num_tmps
; ++i
)
586 tmp
[i
] = LLVMBuildMul(builder
, tmp
[i
], scale
, "");
591 unsigned src_shift
= lp_const_shift(src_type
);
592 unsigned dst_shift
= lp_const_shift(dst_type
);
594 /* FIXME: compensate different offsets too */
595 if(src_shift
< dst_shift
) {
596 LLVMValueRef shift
= lp_build_int_const_uni(tmp_type
, dst_shift
- src_shift
);
597 for(i
= 0; i
< num_tmps
; ++i
)
598 tmp
[i
] = LLVMBuildShl(builder
, tmp
[i
], shift
, "");
602 for(i
= 0; i
< num_dsts
; ++i
)
608 * Convenience wrapper around lp_build_conv for bit masks.
611 lp_build_conv_mask(LLVMBuilderRef builder
,
612 union lp_type src_type
,
613 union lp_type dst_type
,
614 const LLVMValueRef
*src
, unsigned num_srcs
,
615 LLVMValueRef
*dst
, unsigned num_dsts
)
617 /* Register width must remain constant */
618 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
620 /* We must not lose or gain channels. Only precision */
621 assert(src_type
.length
* num_srcs
== dst_type
.length
* num_dsts
);
624 * We assume all values are 0 or -1
627 src_type
.floating
= FALSE
;
628 src_type
.fixed
= FALSE
;
629 src_type
.sign
= TRUE
;
630 src_type
.norm
= FALSE
;
632 dst_type
.floating
= FALSE
;
633 dst_type
.fixed
= FALSE
;
634 dst_type
.sign
= TRUE
;
635 dst_type
.norm
= FALSE
;
638 * Truncate or expand bit width
641 if(src_type
.width
> dst_type
.width
) {
642 assert(num_dsts
== 1);
643 dst
[0] = lp_build_trunc(builder
, src_type
, dst_type
, TRUE
, src
, num_srcs
);
645 else if(src_type
.width
< dst_type
.width
) {
646 assert(num_srcs
== 1);
647 lp_build_expand(builder
, src_type
, dst_type
, src
[0], dst
, num_dsts
);
650 assert(num_srcs
== num_dsts
);
651 memcpy(dst
, src
, num_dsts
* sizeof *dst
);