1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expressions simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_debug.h"
49 #include "util/u_math.h"
51 #include "lp_bld_type.h"
52 #include "lp_bld_const.h"
53 #include "lp_bld_intr.h"
54 #include "lp_bld_arit.h"
55 #include "lp_bld_conv.h"
59 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder
,
60 union lp_type src_type
,
64 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(src_type
);
68 unsigned long long ubound
;
69 unsigned long long mask
;
73 assert(src_type
.floating
);
75 mantissa
= lp_mantissa(src_type
);
77 /* We cannot carry more bits than the mantissa */
78 n
= MIN2(mantissa
, dst_width
);
80 /* This magic coefficients will make the desired result to appear in the
81 * lowest significant bits of the mantissa.
83 ubound
= ((unsigned long long)1 << n
);
85 scale
= (double)mask
/ubound
;
86 bias
= (double)((unsigned long long)1 << (mantissa
- n
));
88 res
= LLVMBuildMul(builder
, src
, lp_build_const_uni(src_type
, scale
), "");
89 res
= LLVMBuildAdd(builder
, res
, lp_build_const_uni(src_type
, bias
), "");
90 res
= LLVMBuildBitCast(builder
, res
, int_vec_type
, "");
93 int shift
= dst_width
- n
;
94 res
= LLVMBuildShl(builder
, res
, lp_build_int_const_uni(src_type
, shift
), "");
96 /* Fill in the empty lower bits for added precision? */
100 msb
= LLVMBuildLShr(builder
, res
, lp_build_int_const_uni(src_type
, dst_width
- 1), "");
101 msb
= LLVMBuildShl(builder
, msb
, lp_build_int_const_uni(src_type
, shift
), "");
102 msb
= LLVMBuildSub(builder
, msb
, lp_build_int_const_uni(src_type
, 1), "");
103 res
= LLVMBuildOr(builder
, res
, msb
, "");
107 res
= LLVMBuildOr(builder
, res
, LLVMBuildLShr(builder
, res
, lp_build_int_const_uni(src_type
, n
), ""), "");
114 res
= LLVMBuildAnd(builder
, res
, lp_build_int_const_uni(src_type
, mask
), "");
121 * Inverse of lp_build_clamped_float_to_unsigned_norm.
124 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder
,
126 union lp_type dst_type
,
129 LLVMTypeRef vec_type
= lp_build_vec_type(dst_type
);
130 LLVMTypeRef int_vec_type
= lp_build_int_vec_type(dst_type
);
135 unsigned long long ubound
;
136 unsigned long long mask
;
140 mantissa
= lp_mantissa(dst_type
);
142 /* We cannot carry more bits than the mantissa */
143 n
= MIN2(mantissa
, src_width
);
145 ubound
= ((unsigned long long)1 << n
);
147 scale
= (double)ubound
/mask
;
148 bias
= (double)((unsigned long long)1 << (mantissa
- n
));
152 if(src_width
> mantissa
) {
153 int shift
= src_width
- mantissa
;
154 res
= LLVMBuildLShr(builder
, res
, lp_build_int_const_uni(dst_type
, shift
), "");
157 bias_
= lp_build_const_uni(dst_type
, bias
);
159 res
= LLVMBuildOr(builder
,
161 LLVMBuildBitCast(builder
, bias_
, int_vec_type
, ""), "");
163 res
= LLVMBuildBitCast(builder
, res
, vec_type
, "");
165 res
= LLVMBuildSub(builder
, res
, bias_
, "");
166 res
= LLVMBuildMul(builder
, res
, lp_build_const_uni(dst_type
, scale
), "");
173 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
176 lp_build_const_unpack_shuffle(unsigned n
, unsigned lo_hi
)
178 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
181 assert(n
<= LP_MAX_VECTOR_LENGTH
);
184 /* TODO: cache results in a static table */
186 for(i
= 0, j
= lo_hi
*n
/2; i
< n
; i
+= 2, ++j
) {
187 elems
[i
+ 0] = LLVMConstInt(LLVMInt32Type(), 0 + j
, 0);
188 elems
[i
+ 1] = LLVMConstInt(LLVMInt32Type(), n
+ j
, 0);
191 return LLVMConstVector(elems
, n
);
196 * Build shuffle vectors that match PACKxx instructions.
199 lp_build_const_pack_shuffle(unsigned n
)
201 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
204 assert(n
<= LP_MAX_VECTOR_LENGTH
);
206 /* TODO: cache results in a static table */
208 for(i
= 0; i
< n
; ++i
)
209 elems
[i
] = LLVMConstInt(LLVMInt32Type(), 2*i
, 0);
211 return LLVMConstVector(elems
, n
);
216 lp_build_expand(LLVMBuilderRef builder
,
217 union lp_type src_type
,
218 union lp_type dst_type
,
220 LLVMValueRef
*dst
, unsigned num_dsts
)
225 /* Register width must remain constant */
226 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
228 /* We must not loose or gain channels. Only precision */
229 assert(src_type
.length
== dst_type
.length
* num_dsts
);
234 while(src_type
.width
< dst_type
.width
) {
235 union lp_type new_type
= src_type
;
236 LLVMTypeRef new_vec_type
;
239 new_type
.length
/= 2;
240 new_vec_type
= lp_build_vec_type(new_type
);
242 for(i
= num_tmps
; i
--; ) {
244 LLVMValueRef shuffle_lo
;
245 LLVMValueRef shuffle_hi
;
249 zero
= lp_build_zero(src_type
);
250 shuffle_lo
= lp_build_const_unpack_shuffle(src_type
.length
, 0);
251 shuffle_hi
= lp_build_const_unpack_shuffle(src_type
.length
, 1);
253 /* PUNPCKLBW, PUNPCKHBW */
254 lo
= LLVMBuildShuffleVector(builder
, dst
[i
], zero
, shuffle_lo
, "");
255 hi
= LLVMBuildShuffleVector(builder
, dst
[i
], zero
, shuffle_hi
, "");
257 dst
[2*i
+ 0] = LLVMBuildBitCast(builder
, lo
, new_vec_type
, "");
258 dst
[2*i
+ 1] = LLVMBuildBitCast(builder
, hi
, new_vec_type
, "");
266 assert(num_tmps
== num_dsts
);
271 * Non-interleaved pack.
273 * lo = __ l0 __ l1 __ l2 __.. __ ln
274 * hi = __ h0 __ h1 __ h2 __.. __ hn
275 * res = l0 l1 l2 .. ln h0 h1 h2 .. hn
278 lp_build_pack2(LLVMBuilderRef builder
,
279 union lp_type src_type
,
280 union lp_type dst_type
,
284 LLVMTypeRef src_vec_type
= lp_build_vec_type(src_type
);
285 LLVMTypeRef dst_vec_type
= lp_build_vec_type(dst_type
);
286 LLVMValueRef shuffle
;
289 /* Register width must remain constant */
290 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
292 /* We must not loose or gain channels. Only precision */
293 assert(src_type
.length
* 2 == dst_type
.length
);
295 assert(!src_type
.floating
);
296 assert(!dst_type
.floating
);
298 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
299 if(src_type
.width
* src_type
.length
== 128) {
300 /* All X86 non-interleaved pack instructions all take signed inputs and
301 * saturate them, so saturate beforehand. */
303 struct lp_build_context bld
;
304 unsigned dst_bits
= dst_type
.sign
? dst_type
.width
- 1 : dst_type
.width
;
305 LLVMValueRef dst_max
= lp_build_int_const_uni(src_type
, ((unsigned long long)1 << dst_bits
) - 1);
306 lp_build_context_init(&bld
, builder
, src_type
);
307 lo
= lp_build_min(&bld
, lo
, dst_max
);
308 hi
= lp_build_min(&bld
, hi
, dst_max
);
311 switch(src_type
.width
) {
314 res
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packssdw.128", src_vec_type
, lo
, hi
);
316 /* PACKUSDW is the only instrinsic with a consistent signature */
317 return lp_build_intrinsic_binary(builder
, "llvm.x86.sse41.packusdw", dst_vec_type
, lo
, hi
);
322 res
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packsswb.128", src_vec_type
, lo
, hi
);
324 res
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packuswb.128", src_vec_type
, lo
, hi
);
329 return LLVMGetUndef(dst_vec_type
);
333 res
= LLVMBuildBitCast(builder
, res
, dst_vec_type
, "");
338 lo
= LLVMBuildBitCast(builder
, lo
, dst_vec_type
, "");
339 hi
= LLVMBuildBitCast(builder
, hi
, dst_vec_type
, "");
341 shuffle
= lp_build_const_pack_shuffle(dst_type
.length
);
343 res
= LLVMBuildShuffleVector(builder
, lo
, hi
, shuffle
, "");
350 lp_build_trunc(LLVMBuilderRef builder
,
351 union lp_type src_type
,
352 union lp_type dst_type
,
353 const LLVMValueRef
*src
, unsigned num_srcs
)
355 LLVMValueRef tmp
[LP_MAX_VECTOR_LENGTH
];
358 /* Register width must remain constant */
359 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
361 /* We must not loose or gain channels. Only precision */
362 assert(src_type
.length
* num_srcs
== dst_type
.length
);
364 for(i
= 0; i
< num_srcs
; ++i
)
367 while(src_type
.width
> dst_type
.width
) {
368 union lp_type new_type
= src_type
;
371 new_type
.length
*= 2;
373 /* Take in consideration the sign changes only in the last step */
374 if(new_type
.width
== dst_type
.width
)
375 new_type
.sign
= dst_type
.sign
;
379 for(i
= 0; i
< num_srcs
; ++i
)
380 tmp
[i
] = lp_build_pack2(builder
, src_type
, new_type
, tmp
[2*i
+ 0], tmp
[2*i
+ 1]);
385 assert(num_srcs
== 1);
392 * Convert between two SIMD types.
394 * Converting between SIMD types of different element width poses a problem:
395 * SIMD registers have a fixed number of bits, so different element widths
396 * imply different vector lengths. Therefore we must multiplex the multiple
397 * incoming sources into a single destination vector, or demux a single incoming
398 * vector into multiple vectors.
401 lp_build_conv(LLVMBuilderRef builder
,
402 union lp_type src_type
,
403 union lp_type dst_type
,
404 const LLVMValueRef
*src
, unsigned num_srcs
,
405 LLVMValueRef
*dst
, unsigned num_dsts
)
407 union lp_type tmp_type
;
408 LLVMValueRef tmp
[LP_MAX_VECTOR_LENGTH
];
412 /* Register width must remain constant */
413 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
415 /* We must not loose or gain channels. Only precision */
416 assert(src_type
.length
* num_srcs
== dst_type
.length
* num_dsts
);
418 assert(src_type
.length
<= LP_MAX_VECTOR_LENGTH
);
419 assert(dst_type
.length
<= LP_MAX_VECTOR_LENGTH
);
422 for(i
= 0; i
< num_srcs
; ++i
)
430 if(src_type
.value
!= dst_type
.value
) {
431 struct lp_build_context bld
;
432 double src_min
= lp_const_min(src_type
);
433 double dst_min
= lp_const_min(dst_type
);
434 double src_max
= lp_const_max(src_type
);
435 double dst_max
= lp_const_max(dst_type
);
438 lp_build_context_init(&bld
, builder
, tmp_type
);
440 if(src_min
< dst_min
) {
444 thres
= lp_build_const_uni(src_type
, dst_min
);
445 for(i
= 0; i
< num_tmps
; ++i
)
446 tmp
[i
] = lp_build_max(&bld
, tmp
[i
], thres
);
449 if(src_max
> dst_max
) {
453 thres
= lp_build_const_uni(src_type
, dst_max
);
454 for(i
= 0; i
< num_tmps
; ++i
)
455 tmp
[i
] = lp_build_min(&bld
, tmp
[i
], thres
);
460 * Scale to the narrowest range
463 if(dst_type
.floating
) {
466 else if(tmp_type
.floating
) {
467 if(!dst_type
.fixed
&& !dst_type
.sign
&& dst_type
.norm
) {
468 for(i
= 0; i
< num_tmps
; ++i
) {
469 tmp
[i
] = lp_build_clamped_float_to_unsigned_norm(builder
,
474 tmp_type
.floating
= FALSE
;
477 double dst_scale
= lp_const_scale(dst_type
);
478 LLVMTypeRef tmp_vec_type
;
480 if (dst_scale
!= 1.0) {
481 LLVMValueRef scale
= lp_build_const_uni(tmp_type
, dst_scale
);
482 for(i
= 0; i
< num_tmps
; ++i
)
483 tmp
[i
] = LLVMBuildMul(builder
, tmp
[i
], scale
, "");
486 /* Use an equally sized integer for intermediate computations */
487 tmp_type
.floating
= FALSE
;
488 tmp_vec_type
= lp_build_vec_type(tmp_type
);
489 for(i
= 0; i
< num_tmps
; ++i
) {
492 tmp
[i
] = LLVMBuildFPToSI(builder
, tmp
[i
], tmp_vec_type
, "");
494 tmp
[i
] = LLVMBuildFPToUI(builder
, tmp
[i
], tmp_vec_type
, "");
496 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
497 tmp
[i
] = LLVMBuildFPToSI(builder
, tmp
[i
], tmp_vec_type
, "");
503 unsigned src_shift
= lp_const_shift(src_type
);
504 unsigned dst_shift
= lp_const_shift(dst_type
);
506 /* FIXME: compensate different offsets too */
507 if(src_shift
> dst_shift
) {
508 LLVMValueRef shift
= lp_build_int_const_uni(tmp_type
, src_shift
- dst_shift
);
509 for(i
= 0; i
< num_tmps
; ++i
)
511 tmp
[i
] = LLVMBuildAShr(builder
, tmp
[i
], shift
, "");
513 tmp
[i
] = LLVMBuildLShr(builder
, tmp
[i
], shift
, "");
518 * Truncate or expand bit width
521 assert(!tmp_type
.floating
|| tmp_type
.width
== dst_type
.width
);
523 if(tmp_type
.width
> dst_type
.width
) {
524 assert(num_dsts
== 1);
525 tmp
[0] = lp_build_trunc(builder
, tmp_type
, dst_type
, tmp
, num_tmps
);
526 tmp_type
.width
= dst_type
.width
;
527 tmp_type
.length
= dst_type
.length
;
531 if(tmp_type
.width
< dst_type
.width
) {
532 assert(num_tmps
== 1);
533 lp_build_expand(builder
, tmp_type
, dst_type
, tmp
[0], tmp
, num_dsts
);
534 tmp_type
.width
= dst_type
.width
;
535 tmp_type
.length
= dst_type
.length
;
539 assert(tmp_type
.width
== dst_type
.width
);
540 assert(tmp_type
.length
== dst_type
.length
);
541 assert(num_tmps
== num_dsts
);
544 * Scale to the widest range
547 if(src_type
.floating
) {
550 else if(!src_type
.floating
&& dst_type
.floating
) {
551 if(!src_type
.fixed
&& !src_type
.sign
&& src_type
.norm
) {
552 for(i
= 0; i
< num_tmps
; ++i
) {
553 tmp
[i
] = lp_build_unsigned_norm_to_float(builder
,
558 tmp_type
.floating
= TRUE
;
561 double src_scale
= lp_const_scale(src_type
);
562 LLVMTypeRef tmp_vec_type
;
564 /* Use an equally sized integer for intermediate computations */
565 tmp_type
.floating
= TRUE
;
566 tmp_type
.sign
= TRUE
;
567 tmp_vec_type
= lp_build_vec_type(tmp_type
);
568 for(i
= 0; i
< num_tmps
; ++i
) {
571 tmp
[i
] = LLVMBuildSIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
573 tmp
[i
] = LLVMBuildUIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
575 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
576 tmp
[i
] = LLVMBuildSIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
580 if (src_scale
!= 1.0) {
581 LLVMValueRef scale
= lp_build_const_uni(tmp_type
, 1.0/src_scale
);
582 for(i
= 0; i
< num_tmps
; ++i
)
583 tmp
[i
] = LLVMBuildMul(builder
, tmp
[i
], scale
, "");
588 unsigned src_shift
= lp_const_shift(src_type
);
589 unsigned dst_shift
= lp_const_shift(dst_type
);
591 /* FIXME: compensate different offsets too */
592 if(src_shift
< dst_shift
) {
593 LLVMValueRef shift
= lp_build_int_const_uni(tmp_type
, dst_shift
- src_shift
);
594 for(i
= 0; i
< num_tmps
; ++i
)
595 tmp
[i
] = LLVMBuildShl(builder
, tmp
[i
], shift
, "");
599 for(i
= 0; i
< num_dsts
; ++i
)
605 * Convenience wrapper around lp_build_conv for bit masks.
608 lp_build_conv_mask(LLVMBuilderRef builder
,
609 union lp_type src_type
,
610 union lp_type dst_type
,
611 const LLVMValueRef
*src
, unsigned num_srcs
,
612 LLVMValueRef
*dst
, unsigned num_dsts
)
614 /* Register width must remain constant */
615 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
617 /* We must not loose or gain channels. Only precision */
618 assert(src_type
.length
* num_srcs
== dst_type
.length
* num_dsts
);
620 src_type
.floating
= FALSE
;
621 src_type
.fixed
= FALSE
;
622 src_type
.sign
= FALSE
;
623 src_type
.norm
= TRUE
;
625 dst_type
.floating
= FALSE
;
626 dst_type
.fixed
= FALSE
;
627 dst_type
.sign
= FALSE
;
628 dst_type
.norm
= TRUE
;
631 * Truncate or expand bit width
634 if(src_type
.width
> dst_type
.width
) {
635 assert(num_dsts
== 1);
636 dst
[0] = lp_build_trunc(builder
, src_type
, dst_type
, src
, num_srcs
);
638 else if(src_type
.width
< dst_type
.width
) {
639 assert(num_srcs
== 1);
640 lp_build_expand(builder
, src_type
, dst_type
, src
[0], dst
, num_dsts
);
643 assert(num_srcs
== num_dsts
);
644 memcpy(dst
, src
, num_dsts
* sizeof *dst
);