1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_debug.h"
50 #include "lp_bld_type.h"
51 #include "lp_bld_const.h"
52 #include "lp_bld_intr.h"
53 #include "lp_bld_conv.h"
57 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
60 lp_build_const_expand_shuffle(unsigned n
, unsigned lo_hi
)
62 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
65 assert(n
<= LP_MAX_VECTOR_LENGTH
);
68 /* TODO: cache results in a static table */
70 for(i
= 0, j
= lo_hi
*n
/2; i
< n
; i
+= 2, ++j
) {
71 elems
[i
+ 0] = LLVMConstInt(LLVMInt32Type(), 0 + j
, 0);
72 elems
[i
+ 1] = LLVMConstInt(LLVMInt32Type(), n
+ j
, 0);
75 return LLVMConstVector(elems
, n
);
80 lp_build_expand(LLVMBuilderRef builder
,
81 union lp_type src_type
,
82 union lp_type dst_type
,
84 LLVMValueRef
*dst
, unsigned num_dsts
)
89 /* Register width must remain constant */
90 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
92 /* We must not loose or gain channels. Only precision */
93 assert(src_type
.length
== dst_type
.length
* num_dsts
);
98 while(src_type
.width
< dst_type
.width
) {
99 union lp_type new_type
= src_type
;
100 LLVMTypeRef new_vec_type
;
103 new_type
.length
/= 2;
104 new_vec_type
= lp_build_vec_type(new_type
);
106 for(i
= num_tmps
; i
--; ) {
108 LLVMValueRef shuffle_lo
;
109 LLVMValueRef shuffle_hi
;
113 zero
= lp_build_zero(src_type
);
114 shuffle_lo
= lp_build_const_expand_shuffle(src_type
.length
, 0);
115 shuffle_hi
= lp_build_const_expand_shuffle(src_type
.length
, 1);
117 /* PUNPCKLBW, PUNPCKHBW */
118 lo
= LLVMBuildShuffleVector(builder
, dst
[i
], zero
, shuffle_lo
, "");
119 hi
= LLVMBuildShuffleVector(builder
, dst
[i
], zero
, shuffle_hi
, "");
121 dst
[2*i
+ 0] = LLVMBuildBitCast(builder
, lo
, new_vec_type
, "");
122 dst
[2*i
+ 1] = LLVMBuildBitCast(builder
, hi
, new_vec_type
, "");
130 assert(num_tmps
== num_dsts
);
135 lp_build_trunc(LLVMBuilderRef builder
,
136 union lp_type src_type
,
137 union lp_type dst_type
,
138 const LLVMValueRef
*src
, unsigned num_srcs
)
140 LLVMValueRef tmp
[LP_MAX_VECTOR_LENGTH
];
143 /* Register width must remain constant */
144 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
146 /* We must not loose or gain channels. Only precision */
147 assert(src_type
.length
* num_srcs
== dst_type
.length
);
149 for(i
= 0; i
< num_srcs
; ++i
)
152 while(src_type
.width
> dst_type
.width
) {
153 LLVMTypeRef tmp_vec_type
= lp_build_vec_type(src_type
);
154 union lp_type new_type
= src_type
;
155 LLVMTypeRef new_vec_type
;
158 new_type
.length
*= 2;
159 new_vec_type
= lp_build_vec_type(new_type
);
161 for(i
= 0; i
< num_srcs
/2; ++i
) {
162 LLVMValueRef lo
= tmp
[2*i
+ 0];
163 LLVMValueRef hi
= tmp
[2*i
+ 1];
164 LLVMValueRef packed
= NULL
;
166 if(src_type
.width
== 32) {
167 /* FIXME: we only have a packed signed intrinsic */
168 packed
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packssdw.128", tmp_vec_type
, lo
, hi
);
170 else if(src_type
.width
== 16) {
172 packed
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packsswb.128", tmp_vec_type
, lo
, hi
);
174 packed
= lp_build_intrinsic_binary(builder
, "llvm.x86.sse2.packuswb.128", tmp_vec_type
, lo
, hi
);
179 tmp
[i
] = LLVMBuildBitCast(builder
, packed
, new_vec_type
, "");
187 assert(num_srcs
== 1);
194 * Convert between two SIMD types.
196 * Converting between SIMD types of different element width poses a problem:
197 * SIMD registers have a fixed number of bits, so different element widths
198 * imply different vector lengths. Therefore we must multiplex the multiple
199 * incoming sources into a single destination vector, or demux a single incoming
200 * vector into multiple vectors.
203 lp_build_conv(LLVMBuilderRef builder
,
204 union lp_type src_type
,
205 union lp_type dst_type
,
206 const LLVMValueRef
*src
, unsigned num_srcs
,
207 LLVMValueRef
*dst
, unsigned num_dsts
)
209 union lp_type tmp_type
;
210 LLVMValueRef tmp
[LP_MAX_VECTOR_LENGTH
];
214 /* Register width must remain constant */
215 assert(src_type
.width
* src_type
.length
== dst_type
.width
* dst_type
.length
);
217 /* We must not loose or gain channels. Only precision */
218 assert(src_type
.length
* num_srcs
== dst_type
.length
* num_dsts
);
220 assert(src_type
.length
<= LP_MAX_VECTOR_LENGTH
);
221 assert(dst_type
.length
<= LP_MAX_VECTOR_LENGTH
);
224 for(i
= 0; i
< num_srcs
; ++i
)
232 if(!tmp_type
.norm
&& dst_type
.norm
) {
237 * Scale to the narrowest range
240 if(dst_type
.floating
) {
243 else if(tmp_type
.floating
) {
244 double dst_scale
= lp_const_scale(dst_type
);
245 LLVMTypeRef tmp_vec_type
;
247 if (dst_scale
!= 1.0) {
248 LLVMValueRef scale
= lp_build_const_uni(tmp_type
, dst_scale
);
249 for(i
= 0; i
< num_tmps
; ++i
)
250 tmp
[i
] = LLVMBuildMul(builder
, tmp
[i
], scale
, "");
253 /* Use an equally sized integer for intermediate computations */
254 tmp_type
.floating
= FALSE
;
255 tmp_vec_type
= lp_build_vec_type(tmp_type
);
256 for(i
= 0; i
< num_tmps
; ++i
) {
259 tmp
[i
] = LLVMBuildFPToSI(builder
, tmp
[i
], tmp_vec_type
, "");
261 tmp
[i
] = LLVMBuildFPToUI(builder
, tmp
[i
], tmp_vec_type
, "");
263 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
264 tmp
[i
] = LLVMBuildFPToSI(builder
, tmp
[i
], tmp_vec_type
, "");
269 unsigned src_shift
= lp_const_shift(src_type
);
270 unsigned dst_shift
= lp_const_shift(dst_type
);
272 /* FIXME: compensate different offsets too */
273 if(src_shift
> dst_shift
) {
274 LLVMValueRef shift
= lp_build_int_const_uni(tmp_type
, src_shift
- dst_shift
);
275 for(i
= 0; i
< num_tmps
; ++i
)
277 tmp
[i
] = LLVMBuildAShr(builder
, tmp
[i
], shift
, "");
279 tmp
[i
] = LLVMBuildLShr(builder
, tmp
[i
], shift
, "");
284 * Truncate or expand bit width
287 assert(!tmp_type
.floating
);
289 if(tmp_type
.width
> dst_type
.width
) {
290 assert(num_dsts
== 1);
291 tmp
[0] = lp_build_trunc(builder
, tmp_type
, dst_type
, tmp
, num_tmps
);
292 tmp_type
.width
= dst_type
.width
;
293 tmp_type
.length
= dst_type
.length
;
297 if(tmp_type
.width
< dst_type
.width
) {
298 assert(num_tmps
== 1);
299 lp_build_expand(builder
, tmp_type
, dst_type
, tmp
[0], tmp
, num_dsts
);
300 tmp_type
.width
= dst_type
.width
;
301 tmp_type
.length
= dst_type
.length
;
305 assert(tmp_type
.width
== dst_type
.width
);
306 assert(tmp_type
.length
== dst_type
.length
);
307 assert(num_tmps
== num_dsts
);
310 * Scale to the widest range
313 if(src_type
.floating
) {
316 else if(!src_type
.floating
&& dst_type
.floating
) {
317 double src_scale
= lp_const_scale(src_type
);
318 LLVMTypeRef tmp_vec_type
;
320 /* Use an equally sized integer for intermediate computations */
321 tmp_type
.floating
= TRUE
;
322 tmp_type
.sign
= TRUE
;
323 tmp_vec_type
= lp_build_vec_type(tmp_type
);
324 for(i
= 0; i
< num_tmps
; ++i
) {
327 tmp
[i
] = LLVMBuildSIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
329 tmp
[i
] = LLVMBuildUIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
331 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
332 tmp
[i
] = LLVMBuildSIToFP(builder
, tmp
[i
], tmp_vec_type
, "");
336 if (src_scale
!= 1.0) {
337 LLVMValueRef scale
= lp_build_const_uni(tmp_type
, 1.0/src_scale
);
338 for(i
= 0; i
< num_tmps
; ++i
)
339 tmp
[i
] = LLVMBuildMul(builder
, tmp
[i
], scale
, "");
343 unsigned src_shift
= lp_const_shift(src_type
);
344 unsigned dst_shift
= lp_const_shift(dst_type
);
346 /* FIXME: compensate different offsets too */
347 if(src_shift
< dst_shift
) {
348 LLVMValueRef shift
= lp_build_int_const_uni(tmp_type
, src_shift
- dst_shift
);
349 for(i
= 0; i
< num_tmps
; ++i
)
350 tmp
[i
] = LLVMBuildShl(builder
, tmp
[i
], shift
, "");
354 for(i
= 0; i
< num_dsts
; ++i
)