src/gallium/auxiliary/gallivm/lp_bld_pack.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for packing/unpacking.
 *
 * Packing/unpacking is necessary for conversion between types of different
 * bit width.
 *
 * They are also commonly used when a computation needs higher
 * precision for the intermediate values. For example, if one needs the
 * function:
 *
 *    c = compute(a, b);
 *
 * to use more precision for intermediate results, then one should implement
 * it as:
 *
 *    LLVMValueRef
 *    compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
 *    {
 *       struct lp_type wide_type = lp_wider_type(type);
 *       LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *       lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
 *       lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
 *
 *       cl = compute_half(al, bl);
 *       ch = compute_half(ah, bh);
 *
 *       c = lp_build_pack2(builder, wide_type, type, cl, ch);
 *
 *       return c;
 *    }
 *
 * where compute_half() would do the computation for half the elements with
 * twice the precision.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"

/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
   }

   return LLVMConstVector(elems, n);
}
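
/*
 * For example (illustrative only), lp_build_const_unpack_shuffle(8, 0)
 * yields the shuffle mask <0, 8, 1, 9, 2, 10, 3, 11>, and
 * lp_build_const_unpack_shuffle(8, 1) yields <4, 12, 5, 13, 6, 14, 7, 15>,
 * matching the element selection of PUNPCKLxx and PUNPCKHxx respectively.
 */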


/**
 * Build shuffle vectors that match PACKxx instructions.
 */
static LLVMValueRef
lp_build_const_pack_shuffle(unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   /* TODO: cache results in a static table */

   for(i = 0; i < n; ++i)
      elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);

   return LLVMConstVector(elems, n);
}
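
/*
 * For example (illustrative only), lp_build_const_pack_shuffle(8) yields
 * <0, 2, 4, 6, 8, 10, 12, 14>, i.e. the even elements of the two
 * concatenated input vectors.
 */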


/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions.
 */
LLVMValueRef
lp_build_interleave2(LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   shuffle = lp_build_const_unpack_shuffle(type.length, lo_hi);

   return LLVMBuildShuffleVector(builder, a, b, shuffle, "");
}
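
/*
 * For example (illustrative only), with type.length == 4,
 *
 *    lp_build_interleave2(builder, type, <a0 a1 a2 a3>, <b0 b1 b2 b3>, 0)
 *
 * yields <a0 b0 a1 b1>, while lo_hi == 1 yields <a2 b2 a3 b3>.
 */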


/**
 * Double the bit width.
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack2(LLVMBuilderRef builder,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1);
#else
   *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}
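
/*
 * A minimal usage sketch (the type names are illustrative, not part of this
 * file): widening an 8 x i16 vector into two 4 x i32 halves:
 *
 *    struct lp_type type32 = lp_wider_type(type16);
 *    LLVMValueRef lo, hi;
 *
 *    lp_build_unpack2(builder, type16, type32, src, &lo, &hi);
 */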


/**
 * Expand the bit width.
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack(LLVMBuilderRef builder,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width *= 2;
      tmp_type.length /= 2;

      /* Iterate in reverse so that dst[i] is consumed before dst[2*i + 0]
       * and dst[2*i + 1] overwrite it */
      for(i = num_tmps; i--; ) {
         lp_build_unpack2(builder, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
      }

      src_type = tmp_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}
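
/*
 * A usage sketch (illustrative): expanding a 16 x i8 vector to four
 * 4 x i32 vectors, where type8 is 16 x i8 and type32 is 4 x i32, both
 * 128 bits wide:
 *
 *    LLVMValueRef dst[4];
 *
 *    lp_build_unpack(builder, type8, type32, src, dst, 4);
 */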


/**
 * Non-interleaved pack.
 *
 * This will move values as:
 *
 *    lo = __ l0 __ l1 __ l2 __ .. __ ln
 *    hi = __ h0 __ h1 __ h2 __ .. __ hn
 *    res = l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead if the values may be out of range.
 */
LLVMValueRef
lp_build_pack2(LLVMBuilderRef builder,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
#if HAVE_LLVM < 0x0207
   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
#endif
   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
      switch(src_type.width) {
      case 32:
         if(dst_type.sign) {
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
#endif
         }
         else {
            if (util_cpu_caps.has_sse4_1) {
               return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
            }
            else {
               /* use generic shuffle below */
               res = NULL;
            }
         }
         break;

      case 16:
         if(dst_type.sign) {
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
#endif
         }
         else {
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
#endif
         }
         break;

      default:
         assert(0);
         return LLVMGetUndef(dst_vec_type);
      }

      if (res) {
         res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
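
/*
 * A usage sketch (illustrative): packing two 4 x i32 vectors into a single
 * 8 x i16 vector, which maps to PACKSSDW on SSE2 when both types are
 * signed:
 *
 *    res = lp_build_pack2(builder, type32, type16, lo, hi);
 */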


/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2 but will saturate values so that they fit into the
 * destination type.
 */
LLVMValueRef
lp_build_packs2(LLVMBuilderRef builder,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
      src_type.width * src_type.length == 128 &&
      src_type.sign)
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(src_type, ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, builder, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about lower bound? */
   }

   return lp_build_pack2(builder, src_type, dst_type, lo, hi);
}
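
/*
 * For example, when packing unsigned 32-bit values down to unsigned 16-bit
 * values, dst_bits is 16 and dst_max is 0xffff, so both inputs are clamped
 * to [0, 65535] before lp_build_pack2 is called.
 */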


/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
 */
LLVMValueRef
lp_build_pack(LLVMBuilderRef builder,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef (*pack2)(LLVMBuilderRef builder,
                         struct lp_type src_type,
                         struct lp_type dst_type,
                         LLVMValueRef lo,
                         LLVMValueRef hi);
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length);

   if(clamped)
      pack2 = &lp_build_pack2;
   else
      pack2 = &lp_build_packs2;

   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width /= 2;
      tmp_type.length *= 2;

      /* Take the sign change into consideration only in the last step */
      if(tmp_type.width == dst_type.width)
         tmp_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = pack2(builder, src_type, tmp_type, tmp[2*i + 0], tmp[2*i + 1]);

      src_type = tmp_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}
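
/*
 * A usage sketch (illustrative): truncating four 4 x i32 vectors down to a
 * single 16 x i8 vector in two pack2 steps, saturating at each step since
 * clamped is FALSE:
 *
 *    LLVMValueRef res = lp_build_pack(builder, type32, type8, FALSE, src, 4);
 */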


/**
 * Truncate or expand the bitwidth.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
void
lp_build_resize(LLVMBuilderRef builder,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
   assert(num_srcs == 1 || num_dsts == 1);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */

         tmp[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         /*
          * Do it element-wise.
          */

         assert(src_type.length == dst_type.length);
         tmp[0] = lp_build_undef(dst_type);
         for (i = 0; i < dst_type.length; ++i) {
            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
            val = LLVMBuildTrunc(builder, val, lp_build_elem_type(dst_type), "");
            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(builder, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */

         assert(src_type.length == dst_type.length);
         tmp[0] = lp_build_undef(dst_type);
         for (i = 0; i < dst_type.length; ++i) {
            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(dst_type), "");
            }
            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      assert(num_srcs == 1);
      assert(num_dsts == 1);

      tmp[0] = src[0];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}
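
/*
 * A usage sketch (illustrative): widening one 8 x i16 vector into two
 * 4 x i32 vectors; the register width is unchanged, so this takes the
 * lp_build_unpack path:
 *
 *    LLVMValueRef dst[2];
 *
 *    lp_build_resize(builder, type16, type32, &src, 1, dst, 2);
 */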