/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for packing/unpacking.
 *
 * Packing/unpacking is necessary for conversion between types of different
 * bit width.
 *
 * They are also commonly used when a computation needs higher
 * precision for the intermediate values. For example, if one needs the
 * function:
 *
 *   c = compute(a, b);
 *
 * to use more precision for intermediate results, then one should implement
 * it as:
 *
 *   LLVMValueRef
 *   compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
 *   {
 *      struct lp_type wide_type = lp_wider_type(type);
 *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *      lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
 *      lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
 *
 *      cl = compute_half(al, bl);
 *      ch = compute_half(ah, bh);
 *
 *      c = lp_build_pack2(builder, wide_type, type, cl, ch);
 *
 *      return c;
 *   }
 *
 * where compute_half() would do the computation for half the elements with
 * twice the precision.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"


/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
   }

   return LLVMConstVector(elems, n);
}
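

/*
 * Worked example for the shuffle above (illustrative, not from the original
 * source): for n = 4 and lo_hi = 0 the loop yields the mask {0, 4, 1, 5},
 * interleaving the low halves of the two source vectors (PUNPCKLDQ on
 * <4 x i32>); lo_hi = 1 yields {2, 6, 3, 7}, interleaving the high halves
 * (PUNPCKHDQ).
 */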


/**
 * Build shuffle vectors that match PACKxx instructions.
 */
static LLVMValueRef
lp_build_const_pack_shuffle(unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   for(i = 0; i < n; ++i)
      elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);

   return LLVMConstVector(elems, n);
}
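

/*
 * Worked example (illustrative): for n = 8 this builds the mask
 * {0, 2, 4, 6, 8, 10, 12, 14}. In lp_build_pack2 below the two operands are
 * first bitcast to the narrower element type, so on a little-endian target
 * this mask selects the low half of every wide element from both vectors.
 */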


/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions.
 */
LLVMValueRef
lp_build_interleave2(LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   shuffle = lp_build_const_unpack_shuffle(type.length, lo_hi);

   return LLVMBuildShuffleVector(builder, a, b, shuffle, "");
}
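

/*
 * Usage sketch (illustrative): with type.length = 4, a = {a0, a1, a2, a3}
 * and b = {b0, b1, b2, b3},
 *
 *    lp_build_interleave2(builder, type, a, b, 0);  // {a0, b0, a1, b1}
 *    lp_build_interleave2(builder, type, a, b, 1);  // {a2, b2, a3, b3}
 */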


/**
 * Double the bit width.
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack2(LLVMBuilderRef builder,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(src_type, src_type.width - 1), "");
   }
   else {
      /* Most significant bits always zero */
      msb = lp_build_zero(src_type);
   }

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1);
#else
   *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}
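

/*
 * Usage sketch for lp_build_unpack2 (illustrative; variable names are
 * hypothetical): widening a vector of 8 signed 16-bit values into two
 * vectors of 4 signed 32-bit values, with the sign bit replicated into the
 * new high bits.
 *
 *    struct lp_type wide_type = lp_wider_type(type);   // 16 -> 32 bits
 *    LLVMValueRef lo, hi;
 *
 *    lp_build_unpack2(builder, type, wide_type, src, &lo, &hi);
 */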


/**
 * Expand the bit width.
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack(LLVMBuilderRef builder,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width *= 2;
      tmp_type.length /= 2;

      for(i = num_tmps; i--; ) {
         lp_build_unpack2(builder, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
      }

      src_type = tmp_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}
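

/*
 * For example (illustrative), unpacking one vector of 16 x i8 into four
 * vectors of 4 x i32 takes two doubling steps:
 *
 *    16 x i8  ->  2 x (8 x i16)  ->  4 x (4 x i32)
 *
 * i.e. num_dsts must be 4, and the register width (128 bits here) is
 * preserved at every step.
 */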


/**
 * Non-interleaved pack.
 *
 * This will move values as
 *
 *   lo =  __ l0 __ l1 __ l2 __ .. __ ln
 *   hi =  __ h0 __ h1 __ h2 __ .. __ hn
 *   res = l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead when clamping cannot be guaranteed.
 */
LLVMValueRef
lp_build_pack2(LLVMBuilderRef builder,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
#if HAVE_LLVM < 0x0207
   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
#endif
   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
      switch(src_type.width) {
      case 32:
         if(dst_type.sign) {
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
#endif
         }
         else {
            if (util_cpu_caps.has_sse4_1) {
               return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
            }
            else {
               /* use generic shuffle below */
               res = NULL;
            }
         }
         break;

      case 16:
         if(dst_type.sign)
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
#endif
         else
#if HAVE_LLVM >= 0x0207
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", dst_vec_type, lo, hi);
#else
            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
#endif
         break;

      default:
         assert(0);
         return LLVMGetUndef(dst_vec_type);
      }

      if (res) {
         res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
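

/*
 * Usage sketch (illustrative; assumes the inputs were already clamped):
 * packing two vectors of 4 x i32 into one vector of 8 x i16. On an SSE2
 * capable CPU with a signed dst_type this maps to a single PACKSSDW;
 * otherwise the generic shuffle path above is taken.
 *
 *    LLVMValueRef res;
 *
 *    res = lp_build_pack2(builder, src_type, dst_type, lo, hi);
 */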


/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2 but will saturate values so that they fit into the
 * destination type.
 */
LLVMValueRef
lp_build_packs2(LLVMBuilderRef builder,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
      src_type.width * src_type.length == 128 &&
      src_type.sign)
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(src_type, ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, builder, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about lower bound? */
   }

   return lp_build_pack2(builder, src_type, dst_type, lo, hi);
}
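

/*
 * Worked example of the clamping above (illustrative): when packing i32 to
 * unsigned i16, dst_bits is 16 and dst_max is 0xffff; when packing to
 * signed i16, dst_bits is 15 and dst_max is 0x7fff. On SSE2 the signed
 * case skips the clamp entirely because PACKSSDW already saturates.
 */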


/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
 */
LLVMValueRef
lp_build_pack(LLVMBuilderRef builder,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef (*pack2)(LLVMBuilderRef builder,
                         struct lp_type src_type,
                         struct lp_type dst_type,
                         LLVMValueRef lo,
                         LLVMValueRef hi);
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;


   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length);

   if(clamped)
      pack2 = &lp_build_pack2;
   else
      pack2 = &lp_build_packs2;

   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width /= 2;
      tmp_type.length *= 2;

      /* Take the sign changes into consideration only in the last step */
      if(tmp_type.width == dst_type.width)
         tmp_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = pack2(builder, src_type, tmp_type, tmp[2*i + 0], tmp[2*i + 1]);

      src_type = tmp_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}
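

/*
 * For example (illustrative), truncating four vectors of 4 x i32 into a
 * single vector of 16 x i8 takes two halving steps:
 *
 *    4 x (4 x i32)  ->  2 x (8 x i16)  ->  16 x i8
 *
 * with dst_type.sign only being honored in the final step, as noted above.
 */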


/**
 * Truncate or expand the bit width.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
void
lp_build_resize(LLVMBuilderRef builder,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
   assert(num_srcs == 1 || num_dsts == 1);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */

         tmp[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         /*
          * Do it element-wise.
          */

         assert(src_type.length == dst_type.length);
         tmp[0] = lp_build_undef(dst_type);
         for (i = 0; i < dst_type.length; ++i) {
            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
            val = LLVMBuildTrunc(builder, val, lp_build_elem_type(dst_type), "");
            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(builder, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */

         assert(src_type.length == dst_type.length);
         tmp[0] = lp_build_undef(dst_type);
         for (i = 0; i < dst_type.length; ++i) {
            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(dst_type), "");
            }
            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
         }
      }
   }
   else {
      /*
       * No-op.
       */

      assert(num_srcs == 1);
      assert(num_dsts == 1);

      tmp[0] = src[0];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}
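

/*
 * Usage sketch (illustrative; assumes the lp_type_uint_vec() helper from
 * lp_bld_type.h and a 128-bit native vector width): widening a single
 * vector of 16 x u8 into four vectors of 4 x u32.
 *
 *    struct lp_type src_type = lp_type_uint_vec(8);    // 16 x u8
 *    struct lp_type dst_type = lp_type_uint_vec(32);   //  4 x u32
 *    LLVMValueRef dst[4];
 *
 *    lp_build_resize(builder, src_type, dst_type, &src, 1, dst, 4);
 */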