Merge branch 'vulkan' into 'vulkan'
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_pack.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for packing/unpacking.
32 *
33 * Pack/unpacking is necessary for conversion between types of different
34 * bit width.
35 *
36 * They are also commonly used when a computation needs higher
37 * precision for the intermediate values. For example, if one needs the
38 * function:
39 *
40 * c = compute(a, b);
41 *
42 * to use more precision for intermediate results then one should implement it
43 * as:
44 *
45 * LLVMValueRef
46 * compute(struct gallivm_state *gallivm, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
47 * {
48 * struct lp_type wide_type = lp_wider_type(type);
49 * LLVMValueRef al, ah, bl, bh, cl, ch, c;
50 *
51 * lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
52 * lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
53 *
54 * cl = compute_half(al, bl);
55 * ch = compute_half(ah, bh);
56 *
57 * c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
58 *
59 * return c;
60 * }
61 *
62 * where compute_half() would do the computation for half the elements with
63 * twice the precision.
64 *
65 * @author Jose Fonseca <jfonseca@vmware.com>
66 */
67
68
69 #include "util/u_debug.h"
70 #include "util/u_math.h"
71 #include "util/u_cpu_detect.h"
72 #include "util/u_memory.h"
73
74 #include "lp_bld_type.h"
75 #include "lp_bld_const.h"
76 #include "lp_bld_init.h"
77 #include "lp_bld_intr.h"
78 #include "lp_bld_arit.h"
79 #include "lp_bld_pack.h"
80 #include "lp_bld_swizzle.h"
81
82
83 /**
84 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
85 */
86 static LLVMValueRef
87 lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
88 unsigned n, unsigned lo_hi)
89 {
90 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
91 unsigned i, j;
92
93 assert(n <= LP_MAX_VECTOR_LENGTH);
94 assert(lo_hi < 2);
95
96 /* TODO: cache results in a static table */
97
98 for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
99 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
100 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
101 }
102
103 return LLVMConstVector(elems, n);
104 }
105
106 /**
107 * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
108 * See comment above lp_build_interleave2_half for more details.
109 */
110 static LLVMValueRef
111 lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
112 unsigned n, unsigned lo_hi)
113 {
114 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
115 unsigned i, j;
116
117 assert(n <= LP_MAX_VECTOR_LENGTH);
118 assert(lo_hi < 2);
119
120 for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
121 if (i == (n / 2))
122 j += n / 4;
123
124 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
125 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
126 }
127
128 return LLVMConstVector(elems, n);
129 }
130
131 /**
132 * Build shuffle vectors that match PACKxx (SSE) instructions or
133 * VPERM (Altivec).
134 */
135 static LLVMValueRef
136 lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
137 {
138 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
139 unsigned i;
140
141 assert(n <= LP_MAX_VECTOR_LENGTH);
142
143 for(i = 0; i < n; ++i)
144 #ifdef PIPE_ARCH_LITTLE_ENDIAN
145 elems[i] = lp_build_const_int32(gallivm, 2*i);
146 #else
147 elems[i] = lp_build_const_int32(gallivm, 2*i+1);
148 #endif
149
150 return LLVMConstVector(elems, n);
151 }
152
153 /**
154 * Return a vector with elements src[start:start+size]
155 * Most useful for getting half the values out of a 256bit sized vector,
156 * otherwise may cause data rearrangement to happen.
157 */
158 LLVMValueRef
159 lp_build_extract_range(struct gallivm_state *gallivm,
160 LLVMValueRef src,
161 unsigned start,
162 unsigned size)
163 {
164 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
165 unsigned i;
166
167 assert(size <= Elements(elems));
168
169 for (i = 0; i < size; ++i)
170 elems[i] = lp_build_const_int32(gallivm, i + start);
171
172 if (size == 1) {
173 return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
174 }
175 else {
176 return LLVMBuildShuffleVector(gallivm->builder, src, src,
177 LLVMConstVector(elems, size), "");
178 }
179 }
180
181 /**
182 * Concatenates several (must be a power of 2) vectors (of same type)
183 * into a larger one.
184 * Most useful for building up a 256bit sized vector out of two 128bit ones.
185 */
186 LLVMValueRef
187 lp_build_concat(struct gallivm_state *gallivm,
188 LLVMValueRef src[],
189 struct lp_type src_type,
190 unsigned num_vectors)
191 {
192 unsigned new_length, i;
193 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
194 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
195
196 assert(src_type.length * num_vectors <= Elements(shuffles));
197 assert(util_is_power_of_two(num_vectors));
198
199 new_length = src_type.length;
200
201 for (i = 0; i < num_vectors; i++)
202 tmp[i] = src[i];
203
204 while (num_vectors > 1) {
205 num_vectors >>= 1;
206 new_length <<= 1;
207 for (i = 0; i < new_length; i++) {
208 shuffles[i] = lp_build_const_int32(gallivm, i);
209 }
210 for (i = 0; i < num_vectors; i++) {
211 tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
212 LLVMConstVector(shuffles, new_length), "");
213 }
214 }
215
216 return tmp[0];
217 }
218
219
220 /**
221 * Combines vectors to reduce from num_srcs to num_dsts.
222 * Returns the number of src vectors concatenated in a single dst.
223 *
224 * num_srcs must be exactly divisible by num_dsts.
225 *
226 * e.g. For num_srcs = 4 and src = [x, y, z, w]
227 * num_dsts = 1 dst = [xyzw] return = 4
228 * num_dsts = 2 dst = [xy, zw] return = 2
229 */
230 int
231 lp_build_concat_n(struct gallivm_state *gallivm,
232 struct lp_type src_type,
233 LLVMValueRef *src,
234 unsigned num_srcs,
235 LLVMValueRef *dst,
236 unsigned num_dsts)
237 {
238 int size = num_srcs / num_dsts;
239 int i;
240
241 assert(num_srcs >= num_dsts);
242 assert((num_srcs % size) == 0);
243
244 if (num_srcs == num_dsts) {
245 for (i = 0; i < num_dsts; ++i) {
246 dst[i] = src[i];
247 }
248 return 1;
249 }
250
251 for (i = 0; i < num_dsts; ++i) {
252 dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
253 }
254
255 return size;
256 }
257
258
259 /**
260 * Interleave vector elements.
261 *
262 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
263 * (but not for 256bit AVX vectors).
264 */
265 LLVMValueRef
266 lp_build_interleave2(struct gallivm_state *gallivm,
267 struct lp_type type,
268 LLVMValueRef a,
269 LLVMValueRef b,
270 unsigned lo_hi)
271 {
272 LLVMValueRef shuffle;
273
274 if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
275 /*
276 * XXX: This is a workaround for llvm code generation deficiency. Strangely
277 * enough, while this needs vinsertf128/vextractf128 instructions (hence
278 * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
279 * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
280 * So use some different shuffles instead (the exact shuffles don't seem to
281 * matter, as long as not using 128bit wide vectors, works with 8x32 or 4x64).
282 */
283 struct lp_type tmp_type = type;
284 LLVMValueRef srchalf[2], tmpdst;
285 tmp_type.length = 4;
286 tmp_type.width = 64;
287 a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
288 b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
289 srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
290 srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
291 tmp_type.length = 2;
292 tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
293 return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
294 }
295
296 shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);
297
298 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
299 }
300
301 /**
302 * Interleave vector elements but with 256 bit,
303 * treats it as interleave with 2 concatenated 128 bit vectors.
304 *
305 * This differs to lp_build_interleave2 as that function would do the following (for lo):
306 * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction.
307 *
308 *
309 * An example interleave 8x float with 8x float on AVX 256bit unpack:
310 * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
311 *
312 * Equivalent to interleaving 2x 128 bit vectors
313 * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
314 *
315 * So interleave-lo would result in:
316 * a0 b0 a1 b1 a4 b4 a5 b5
317 *
318 * And interleave-hi would result in:
319 * a2 b2 a3 b3 a6 b6 a7 b7
320 */
321 LLVMValueRef
322 lp_build_interleave2_half(struct gallivm_state *gallivm,
323 struct lp_type type,
324 LLVMValueRef a,
325 LLVMValueRef b,
326 unsigned lo_hi)
327 {
328 if (type.length * type.width == 256) {
329 LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
330 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
331 } else {
332 return lp_build_interleave2(gallivm, type, a, b, lo_hi);
333 }
334 }
335
336 /**
337 * Double the bit width.
338 *
339 * This will only change the number of bits the values are represented, not the
340 * values themselves.
341 */
342 void
343 lp_build_unpack2(struct gallivm_state *gallivm,
344 struct lp_type src_type,
345 struct lp_type dst_type,
346 LLVMValueRef src,
347 LLVMValueRef *dst_lo,
348 LLVMValueRef *dst_hi)
349 {
350 LLVMBuilderRef builder = gallivm->builder;
351 LLVMValueRef msb;
352 LLVMTypeRef dst_vec_type;
353
354 assert(!src_type.floating);
355 assert(!dst_type.floating);
356 assert(dst_type.width == src_type.width * 2);
357 assert(dst_type.length * 2 == src_type.length);
358
359 if(dst_type.sign && src_type.sign) {
360 /* Replicate the sign bit in the most significant bits */
361 msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
362 }
363 else
364 /* Most significant bits always zero */
365 msb = lp_build_zero(gallivm, src_type);
366
367 /* Interleave bits */
368 #ifdef PIPE_ARCH_LITTLE_ENDIAN
369 *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
370 *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
371 #else
372 *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
373 *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
374 #endif
375
376 /* Cast the result into the new type (twice as wide) */
377
378 dst_vec_type = lp_build_vec_type(gallivm, dst_type);
379
380 *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
381 *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
382 }
383
384
385 /**
386 * Expand the bit width.
387 *
388 * This will only change the number of bits the values are represented, not the
389 * values themselves.
390 */
391 void
392 lp_build_unpack(struct gallivm_state *gallivm,
393 struct lp_type src_type,
394 struct lp_type dst_type,
395 LLVMValueRef src,
396 LLVMValueRef *dst, unsigned num_dsts)
397 {
398 unsigned num_tmps;
399 unsigned i;
400
401 /* Register width must remain constant */
402 assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
403
404 /* We must not loose or gain channels. Only precision */
405 assert(src_type.length == dst_type.length * num_dsts);
406
407 num_tmps = 1;
408 dst[0] = src;
409
410 while(src_type.width < dst_type.width) {
411 struct lp_type tmp_type = src_type;
412
413 tmp_type.width *= 2;
414 tmp_type.length /= 2;
415
416 for(i = num_tmps; i--; ) {
417 lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
418 }
419
420 src_type = tmp_type;
421
422 num_tmps *= 2;
423 }
424
425 assert(num_tmps == num_dsts);
426 }
427
428
429 /**
430 * Non-interleaved pack.
431 *
432 * This will move values as
433 * (LSB) (MSB)
434 * lo = l0 __ l1 __ l2 __.. __ ln __
435 * hi = h0 __ h1 __ h2 __.. __ hn __
436 * res = l0 l1 l2 .. ln h0 h1 h2 .. hn
437 *
438 * This will only change the number of bits the values are represented, not the
439 * values themselves.
440 *
441 * It is assumed the values are already clamped into the destination type range.
442 * Values outside that range will produce undefined results. Use
443 * lp_build_packs2 instead.
444 */
445 LLVMValueRef
446 lp_build_pack2(struct gallivm_state *gallivm,
447 struct lp_type src_type,
448 struct lp_type dst_type,
449 LLVMValueRef lo,
450 LLVMValueRef hi)
451 {
452 LLVMBuilderRef builder = gallivm->builder;
453 LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
454 LLVMValueRef shuffle;
455 LLVMValueRef res = NULL;
456 struct lp_type intr_type = dst_type;
457
458 assert(!src_type.floating);
459 assert(!dst_type.floating);
460 assert(src_type.width == dst_type.width * 2);
461 assert(src_type.length * 2 == dst_type.length);
462
463 /* Check for special cases first */
464 if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
465 src_type.width * src_type.length >= 128) {
466 const char *intrinsic = NULL;
467 boolean swap_intrinsic_operands = FALSE;
468
469 switch(src_type.width) {
470 case 32:
471 if (util_cpu_caps.has_sse2) {
472 if (dst_type.sign) {
473 intrinsic = "llvm.x86.sse2.packssdw.128";
474 } else {
475 if (util_cpu_caps.has_sse4_1) {
476 intrinsic = "llvm.x86.sse41.packusdw";
477 }
478 }
479 } else if (util_cpu_caps.has_altivec) {
480 if (dst_type.sign) {
481 intrinsic = "llvm.ppc.altivec.vpkswss";
482 } else {
483 intrinsic = "llvm.ppc.altivec.vpkuwus";
484 }
485 #ifdef PIPE_ARCH_LITTLE_ENDIAN
486 swap_intrinsic_operands = TRUE;
487 #endif
488 }
489 break;
490 case 16:
491 if (dst_type.sign) {
492 if (util_cpu_caps.has_sse2) {
493 intrinsic = "llvm.x86.sse2.packsswb.128";
494 } else if (util_cpu_caps.has_altivec) {
495 intrinsic = "llvm.ppc.altivec.vpkshss";
496 #ifdef PIPE_ARCH_LITTLE_ENDIAN
497 swap_intrinsic_operands = TRUE;
498 #endif
499 }
500 } else {
501 if (util_cpu_caps.has_sse2) {
502 intrinsic = "llvm.x86.sse2.packuswb.128";
503 } else if (util_cpu_caps.has_altivec) {
504 intrinsic = "llvm.ppc.altivec.vpkshus";
505 #ifdef PIPE_ARCH_LITTLE_ENDIAN
506 swap_intrinsic_operands = TRUE;
507 #endif
508 }
509 }
510 break;
511 /* default uses generic shuffle below */
512 }
513 if (intrinsic) {
514 if (src_type.width * src_type.length == 128) {
515 LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
516 if (swap_intrinsic_operands) {
517 res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
518 } else {
519 res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
520 }
521 if (dst_vec_type != intr_vec_type) {
522 res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
523 }
524 }
525 else {
526 int num_split = src_type.width * src_type.length / 128;
527 int i;
528 int nlen = 128 / src_type.width;
529 int lo_off = swap_intrinsic_operands ? nlen : 0;
530 int hi_off = swap_intrinsic_operands ? 0 : nlen;
531 struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
532 struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
533 LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
534 LLVMValueRef tmplo, tmphi;
535 LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
536 LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
537
538 assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
539
540 for (i = 0; i < num_split / 2; i++) {
541 tmplo = lp_build_extract_range(gallivm,
542 lo, i*nlen*2 + lo_off, nlen);
543 tmphi = lp_build_extract_range(gallivm,
544 lo, i*nlen*2 + hi_off, nlen);
545 tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
546 nintr_vec_type, tmplo, tmphi);
547 if (ndst_vec_type != nintr_vec_type) {
548 tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
549 }
550 }
551 for (i = 0; i < num_split / 2; i++) {
552 tmplo = lp_build_extract_range(gallivm,
553 hi, i*nlen*2 + lo_off, nlen);
554 tmphi = lp_build_extract_range(gallivm,
555 hi, i*nlen*2 + hi_off, nlen);
556 tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
557 nintr_vec_type,
558 tmplo, tmphi);
559 if (ndst_vec_type != nintr_vec_type) {
560 tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
561 ndst_vec_type, "");
562 }
563 }
564 res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
565 }
566 return res;
567 }
568 }
569
570 /* generic shuffle */
571 lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
572 hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
573
574 shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);
575
576 res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
577
578 return res;
579 }
580
581
582
583 /**
584 * Non-interleaved pack and saturate.
585 *
586 * Same as lp_build_pack2 but will saturate values so that they fit into the
587 * destination type.
588 */
589 LLVMValueRef
590 lp_build_packs2(struct gallivm_state *gallivm,
591 struct lp_type src_type,
592 struct lp_type dst_type,
593 LLVMValueRef lo,
594 LLVMValueRef hi)
595 {
596 boolean clamp;
597
598 assert(!src_type.floating);
599 assert(!dst_type.floating);
600 assert(src_type.sign == dst_type.sign);
601 assert(src_type.width == dst_type.width * 2);
602 assert(src_type.length * 2 == dst_type.length);
603
604 clamp = TRUE;
605
606 /* All X86 SSE non-interleaved pack instructions take signed inputs and
607 * saturate them, so no need to clamp for those cases. */
608 if(util_cpu_caps.has_sse2 &&
609 src_type.width * src_type.length >= 128 &&
610 src_type.sign &&
611 (src_type.width == 32 || src_type.width == 16))
612 clamp = FALSE;
613
614 if(clamp) {
615 struct lp_build_context bld;
616 unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
617 LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
618 lp_build_context_init(&bld, gallivm, src_type);
619 lo = lp_build_min(&bld, lo, dst_max);
620 hi = lp_build_min(&bld, hi, dst_max);
621 /* FIXME: What about lower bound? */
622 }
623
624 return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
625 }
626
627
628 /**
629 * Truncate the bit width.
630 *
631 * TODO: Handle saturation consistently.
632 */
633 LLVMValueRef
634 lp_build_pack(struct gallivm_state *gallivm,
635 struct lp_type src_type,
636 struct lp_type dst_type,
637 boolean clamped,
638 const LLVMValueRef *src, unsigned num_srcs)
639 {
640 LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
641 struct lp_type src_type,
642 struct lp_type dst_type,
643 LLVMValueRef lo,
644 LLVMValueRef hi);
645 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
646 unsigned i;
647
648 /* Register width must remain constant */
649 assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
650
651 /* We must not loose or gain channels. Only precision */
652 assert(src_type.length * num_srcs == dst_type.length);
653
654 if(clamped)
655 pack2 = &lp_build_pack2;
656 else
657 pack2 = &lp_build_packs2;
658
659 for(i = 0; i < num_srcs; ++i)
660 tmp[i] = src[i];
661
662 while(src_type.width > dst_type.width) {
663 struct lp_type tmp_type = src_type;
664
665 tmp_type.width /= 2;
666 tmp_type.length *= 2;
667
668 /* Take in consideration the sign changes only in the last step */
669 if(tmp_type.width == dst_type.width)
670 tmp_type.sign = dst_type.sign;
671
672 num_srcs /= 2;
673
674 for(i = 0; i < num_srcs; ++i)
675 tmp[i] = pack2(gallivm, src_type, tmp_type,
676 tmp[2*i + 0], tmp[2*i + 1]);
677
678 src_type = tmp_type;
679 }
680
681 assert(num_srcs == 1);
682
683 return tmp[0];
684 }
685
686
687 /**
688 * Truncate or expand the bitwidth.
689 *
690 * NOTE: Getting the right sign flags is crucial here, as we employ some
691 * intrinsics that do saturation.
692 */
693 void
694 lp_build_resize(struct gallivm_state *gallivm,
695 struct lp_type src_type,
696 struct lp_type dst_type,
697 const LLVMValueRef *src, unsigned num_srcs,
698 LLVMValueRef *dst, unsigned num_dsts)
699 {
700 LLVMBuilderRef builder = gallivm->builder;
701 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
702 unsigned i;
703
704 /*
705 * We don't support float <-> int conversion here. That must be done
706 * before/after calling this function.
707 */
708 assert(src_type.floating == dst_type.floating);
709
710 /*
711 * We don't support double <-> float conversion yet, although it could be
712 * added with little effort.
713 */
714 assert((!src_type.floating && !dst_type.floating) ||
715 src_type.width == dst_type.width);
716
717 /* We must not loose or gain channels. Only precision */
718 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
719
720 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
721 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
722 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
723 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
724
725 if (src_type.width > dst_type.width) {
726 /*
727 * Truncate bit width.
728 */
729
730 /* Conversion must be M:1 */
731 assert(num_dsts == 1);
732
733 if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
734 /*
735 * Register width remains constant -- use vector packing intrinsics
736 */
737 tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
738 }
739 else {
740 if (src_type.width / dst_type.width > num_srcs) {
741 /*
742 * First change src vectors size (with shuffle) so they have the
743 * same size as the destination vector, then pack normally.
744 * Note: cannot use cast/extract because llvm generates atrocious code.
745 */
746 unsigned size_ratio = (src_type.width * src_type.length) /
747 (dst_type.length * dst_type.width);
748 unsigned new_length = src_type.length / size_ratio;
749
750 for (i = 0; i < size_ratio * num_srcs; i++) {
751 unsigned start_index = (i % size_ratio) * new_length;
752 tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
753 start_index, new_length);
754 }
755 num_srcs *= size_ratio;
756 src_type.length = new_length;
757 tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
758 }
759 else {
760 /*
761 * Truncate bit width but expand vector size - first pack
762 * then expand simply because this should be more AVX-friendly
763 * for the cases we probably hit.
764 */
765 unsigned size_ratio = (dst_type.width * dst_type.length) /
766 (src_type.length * src_type.width);
767 unsigned num_pack_srcs = num_srcs / size_ratio;
768 dst_type.length = dst_type.length / size_ratio;
769
770 for (i = 0; i < size_ratio; i++) {
771 tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
772 &src[i*num_pack_srcs], num_pack_srcs);
773 }
774 tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
775 }
776 }
777 }
778 else if (src_type.width < dst_type.width) {
779 /*
780 * Expand bit width.
781 */
782
783 /* Conversion must be 1:N */
784 assert(num_srcs == 1);
785
786 if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
787 /*
788 * Register width remains constant -- use vector unpack intrinsics
789 */
790 lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
791 }
792 else {
793 /*
794 * Do it element-wise.
795 */
796 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
797
798 for (i = 0; i < num_dsts; i++) {
799 tmp[i] = lp_build_undef(gallivm, dst_type);
800 }
801
802 for (i = 0; i < src_type.length; ++i) {
803 unsigned j = i / dst_type.length;
804 LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
805 LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
806 LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
807
808 if (src_type.sign && dst_type.sign) {
809 val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
810 } else {
811 val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
812 }
813 tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
814 }
815 }
816 }
817 else {
818 /*
819 * No-op
820 */
821
822 /* "Conversion" must be N:N */
823 assert(num_srcs == num_dsts);
824
825 for(i = 0; i < num_dsts; ++i)
826 tmp[i] = src[i];
827 }
828
829 for(i = 0; i < num_dsts; ++i)
830 dst[i] = tmp[i];
831 }
832
833
834 /**
835 * Expands src vector from src.length to dst_length
836 */
837 LLVMValueRef
838 lp_build_pad_vector(struct gallivm_state *gallivm,
839 LLVMValueRef src,
840 unsigned dst_length)
841 {
842 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
843 LLVMValueRef undef;
844 LLVMTypeRef type;
845 unsigned i, src_length;
846
847 type = LLVMTypeOf(src);
848
849 if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
850 /* Can't use ShuffleVector on non-vector type */
851 undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
852 return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
853 }
854
855 undef = LLVMGetUndef(type);
856 src_length = LLVMGetVectorSize(type);
857
858 assert(dst_length <= Elements(elems));
859 assert(dst_length >= src_length);
860
861 if (src_length == dst_length)
862 return src;
863
864 /* All elements from src vector */
865 for (i = 0; i < src_length; ++i)
866 elems[i] = lp_build_const_int32(gallivm, i);
867
868 /* Undef fill remaining space */
869 for (i = src_length; i < dst_length; ++i)
870 elems[i] = lp_build_const_int32(gallivm, src_length);
871
872 /* Combine the two vectors */
873 return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
874 }