gallivm: fix no-op n:n lp_build_resize()
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_pack.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for packing/unpacking.
32 *
33 * Pack/unpacking is necessary for conversion between types of different
34 * bit width.
35 *
36 * They are also commonly used when a computation needs higher
37 * precision for the intermediate values. For example, if one needs the
38 * function:
39 *
40 * c = compute(a, b);
41 *
42 * to use more precision for intermediate results then one should implement it
43 * as:
44 *
45 * LLVMValueRef
46 * compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
47 * {
48 * struct lp_type wide_type = lp_wider_type(type);
49 * LLVMValueRef al, ah, bl, bh, cl, ch, c;
50 *
51 * lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
52 * lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
53 *
54 * cl = compute_half(al, bl);
55 * ch = compute_half(ah, bh);
56 *
57 * c = lp_build_pack2(builder, wide_type, type, cl, ch);
58 *
59 * return c;
60 * }
61 *
62 * where compute_half() would do the computation for half the elements with
63 * twice the precision.
64 *
65 * @author Jose Fonseca <jfonseca@vmware.com>
66 */
67
68
69 #include "util/u_debug.h"
70 #include "util/u_math.h"
71 #include "util/u_cpu_detect.h"
72 #include "util/u_memory.h"
73
74 #include "lp_bld_type.h"
75 #include "lp_bld_const.h"
76 #include "lp_bld_init.h"
77 #include "lp_bld_intr.h"
78 #include "lp_bld_arit.h"
79 #include "lp_bld_pack.h"
80 #include "lp_bld_swizzle.h"
81
82
83 /**
84 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
85 */
86 static LLVMValueRef
87 lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
88 unsigned n, unsigned lo_hi)
89 {
90 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
91 unsigned i, j;
92
93 assert(n <= LP_MAX_VECTOR_LENGTH);
94 assert(lo_hi < 2);
95
96 /* TODO: cache results in a static table */
97
98 for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
99 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
100 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
101 }
102
103 return LLVMConstVector(elems, n);
104 }
105
106 /**
107 * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
108 * See comment above lp_build_interleave2_half for more details.
109 */
110 static LLVMValueRef
111 lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
112 unsigned n, unsigned lo_hi)
113 {
114 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
115 unsigned i, j;
116
117 assert(n <= LP_MAX_VECTOR_LENGTH);
118 assert(lo_hi < 2);
119
120 for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
121 if (i == (n / 2))
122 j += n / 4;
123
124 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
125 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
126 }
127
128 return LLVMConstVector(elems, n);
129 }
130
131 /**
132 * Build shuffle vectors that match PACKxx (SSE) instructions or
133 * VPERM (Altivec).
134 */
135 static LLVMValueRef
136 lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
137 {
138 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
139 unsigned i;
140
141 assert(n <= LP_MAX_VECTOR_LENGTH);
142
143 for(i = 0; i < n; ++i)
144 #ifdef PIPE_ARCH_LITTLE_ENDIAN
145 elems[i] = lp_build_const_int32(gallivm, 2*i);
146 #else
147 elems[i] = lp_build_const_int32(gallivm, 2*i+1);
148 #endif
149
150 return LLVMConstVector(elems, n);
151 }
152
153 /**
154 * Return a vector with elements src[start:start+size]
155 * Most useful for getting half the values out of a 256bit sized vector,
156 * otherwise may cause data rearrangement to happen.
157 */
158 LLVMValueRef
159 lp_build_extract_range(struct gallivm_state *gallivm,
160 LLVMValueRef src,
161 unsigned start,
162 unsigned size)
163 {
164 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
165 unsigned i;
166
167 assert(size <= Elements(elems));
168
169 for (i = 0; i < size; ++i)
170 elems[i] = lp_build_const_int32(gallivm, i + start);
171
172 if (size == 1) {
173 return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
174 }
175 else {
176 return LLVMBuildShuffleVector(gallivm->builder, src, src,
177 LLVMConstVector(elems, size), "");
178 }
179 }
180
181 /**
182 * Concatenates several (must be a power of 2) vectors (of same type)
183 * into a larger one.
184 * Most useful for building up a 256bit sized vector out of two 128bit ones.
185 */
186 LLVMValueRef
187 lp_build_concat(struct gallivm_state *gallivm,
188 LLVMValueRef src[],
189 struct lp_type src_type,
190 unsigned num_vectors)
191 {
192 unsigned new_length, i;
193 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
194 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
195
196 assert(src_type.length * num_vectors <= Elements(shuffles));
197 assert(util_is_power_of_two(num_vectors));
198
199 new_length = src_type.length;
200
201 for (i = 0; i < num_vectors; i++)
202 tmp[i] = src[i];
203
204 while (num_vectors > 1) {
205 num_vectors >>= 1;
206 new_length <<= 1;
207 for (i = 0; i < new_length; i++) {
208 shuffles[i] = lp_build_const_int32(gallivm, i);
209 }
210 for (i = 0; i < num_vectors; i++) {
211 tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
212 LLVMConstVector(shuffles, new_length), "");
213 }
214 }
215
216 return tmp[0];
217 }
218
219
220 /**
221 * Combines vectors to reduce from num_srcs to num_dsts.
222 * Returns the number of src vectors concatenated in a single dst.
223 *
224 * num_srcs must be exactly divisible by num_dsts.
225 *
226 * e.g. For num_srcs = 4 and src = [x, y, z, w]
227 * num_dsts = 1 dst = [xyzw] return = 4
228 * num_dsts = 2 dst = [xy, zw] return = 2
229 */
230 int
231 lp_build_concat_n(struct gallivm_state *gallivm,
232 struct lp_type src_type,
233 LLVMValueRef *src,
234 unsigned num_srcs,
235 LLVMValueRef *dst,
236 unsigned num_dsts)
237 {
238 int size = num_srcs / num_dsts;
239 int i;
240
241 assert(num_srcs >= num_dsts);
242 assert((num_srcs % size) == 0);
243
244 if (num_srcs == num_dsts) {
245 for (i = 0; i < num_dsts; ++i) {
246 dst[i] = src[i];
247 }
248 return 1;
249 }
250
251 for (i = 0; i < num_dsts; ++i) {
252 dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
253 }
254
255 return size;
256 }
257
258
259 /**
260 * Interleave vector elements.
261 *
262 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
263 * (but not for 256bit AVX vectors).
264 */
265 LLVMValueRef
266 lp_build_interleave2(struct gallivm_state *gallivm,
267 struct lp_type type,
268 LLVMValueRef a,
269 LLVMValueRef b,
270 unsigned lo_hi)
271 {
272 LLVMValueRef shuffle;
273
274 if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
275 /*
276 * XXX: This is a workaround for llvm code generation deficiency. Strangely
277 * enough, while this needs vinsertf128/vextractf128 instructions (hence
278 * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
279 * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
280 * So use some different shuffles instead (the exact shuffles don't seem to
281 * matter, as long as not using 128bit wide vectors, works with 8x32 or 4x64).
282 */
283 struct lp_type tmp_type = type;
284 LLVMValueRef srchalf[2], tmpdst;
285 tmp_type.length = 4;
286 tmp_type.width = 64;
287 a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
288 b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
289 srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
290 srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
291 tmp_type.length = 2;
292 tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
293 return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
294 }
295
296 shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);
297
298 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
299 }
300
301 /**
302 * Interleave vector elements but with 256 bit,
303 * treats it as interleave with 2 concatenated 128 bit vectors.
304 *
305 * This differs to lp_build_interleave2 as that function would do the following (for lo):
306 * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction.
307 *
308 *
309 * An example interleave 8x float with 8x float on AVX 256bit unpack:
310 * a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
311 *
312 * Equivalent to interleaving 2x 128 bit vectors
313 * a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
314 *
315 * So interleave-lo would result in:
316 * a0 b0 a1 b1 a4 b4 a5 b5
317 *
318 * And interleave-hi would result in:
319 * a2 b2 a3 b3 a6 b6 a7 b7
320 */
321 LLVMValueRef
322 lp_build_interleave2_half(struct gallivm_state *gallivm,
323 struct lp_type type,
324 LLVMValueRef a,
325 LLVMValueRef b,
326 unsigned lo_hi)
327 {
328 if (type.length * type.width == 256) {
329 LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
330 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
331 } else {
332 return lp_build_interleave2(gallivm, type, a, b, lo_hi);
333 }
334 }
335
336 /**
337 * Double the bit width.
338 *
339 * This will only change the number of bits the values are represented, not the
340 * values themselves.
341 */
342 void
343 lp_build_unpack2(struct gallivm_state *gallivm,
344 struct lp_type src_type,
345 struct lp_type dst_type,
346 LLVMValueRef src,
347 LLVMValueRef *dst_lo,
348 LLVMValueRef *dst_hi)
349 {
350 LLVMBuilderRef builder = gallivm->builder;
351 LLVMValueRef msb;
352 LLVMTypeRef dst_vec_type;
353
354 assert(!src_type.floating);
355 assert(!dst_type.floating);
356 assert(dst_type.width == src_type.width * 2);
357 assert(dst_type.length * 2 == src_type.length);
358
359 if(dst_type.sign && src_type.sign) {
360 /* Replicate the sign bit in the most significant bits */
361 msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
362 }
363 else
364 /* Most significant bits always zero */
365 msb = lp_build_zero(gallivm, src_type);
366
367 /* Interleave bits */
368 #ifdef PIPE_ARCH_LITTLE_ENDIAN
369 *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
370 *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
371 #else
372 *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
373 *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
374 #endif
375
376 /* Cast the result into the new type (twice as wide) */
377
378 dst_vec_type = lp_build_vec_type(gallivm, dst_type);
379
380 *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
381 *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
382 }
383
384
385 /**
386 * Expand the bit width.
387 *
388 * This will only change the number of bits the values are represented, not the
389 * values themselves.
390 */
391 void
392 lp_build_unpack(struct gallivm_state *gallivm,
393 struct lp_type src_type,
394 struct lp_type dst_type,
395 LLVMValueRef src,
396 LLVMValueRef *dst, unsigned num_dsts)
397 {
398 unsigned num_tmps;
399 unsigned i;
400
401 /* Register width must remain constant */
402 assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
403
404 /* We must not loose or gain channels. Only precision */
405 assert(src_type.length == dst_type.length * num_dsts);
406
407 num_tmps = 1;
408 dst[0] = src;
409
410 while(src_type.width < dst_type.width) {
411 struct lp_type tmp_type = src_type;
412
413 tmp_type.width *= 2;
414 tmp_type.length /= 2;
415
416 for(i = num_tmps; i--; ) {
417 lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
418 }
419
420 src_type = tmp_type;
421
422 num_tmps *= 2;
423 }
424
425 assert(num_tmps == num_dsts);
426 }
427
428
429 /**
430 * Non-interleaved pack.
431 *
432 * This will move values as
433 * (LSB) (MSB)
434 * lo = l0 __ l1 __ l2 __.. __ ln __
435 * hi = h0 __ h1 __ h2 __.. __ hn __
436 * res = l0 l1 l2 .. ln h0 h1 h2 .. hn
437 *
438 * This will only change the number of bits the values are represented, not the
439 * values themselves.
440 *
441 * It is assumed the values are already clamped into the destination type range.
442 * Values outside that range will produce undefined results. Use
443 * lp_build_packs2 instead.
444 */
445 LLVMValueRef
446 lp_build_pack2(struct gallivm_state *gallivm,
447 struct lp_type src_type,
448 struct lp_type dst_type,
449 LLVMValueRef lo,
450 LLVMValueRef hi)
451 {
452 LLVMBuilderRef builder = gallivm->builder;
453 LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
454 LLVMValueRef shuffle;
455 LLVMValueRef res = NULL;
456 struct lp_type intr_type = dst_type;
457
458 #if HAVE_LLVM < 0x0207
459 intr_type = src_type;
460 #endif
461
462 assert(!src_type.floating);
463 assert(!dst_type.floating);
464 assert(src_type.width == dst_type.width * 2);
465 assert(src_type.length * 2 == dst_type.length);
466
467 /* Check for special cases first */
468 if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
469 src_type.width * src_type.length >= 128) {
470 const char *intrinsic = NULL;
471
472 switch(src_type.width) {
473 case 32:
474 if (util_cpu_caps.has_sse2) {
475 if(dst_type.sign) {
476 intrinsic = "llvm.x86.sse2.packssdw.128";
477 }
478 else {
479 if (util_cpu_caps.has_sse4_1) {
480 intrinsic = "llvm.x86.sse41.packusdw";
481 #if HAVE_LLVM < 0x0207
482 /* llvm < 2.7 has inconsistent signatures except for packusdw */
483 intr_type = dst_type;
484 #endif
485 }
486 }
487 } else if (util_cpu_caps.has_altivec) {
488 if (dst_type.sign) {
489 intrinsic = "llvm.ppc.altivec.vpkswus";
490 } else {
491 intrinsic = "llvm.ppc.altivec.vpkuwus";
492 }
493 }
494 break;
495 case 16:
496 if (dst_type.sign) {
497 if (util_cpu_caps.has_sse2) {
498 intrinsic = "llvm.x86.sse2.packsswb.128";
499 } else if (util_cpu_caps.has_altivec) {
500 intrinsic = "llvm.ppc.altivec.vpkshss";
501 }
502 } else {
503 if (util_cpu_caps.has_sse2) {
504 intrinsic = "llvm.x86.sse2.packuswb.128";
505 } else if (util_cpu_caps.has_altivec) {
506 intrinsic = "llvm.ppc.altivec.vpkshus";
507 }
508 }
509 break;
510 /* default uses generic shuffle below */
511 }
512 if (intrinsic) {
513 if (src_type.width * src_type.length == 128) {
514 LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
515 res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
516 if (dst_vec_type != intr_vec_type) {
517 res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
518 }
519 }
520 else {
521 int num_split = src_type.width * src_type.length / 128;
522 int i;
523 int nlen = 128 / src_type.width;
524 struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
525 struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
526 LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
527 LLVMValueRef tmplo, tmphi;
528 LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
529 LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
530
531 assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
532
533 for (i = 0; i < num_split / 2; i++) {
534 tmplo = lp_build_extract_range(gallivm,
535 lo, i*nlen*2, nlen);
536 tmphi = lp_build_extract_range(gallivm,
537 lo, i*nlen*2 + nlen, nlen);
538 tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
539 nintr_vec_type, tmplo, tmphi);
540 if (ndst_vec_type != nintr_vec_type) {
541 tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
542 }
543 }
544 for (i = 0; i < num_split / 2; i++) {
545 tmplo = lp_build_extract_range(gallivm,
546 hi, i*nlen*2, nlen);
547 tmphi = lp_build_extract_range(gallivm,
548 hi, i*nlen*2 + nlen, nlen);
549 tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
550 nintr_vec_type,
551 tmplo, tmphi);
552 if (ndst_vec_type != nintr_vec_type) {
553 tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
554 ndst_vec_type, "");
555 }
556 }
557 res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
558 }
559 return res;
560 }
561 }
562
563 /* generic shuffle */
564 lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
565 hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
566
567 shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);
568
569 res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
570
571 return res;
572 }
573
574
575
576 /**
577 * Non-interleaved pack and saturate.
578 *
579 * Same as lp_build_pack2 but will saturate values so that they fit into the
580 * destination type.
581 */
582 LLVMValueRef
583 lp_build_packs2(struct gallivm_state *gallivm,
584 struct lp_type src_type,
585 struct lp_type dst_type,
586 LLVMValueRef lo,
587 LLVMValueRef hi)
588 {
589 boolean clamp;
590
591 assert(!src_type.floating);
592 assert(!dst_type.floating);
593 assert(src_type.sign == dst_type.sign);
594 assert(src_type.width == dst_type.width * 2);
595 assert(src_type.length * 2 == dst_type.length);
596
597 clamp = TRUE;
598
599 /* All X86 SSE non-interleaved pack instructions take signed inputs and
600 * saturate them, so no need to clamp for those cases. */
601 if(util_cpu_caps.has_sse2 &&
602 src_type.width * src_type.length >= 128 &&
603 src_type.sign &&
604 (src_type.width == 32 || src_type.width == 16))
605 clamp = FALSE;
606
607 if(clamp) {
608 struct lp_build_context bld;
609 unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
610 LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
611 lp_build_context_init(&bld, gallivm, src_type);
612 lo = lp_build_min(&bld, lo, dst_max);
613 hi = lp_build_min(&bld, hi, dst_max);
614 /* FIXME: What about lower bound? */
615 }
616
617 return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
618 }
619
620
621 /**
622 * Truncate the bit width.
623 *
624 * TODO: Handle saturation consistently.
625 */
626 LLVMValueRef
627 lp_build_pack(struct gallivm_state *gallivm,
628 struct lp_type src_type,
629 struct lp_type dst_type,
630 boolean clamped,
631 const LLVMValueRef *src, unsigned num_srcs)
632 {
633 LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
634 struct lp_type src_type,
635 struct lp_type dst_type,
636 LLVMValueRef lo,
637 LLVMValueRef hi);
638 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
639 unsigned i;
640
641 /* Register width must remain constant */
642 assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
643
644 /* We must not loose or gain channels. Only precision */
645 assert(src_type.length * num_srcs == dst_type.length);
646
647 if(clamped)
648 pack2 = &lp_build_pack2;
649 else
650 pack2 = &lp_build_packs2;
651
652 for(i = 0; i < num_srcs; ++i)
653 tmp[i] = src[i];
654
655 while(src_type.width > dst_type.width) {
656 struct lp_type tmp_type = src_type;
657
658 tmp_type.width /= 2;
659 tmp_type.length *= 2;
660
661 /* Take in consideration the sign changes only in the last step */
662 if(tmp_type.width == dst_type.width)
663 tmp_type.sign = dst_type.sign;
664
665 num_srcs /= 2;
666
667 for(i = 0; i < num_srcs; ++i)
668 tmp[i] = pack2(gallivm, src_type, tmp_type,
669 tmp[2*i + 0], tmp[2*i + 1]);
670
671 src_type = tmp_type;
672 }
673
674 assert(num_srcs == 1);
675
676 return tmp[0];
677 }
678
679
680 /**
681 * Truncate or expand the bitwidth.
682 *
683 * NOTE: Getting the right sign flags is crucial here, as we employ some
684 * intrinsics that do saturation.
685 */
686 void
687 lp_build_resize(struct gallivm_state *gallivm,
688 struct lp_type src_type,
689 struct lp_type dst_type,
690 const LLVMValueRef *src, unsigned num_srcs,
691 LLVMValueRef *dst, unsigned num_dsts)
692 {
693 LLVMBuilderRef builder = gallivm->builder;
694 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
695 unsigned i;
696
697 /*
698 * We don't support float <-> int conversion here. That must be done
699 * before/after calling this function.
700 */
701 assert(src_type.floating == dst_type.floating);
702
703 /*
704 * We don't support double <-> float conversion yet, although it could be
705 * added with little effort.
706 */
707 assert((!src_type.floating && !dst_type.floating) ||
708 src_type.width == dst_type.width);
709
710 /* We must not loose or gain channels. Only precision */
711 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
712
713 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
714 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
715 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
716 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
717
718 if (src_type.width > dst_type.width) {
719 /*
720 * Truncate bit width.
721 */
722
723 /* Conversion must be M:1 */
724 assert(num_dsts == 1);
725
726 if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
727 /*
728 * Register width remains constant -- use vector packing intrinsics
729 */
730 tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
731 }
732 else {
733 if (src_type.width / dst_type.width > num_srcs) {
734 /*
735 * First change src vectors size (with shuffle) so they have the
736 * same size as the destination vector, then pack normally.
737 * Note: cannot use cast/extract because llvm generates atrocious code.
738 */
739 unsigned size_ratio = (src_type.width * src_type.length) /
740 (dst_type.length * dst_type.width);
741 unsigned new_length = src_type.length / size_ratio;
742
743 for (i = 0; i < size_ratio * num_srcs; i++) {
744 unsigned start_index = (i % size_ratio) * new_length;
745 tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
746 start_index, new_length);
747 }
748 num_srcs *= size_ratio;
749 src_type.length = new_length;
750 tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
751 }
752 else {
753 /*
754 * Truncate bit width but expand vector size - first pack
755 * then expand simply because this should be more AVX-friendly
756 * for the cases we probably hit.
757 */
758 unsigned size_ratio = (dst_type.width * dst_type.length) /
759 (src_type.length * src_type.width);
760 unsigned num_pack_srcs = num_srcs / size_ratio;
761 dst_type.length = dst_type.length / size_ratio;
762
763 for (i = 0; i < size_ratio; i++) {
764 tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
765 &src[i*num_pack_srcs], num_pack_srcs);
766 }
767 tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
768 }
769 }
770 }
771 else if (src_type.width < dst_type.width) {
772 /*
773 * Expand bit width.
774 */
775
776 /* Conversion must be 1:N */
777 assert(num_srcs == 1);
778
779 if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
780 /*
781 * Register width remains constant -- use vector unpack intrinsics
782 */
783 lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
784 }
785 else {
786 /*
787 * Do it element-wise.
788 */
789 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
790
791 for (i = 0; i < num_dsts; i++) {
792 tmp[i] = lp_build_undef(gallivm, dst_type);
793 }
794
795 for (i = 0; i < src_type.length; ++i) {
796 unsigned j = i / dst_type.length;
797 LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
798 LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
799 LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
800
801 if (src_type.sign && dst_type.sign) {
802 val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
803 } else {
804 val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
805 }
806 tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
807 }
808 }
809 }
810 else {
811 /*
812 * No-op
813 */
814
815 /* "Conversion" must be N:N */
816 assert(num_srcs == num_dsts);
817
818 for(i = 0; i < num_dsts; ++i)
819 tmp[i] = src[i];
820 }
821
822 for(i = 0; i < num_dsts; ++i)
823 dst[i] = tmp[i];
824 }
825
826
827 /**
828 * Expands src vector from src.length to dst_length
829 */
830 LLVMValueRef
831 lp_build_pad_vector(struct gallivm_state *gallivm,
832 LLVMValueRef src,
833 unsigned dst_length)
834 {
835 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
836 LLVMValueRef undef;
837 LLVMTypeRef type;
838 unsigned i, src_length;
839
840 type = LLVMTypeOf(src);
841
842 if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
843 /* Can't use ShuffleVector on non-vector type */
844 undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
845 return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
846 }
847
848 undef = LLVMGetUndef(type);
849 src_length = LLVMGetVectorSize(type);
850
851 assert(dst_length <= Elements(elems));
852 assert(dst_length >= src_length);
853
854 if (src_length == dst_length)
855 return src;
856
857 /* All elements from src vector */
858 for (i = 0; i < src_length; ++i)
859 elems[i] = lp_build_const_int32(gallivm, i);
860
861 /* Undef fill remaining space */
862 for (i = src_length; i < dst_length; ++i)
863 elems[i] = lp_build_const_int32(gallivm, src_length);
864
865 /* Combine the two vectors */
866 return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
867 }