gallium: implement ARB_texture_query_levels
[mesa.git] / src/gallium/auxiliary/gallivm/lp_bld_pack.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for packing/unpacking.
 *
 * Packing/unpacking is necessary for conversion between types of different
 * bit width.
 *
 * They are also commonly used when a computation needs higher
 * precision for the intermediate values. For example, if one needs the
 * function:
 *
 *   c = compute(a, b);
 *
 * to use more precision for intermediate results, then one should implement
 * it as:
 *
 *   LLVMValueRef
 *   compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
 *   {
 *      struct lp_type wide_type = lp_wider_type(type);
 *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *      lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
 *      lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
 *
 *      cl = compute_half(al, bl);
 *      ch = compute_half(ah, bh);
 *
 *      c = lp_build_pack2(builder, wide_type, type, cl, ch);
 *
 *      return c;
 *   }
 *
 * where compute_half() would do the computation for half the elements with
 * twice the precision.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_memory.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_swizzle.h"


/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
                              unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}
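
/*
 * Illustration (not part of the original file): for n = 8 the loop above
 * yields the shuffle masks
 *
 *    lo_hi = 0:  <0, 8, 1, 9, 2, 10, 3, 11>    (low-half interleave)
 *    lo_hi = 1:  <4, 12, 5, 13, 6, 14, 7, 15>  (high-half interleave)
 *
 * i.e. applied to vectors a and b it selects a0 b0 a1 b1 a2 b2 a3 b3,
 * resp. a4 b4 a5 b5 a6 b6 a7 b7 -- exactly the PUNPCKL/PUNPCKH orderings.
 */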

/**
 * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
 * See comment above lp_build_interleave2_half for more details.
 */
static LLVMValueRef
lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
                                   unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
      if (i == (n / 2))
         j += n / 4;

      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}
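
/*
 * Illustration (not part of the original file): for n = 8 the masks are
 *
 *    lo_hi = 0:  <0, 8, 1, 9, 4, 12, 5, 13>    ->  a0 b0 a1 b1 a4 b4 a5 b5
 *    lo_hi = 1:  <2, 10, 3, 11, 6, 14, 7, 15>  ->  a2 b2 a3 b3 a6 b6 a7 b7
 *
 * which is the per-128bit-lane interleave that the AVX VUNPCKL/VUNPCKH
 * instructions actually perform.
 */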

/**
 * Build shuffle vectors that match PACKxx (SSE) instructions or
 * VPERM (Altivec).
 */
static LLVMValueRef
lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   for(i = 0; i < n; ++i)
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      elems[i] = lp_build_const_int32(gallivm, 2*i);
#else
      elems[i] = lp_build_const_int32(gallivm, 2*i+1);
#endif

   return LLVMConstVector(elems, n);
}
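
/*
 * Illustration (not part of the original file): for n = 4 on little endian
 * this builds <0, 2, 4, 6>. Once the two wide input vectors are bitcast to
 * the narrow element type, the even-numbered narrow elements are the low
 * (truncated) halves of each wide element, so this shuffle performs the
 * truncating pack; big endian picks the odd elements instead.
 */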

/**
 * Return a vector with elements src[start:start+size]
 * Most useful for getting half the values out of a 256bit sized vector,
 * otherwise it may cause data rearrangement to happen.
 */
LLVMValueRef
lp_build_extract_range(struct gallivm_state *gallivm,
                       LLVMValueRef src,
                       unsigned start,
                       unsigned size)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(size <= Elements(elems));

   for (i = 0; i < size; ++i)
      elems[i] = lp_build_const_int32(gallivm, i + start);

   if (size == 1) {
      return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
   }
   else {
      return LLVMBuildShuffleVector(gallivm->builder, src, src,
                                    LLVMConstVector(elems, size), "");
   }
}
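
/*
 * Usage sketch (hypothetical, not part of the original file): splitting a
 * 256bit vector of 8 floats into its two 128bit halves:
 *
 *    LLVMValueRef lo = lp_build_extract_range(gallivm, src, 0, 4);
 *    LLVMValueRef hi = lp_build_extract_range(gallivm, src, 4, 4);
 *
 * Both calls are lane-aligned, so they should map to simple
 * vextractf128-style operations rather than a general shuffle.
 */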

/**
 * Concatenates several (must be a power of 2) vectors (of same type)
 * into a larger one.
 * Most useful for building up a 256bit sized vector out of two 128bit ones.
 */
LLVMValueRef
lp_build_concat(struct gallivm_state *gallivm,
                LLVMValueRef src[],
                struct lp_type src_type,
                unsigned num_vectors)
{
   unsigned new_length, i;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

   assert(src_type.length * num_vectors <= Elements(shuffles));
   assert(util_is_power_of_two(num_vectors));

   new_length = src_type.length;

   for (i = 0; i < num_vectors; i++)
      tmp[i] = src[i];

   while (num_vectors > 1) {
      num_vectors >>= 1;
      new_length <<= 1;
      for (i = 0; i < new_length; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, i);
      }
      for (i = 0; i < num_vectors; i++) {
         tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
                                         LLVMConstVector(shuffles, new_length), "");
      }
   }

   return tmp[0];
}
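
/*
 * Usage sketch (hypothetical, not part of the original file): building a
 * 256bit vector of 8 floats from two 128bit halves, assuming src_type is
 * a 4 x float type:
 *
 *    LLVMValueRef halves[2] = { lo, hi };
 *    LLVMValueRef wide = lp_build_concat(gallivm, halves, src_type, 2);
 *
 * Each while-loop iteration above halves the vector count and doubles the
 * vector length, so num_vectors inputs are merged in log2(num_vectors)
 * rounds of identity shuffles.
 */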


/**
 * Combines vectors to reduce from num_srcs to num_dsts.
 * Returns the number of src vectors concatenated in a single dst.
 *
 * num_srcs must be exactly divisible by num_dsts.
 *
 * e.g. For num_srcs = 4 and src = [x, y, z, w]
 *      num_dsts = 1  dst = [xyzw]    return = 4
 *      num_dsts = 2  dst = [xy, zw]  return = 2
 */
int
lp_build_concat_n(struct gallivm_state *gallivm,
                  struct lp_type src_type,
                  LLVMValueRef *src,
                  unsigned num_srcs,
                  LLVMValueRef *dst,
                  unsigned num_dsts)
{
   int size = num_srcs / num_dsts;
   int i;

   assert(num_srcs >= num_dsts);
   assert((num_srcs % size) == 0);

   if (num_srcs == num_dsts) {
      for (i = 0; i < num_dsts; ++i) {
         dst[i] = src[i];
      }
      return 1;
   }

   for (i = 0; i < num_dsts; ++i) {
      dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
   }

   return size;
}


/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
 * (but not for 256bit AVX vectors).
 */
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
      /*
       * XXX: This is a workaround for an llvm code generation deficiency.
       * Strangely enough, while this operation needs vinsertf128/vextractf128
       * instructions (hence a natural match when using 2x128bit vectors), the
       * "normal" unpack shuffle generates code ranging from atrocious (llvm
       * 3.1) to terrible (llvm 3.2, 3.3). So use some different shuffles
       * instead (the exact shuffles don't seem to matter, as long as they
       * avoid 128bit wide vectors; 8x32 and 4x64 both work).
       */
      struct lp_type tmp_type = type;
      LLVMValueRef srchalf[2], tmpdst;
      tmp_type.length = 4;
      tmp_type.width = 64;
      a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
      b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
      srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
      srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
      tmp_type.length = 2;
      tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
      return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
   }

   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);

   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}
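
/*
 * Illustration (not part of the original file): for a 4 x float type with
 * a = a0 a1 a2 a3 and b = b0 b1 b2 b3,
 *
 *    lp_build_interleave2(gallivm, type, a, b, 0)  ->  a0 b0 a1 b1
 *    lp_build_interleave2(gallivm, type, a, b, 1)  ->  a2 b2 a3 b3
 *
 * matching the unpcklps/unpckhps semantics on SSE.
 */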

/**
 * Interleave vector elements, but at the 256 bit level: treat the inputs as
 * two concatenated 128 bit vectors each.
 *
 * This differs from lp_build_interleave2, as that function would produce
 * (for lo) a0 b0 a1 b1 a2 b2 a3 b3, which does not compile into an AVX
 * unpack instruction.
 *
 * An example interleave 8x float with 8x float on AVX 256bit unpack:
 *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
 *
 * Equivalent to interleaving 2x 128 bit vectors
 *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
 *
 * So interleave-lo would result in:
 *   a0 b0 a1 b1 a4 b4 a5 b5
 *
 * And interleave-hi would result in:
 *   a2 b2 a3 b3 a6 b6 a7 b7
 */
LLVMValueRef
lp_build_interleave2_half(struct gallivm_state *gallivm,
                          struct lp_type type,
                          LLVMValueRef a,
                          LLVMValueRef b,
                          unsigned lo_hi)
{
   if (type.length * type.width == 256) {
      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
   } else {
      return lp_build_interleave2(gallivm, type, a, b, lo_hi);
   }
}

/**
 * Double the bit width.
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack2(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}
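
/*
 * Usage sketch (hypothetical, not part of the original file): widening a
 * signed 8 x i16 vector to two 4 x i32 vectors. msb holds the sign
 * replication (src >> 15, arithmetic), and on little endian the interleave
 * (s0 m0 s1 m1 ...) followed by the bitcast yields sign-extended 32bit
 * values:
 *
 *    struct lp_type src_type = { 0 };
 *    src_type.width = 16;  src_type.length = 8;  src_type.sign = TRUE;
 *    struct lp_type dst_type = lp_wider_type(src_type);   // 4 x i32
 *    LLVMValueRef lo, hi;
 *    lp_build_unpack2(gallivm, src_type, dst_type, src, &lo, &hi);
 */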


/**
 * Expand the bit width.
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width *= 2;
      tmp_type.length /= 2;

      for(i = num_tmps; i--; ) {
         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
      }

      src_type = tmp_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}
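
/*
 * Usage sketch (hypothetical, not part of the original file): expanding a
 * 16 x i8 vector into four 4 x i32 vectors in two doubling steps
 * (8 -> 16 -> 32 bits):
 *
 *    LLVMValueRef dst[4];
 *    lp_build_unpack(gallivm, src_type,                       // 16 x i8
 *                    lp_wider_type(lp_wider_type(src_type)),  // 4 x i32
 *                    src, dst, 4);
 *
 * Note the backwards inner loop (i = num_tmps; i--;), which lets each
 * lp_build_unpack2 write its two results in place without clobbering
 * inputs that have not been read yet.
 */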


/**
 * Non-interleaved pack.
 *
 * This will move values as:
 *          (LSB)                  (MSB)
 *   lo =   l0 __ l1 __ l2 __ .. __ ln __
 *   hi =   h0 __ h1 __ h2 __ .. __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This will only change the number of bits with which the values are
 * represented, not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead if that cannot be guaranteed.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
      src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;

      switch(src_type.width) {
      case 32:
         if (util_cpu_caps.has_sse2) {
            if(dst_type.sign) {
               intrinsic = "llvm.x86.sse2.packssdw.128";
            }
            else {
               if (util_cpu_caps.has_sse4_1) {
                  intrinsic = "llvm.x86.sse41.packusdw";
               }
            }
         } else if (util_cpu_caps.has_altivec) {
            if (dst_type.sign) {
               intrinsic = "llvm.ppc.altivec.vpkswus";
            } else {
               intrinsic = "llvm.ppc.altivec.vpkuwus";
            }
         }
         break;
      case 16:
         if (dst_type.sign) {
            if (util_cpu_caps.has_sse2) {
               intrinsic = "llvm.x86.sse2.packsswb.128";
            } else if (util_cpu_caps.has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshss";
            }
         } else {
            if (util_cpu_caps.has_sse2) {
               intrinsic = "llvm.x86.sse2.packuswb.128";
            } else if (util_cpu_caps.has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshus";
            }
         }
         break;
      /* default uses generic shuffle below */
      }
      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + nlen, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
               }
            }
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + nlen, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}
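
/*
 * Usage sketch (hypothetical, not part of the original file): packing two
 * 8 x i16 vectors into one 16 x i8 vector. With SSE2 and signed, in-range
 * inputs this should become a single packsswb:
 *
 *    LLVMValueRef res = lp_build_pack2(gallivm, i16_type, i8_type, lo, hi);
 *
 * where i16_type is an 8 x i16 type and i8_type the matching 16 x i8 type
 * (both names are made up for this sketch).
 */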



/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2 but will saturate values so that they fit into the
 * destination type.
 */
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
      src_type.width * src_type.length >= 128 &&
      src_type.sign &&
      (src_type.width == 32 || src_type.width == 16))
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, gallivm, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about lower bound? */
   }

   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
}


/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
 */
LLVMValueRef
lp_build_pack(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
                         struct lp_type src_type,
                         struct lp_type dst_type,
                         LLVMValueRef lo,
                         LLVMValueRef hi);
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length);

   if(clamped)
      pack2 = &lp_build_pack2;
   else
      pack2 = &lp_build_packs2;

   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width /= 2;
      tmp_type.length *= 2;

      /* Take into consideration the sign changes only in the last step */
      if(tmp_type.width == dst_type.width)
         tmp_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = pack2(gallivm, src_type, tmp_type,
                        tmp[2*i + 0], tmp[2*i + 1]);

      src_type = tmp_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}
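
/*
 * Usage sketch (hypothetical, not part of the original file): narrowing
 * four 4 x i32 vectors to a single 16 x i8 vector. The while loop runs
 * twice (32 -> 16 -> 8 bits), halving the vector count each round:
 *
 *    LLVMValueRef src[4] = { v0, v1, v2, v3 };
 *    LLVMValueRef res =
 *       lp_build_pack(gallivm, i32_type, i8_type, FALSE, src, 4);
 *
 * Passing clamped = FALSE selects the saturating lp_build_packs2 for each
 * step; pass TRUE only if the inputs are already known to be in range.
 */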


/**
 * Truncate or expand the bitwidth.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      /* Conversion must be M:1 */
      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector packing intrinsics
          */
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         if (src_type.width / dst_type.width > num_srcs) {
            /*
             * First change src vectors size (with shuffle) so they have the
             * same size as the destination vector, then pack normally.
             * Note: cannot use cast/extract because llvm generates atrocious code.
             */
            unsigned size_ratio = (src_type.width * src_type.length) /
                                  (dst_type.length * dst_type.width);
            unsigned new_length = src_type.length / size_ratio;

            for (i = 0; i < size_ratio * num_srcs; i++) {
               unsigned start_index = (i % size_ratio) * new_length;
               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
                                               start_index, new_length);
            }
            num_srcs *= size_ratio;
            src_type.length = new_length;
            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
         }
         else {
            /*
             * Truncate bit width but expand vector size - first pack
             * then expand simply because this should be more AVX-friendly
             * for the cases we probably hit.
             */
            unsigned size_ratio = (dst_type.width * dst_type.length) /
                                  (src_type.length * src_type.width);
            unsigned num_pack_srcs = num_srcs / size_ratio;
            dst_type.length = dst_type.length / size_ratio;

            for (i = 0; i < size_ratio; i++) {
               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                      &src[i*num_pack_srcs], num_pack_srcs);
            }
            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      /* Conversion must be 1:N */
      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (i = 0; i < num_dsts; i++) {
            tmp[i] = lp_build_undef(gallivm, dst_type);
         }

         for (i = 0; i < src_type.length; ++i) {
            unsigned j = i / dst_type.length;
            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      /* "Conversion" must be N:N */
      assert(num_srcs == num_dsts);

      for(i = 0; i < num_dsts; ++i)
         tmp[i] = src[i];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}
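
/*
 * Usage sketch (hypothetical, not part of the original file): resizing one
 * 256bit vector of 8 x i32 to one 128bit vector of 8 x i16. Since
 * src_type.width / dst_type.width (2) exceeds num_srcs (1), the source is
 * first split into two 4 x i32 halves, which are then packed:
 *
 *    LLVMValueRef in = wide;
 *    LLVMValueRef out;
 *    lp_build_resize(gallivm, i32x8_type, i16x8_type, &in, 1, &out, 1);
 *
 * (i32x8_type and i16x8_type are made-up names for the two lp_types.)
 */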


/**
 * Expands src vector from src.length to dst_length
 */
LLVMValueRef
lp_build_pad_vector(struct gallivm_state *gallivm,
                    LLVMValueRef src,
                    unsigned dst_length)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   LLVMValueRef undef;
   LLVMTypeRef type;
   unsigned i, src_length;

   type = LLVMTypeOf(src);

   if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
      /* Can't use ShuffleVector on non-vector type */
      undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
      return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
   }

   undef = LLVMGetUndef(type);
   src_length = LLVMGetVectorSize(type);

   assert(dst_length <= Elements(elems));
   assert(dst_length >= src_length);

   if (src_length == dst_length)
      return src;

   /* All elements from src vector */
   for (i = 0; i < src_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, i);

   /* Undef fill remaining space */
   for (i = src_length; i < dst_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, src_length);

   /* Combine the two vectors */
   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
}