1f21fa030077d9400de2daa097c626459ef83977
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include "pipe/p_defines.h"
30
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34 #include "util/u_math.h"
35
36 #include "lp_bld_type.h"
37 #include "lp_bld_const.h"
38 #include "lp_bld_conv.h"
39 #include "lp_bld_swizzle.h"
40 #include "lp_bld_gather.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_format.h"
43 #include "lp_bld_arit.h"
44 #include "lp_bld_pack.h"
45 #include "lp_bld_flow.h"
46 #include "lp_bld_printf.h"
47 #include "lp_bld_intr.h"
48
49 static void
50 convert_to_soa(struct gallivm_state *gallivm,
51 LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
52 LLVMValueRef dst_soa[4],
53 const struct lp_type soa_type)
54 {
55 unsigned j, k;
56 struct lp_type aos_channel_type = soa_type;
57
58 LLVMValueRef aos_channels[4];
59 unsigned pixels_per_channel = soa_type.length / 4;
60
61 debug_assert((soa_type.length % 4) == 0);
62
63 aos_channel_type.length >>= 1;
64
65 for (j = 0; j < 4; ++j) {
66 LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
67
68 assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
69
70 for (k = 0; k < pixels_per_channel; ++k) {
71 channel[k] = src_aos[j + 4 * k];
72 }
73
74 aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
75 }
76
77 lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
78 }
79
80
81 void
82 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
83 struct lp_build_context *bld,
84 const LLVMValueRef *unswizzled,
85 LLVMValueRef swizzled_out[4])
86 {
87 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
88 enum pipe_swizzle swizzle;
89 LLVMValueRef depth_or_stencil;
90
91 if (util_format_has_stencil(format_desc) &&
92 !util_format_has_depth(format_desc)) {
93 assert(!bld->type.floating);
94 swizzle = format_desc->swizzle[1];
95 }
96 else {
97 assert(bld->type.floating);
98 swizzle = format_desc->swizzle[0];
99 }
100 /*
101 * Return zzz1 or sss1 for depth-stencil formats here.
102 * Correct swizzling will be handled by apply_sampler_swizzle() later.
103 */
104 depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
105
106 swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
107 swizzled_out[3] = bld->one;
108 }
109 else {
110 unsigned chan;
111 for (chan = 0; chan < 4; ++chan) {
112 enum pipe_swizzle swizzle = format_desc->swizzle[chan];
113 swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
114 }
115 }
116 }
117
118
119
/**
 * Extract a single channel from a vector of packed pixel blocks and convert
 * it to the element type of @bld (one value per SoA lane).
 *
 * \param bld        context describing the destination SoA type
 * \param blockbits  total bit width of the packed block (used to skip the
 *                   masking step when the channel already occupies the MSBs)
 * \param srgb_chan  TRUE to apply sRGB->linear conversion (unsigned only)
 * \param chan_desc  channel description: type, size, shift, normalized, ...
 * \param packed     vector of packed blocks, one per SoA lane
 * \return           the decoded channel values (bld->undef for VOID channels)
 */
static LLVMValueRef
lp_build_extract_soa_chan(struct lp_build_context *bld,
                          unsigned blockbits,
                          boolean srgb_chan,
                          struct util_format_channel_description chan_desc,
                          LLVMValueRef packed)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef input = packed;
   const unsigned width = chan_desc.size;
   const unsigned start = chan_desc.shift;
   const unsigned stop = start + width;

   /* Decode the input vector component */

   switch(chan_desc.type) {
   case UTIL_FORMAT_TYPE_VOID:
      /* Padding channel (e.g. X in RGBX): no defined value. */
      input = bld->undef;
      break;

   case UTIL_FORMAT_TYPE_UNSIGNED:
      /*
       * Align the LSB
       */
      if (start) {
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, type, start), "");
      }

      /*
       * Zero the MSBs
       */
      if (stop < blockbits) {
         unsigned mask = ((unsigned long long)1 << width) - 1;
         input = LLVMBuildAnd(builder, input,
                              lp_build_const_int_vec(gallivm, type, mask), "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         if (srgb_chan) {
            /* sRGB channels are always normalized; decode via LUT/poly. */
            struct lp_type conv_type = lp_uint_type(type);
            input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
         }
         else {
            if(chan_desc.normalized)
               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
            else
               /*
                * NOTE(review): SIToFP on unsigned data — correct while the
                * masked value fits in 31 bits; a full 32-bit unsigned scaled
                * channel would convert wrongly. Presumably such formats
                * don't reach this path — confirm with callers.
                */
               input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
         /* FIXME */
         assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_SIGNED:
      /*
       * Align the sign bit first.
       */
      if (stop < type.width) {
         unsigned bits = type.width - stop;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildShl(builder, input, bits_val, "");
      }

      /*
       * Align the LSB (with an arithmetic shift to preserve the sign)
       */
      if (chan_desc.size < type.width) {
         unsigned bits = type.width - chan_desc.size;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildAShr(builder, input, bits_val, "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         if (chan_desc.normalized) {
            /* Scale so the largest positive value maps to exactly 1.0. */
            double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildFMul(builder, input, scale_val, "");
            /*
             * The formula above will produce value below -1.0 for most negative
             * value but everything seems happy with that hence disable for now.
             */
            if (0)
               input = lp_build_max(bld, input,
                                    lp_build_const_vec(gallivm, type, -1.0f));
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
         /* FIXME */
         assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_FLOAT:
      if (type.floating) {
         if (chan_desc.size == 16) {
            /* Isolate the 16 channel bits, then expand half -> float. */
            struct lp_type f16i_type = type;
            f16i_type.width /= 2;
            f16i_type.floating = 0;
            if (start) {
               input = LLVMBuildLShr(builder, input,
                                     lp_build_const_int_vec(gallivm, type, start), "");
            }
            input = LLVMBuildTrunc(builder, input,
                                   lp_build_vec_type(gallivm, f16i_type), "");
            input = lp_build_half_to_float(gallivm, input);
         } else {
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
         }
         /* Reinterpret the (integer-typed) bits as float. */
         input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   case UTIL_FORMAT_TYPE_FIXED:
      if (type.floating) {
         /*
          * NOTE(review): the scale uses (1 << size/2) - 1; for classic
          * 16.16 fixed point one would expect a plain 1/(1 << size/2)
          * divisor — confirm against the formats that hit this path.
          */
         double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
         LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         input = LLVMBuildFMul(builder, input, scale_val, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   default:
      assert(0);
      input = bld->undef;
      break;
   }

   return input;
}
277
278
279 /**
280 * Unpack several pixels in SoA.
281 *
282 * It takes a vector of packed pixels:
283 *
284 * packed = {P0, P1, P2, P3, ..., Pn}
285 *
286 * And will produce four vectors:
287 *
288 * red = {R0, R1, R2, R3, ..., Rn}
289 * green = {G0, G1, G2, G3, ..., Gn}
290 * blue = {B0, B1, B2, B3, ..., Bn}
291 * alpha = {A0, A1, A2, A3, ..., An}
292 *
293 * It requires that a packed pixel fits into an element of the output
294 * channels. The common case is when converting pixel with a depth of 32 bit or
295 * less into floats.
296 *
297 * \param format_desc the format of the 'packed' incoming pixel vector
298 * \param type the desired type for rgba_out (type.length = n, above)
299 * \param packed the incoming vector of packed pixels
300 * \param rgba_out returns the SoA R,G,B,A vectors
301 */
302 void
303 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
304 const struct util_format_description *format_desc,
305 struct lp_type type,
306 LLVMValueRef packed,
307 LLVMValueRef rgba_out[4])
308 {
309 struct lp_build_context bld;
310 LLVMValueRef inputs[4];
311 unsigned chan;
312
313 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
314 assert(format_desc->block.width == 1);
315 assert(format_desc->block.height == 1);
316 assert(format_desc->block.bits <= type.width);
317 /* FIXME: Support more output types */
318 assert(type.width == 32);
319
320 lp_build_context_init(&bld, gallivm, type);
321
322 /* Decode the input vector components */
323 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
324 struct util_format_channel_description chan_desc = format_desc->channel[chan];
325 boolean srgb_chan = FALSE;
326
327 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
328 format_desc->swizzle[3] != chan) {
329 srgb_chan = TRUE;
330 }
331
332 inputs[chan] = lp_build_extract_soa_chan(&bld,
333 format_desc->block.bits,
334 srgb_chan,
335 chan_desc,
336 packed);
337 }
338
339 lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
340 }
341
342
343 /**
344 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
345 *
346 * \param dst_type The desired return type. For pure integer formats
347 * this should be a 32bit wide int or uint vector type,
348 * otherwise a float vector type.
349 *
350 * \param packed The rgba8 values to pack.
351 *
352 * \param rgba The 4 SoA return vectors.
353 */
354 void
355 lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
356 struct lp_type dst_type,
357 LLVMValueRef packed,
358 LLVMValueRef *rgba)
359 {
360 LLVMBuilderRef builder = gallivm->builder;
361 LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
362 unsigned chan;
363
364 /* XXX technically shouldn't use that for uint dst_type */
365 packed = LLVMBuildBitCast(builder, packed,
366 lp_build_int_vec_type(gallivm, dst_type), "");
367
368 /* Decode the input vector components */
369 for (chan = 0; chan < 4; ++chan) {
370 #ifdef PIPE_ARCH_LITTLE_ENDIAN
371 unsigned start = chan*8;
372 #else
373 unsigned start = (3-chan)*8;
374 #endif
375 unsigned stop = start + 8;
376 LLVMValueRef input;
377
378 input = packed;
379
380 if (start)
381 input = LLVMBuildLShr(builder, input,
382 lp_build_const_int_vec(gallivm, dst_type, start), "");
383
384 if (stop < 32)
385 input = LLVMBuildAnd(builder, input, mask, "");
386
387 if (dst_type.floating)
388 input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
389
390 rgba[chan] = input;
391 }
392 }
393
394
395
/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * Tries a series of fast paths (single-word gather, multi-word gather with
 * SoA shuffles, special packed-float formats, 64bit Z/S) before falling
 * back to per-pixel AoS fetches.
 *
 * \param type the desired return type for 'rgba'. The vector length
 *             is the number of texels to fetch
 * \param aligned if the offset is guaranteed to be aligned to element width
 *
 * \param base_ptr points to the base of the texture mip tree.
 * \param offset offset to start of the texture image block. For non-
 *               compressed formats, this simply is an offset to the texel.
 *               For compressed formats, it is an offset to the start of the
 *               compressed data block.
 *
 * \param i, j the sub-block pixel coordinates. For non-compressed formats
 *             these will always be (0,0). For compressed formats, i will
 *             be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param cache optional value pointing to a lp_build_format_cache structure
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   /* Fast path 1: whole packed pixel fits in one destination element. */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }


   /* Fast path 2: packed pixel is wider than one destination element. */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      struct lp_type fetch_type, gather_type = type;
      /* NB: i, j here shadow the (unused in this path) parameters. */
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 are treated like fp32 except we fetch twice wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
       * albeit we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         /* Pick the offsets for this gather pass out of the offset vector. */
         if(num_gather == 4) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");

         }
         else if (num_gather == 2) {
            assert(num_gather == 2);
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         }
         else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic, we
          * don't really care much if the data is floats or ints...
          * But llvm will refuse to use single float shuffle with int data
          * and instead use 3 int shuffles instead, the code looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
          * going into transpose, which is unpacks, so doesn't really matter
          * much).
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could chose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
          * casts so must avoid them until we're done with the float shuffle...
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
          * there's nothing we can do about them (we could overallocate by
          * those couple bytes and use unaligned but pot sized load).
          * Note that this is very much x86 specific. I don't know if this
          * affect other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            } else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         }
         else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
               fetch_type = lp_type_float(gather_type.width);
            } else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            /* Convert double -> float before the SoA shuffles. */
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      }
      else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      }
      else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr;

            /* Which gathered 32bit word holds this channel. */
#ifdef PIPE_ARCH_BIG_ENDIAN
            vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
#else
            vec_nr = chan_desc.shift / type.width;
#endif
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      }
      else {
         /* fp64 was already converted to float pre-shuffle. */
         for (i = 0; i < format_desc->nr_channels; i++) {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   /* Special packed-float formats needing dedicated unpack code. */
   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }


   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - RGTC snorm formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for(k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);

      }
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}
863
864 static void
865 lp_build_insert_soa_chan(struct lp_build_context *bld,
866 unsigned blockbits,
867 struct util_format_channel_description chan_desc,
868 LLVMValueRef *output,
869 LLVMValueRef rgba)
870 {
871 struct gallivm_state *gallivm = bld->gallivm;
872 LLVMBuilderRef builder = gallivm->builder;
873 struct lp_type type = bld->type;
874 const unsigned width = chan_desc.size;
875 const unsigned start = chan_desc.shift;
876 const unsigned stop = start + width;
877 LLVMValueRef chan;
878 switch(chan_desc.type) {
879 case UTIL_FORMAT_TYPE_UNSIGNED:
880
881 if (chan_desc.pure_integer)
882 chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
883 else if (type.floating) {
884 if (chan_desc.normalized)
885 chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);
886 else
887 chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, "");
888 }
889 if (start)
890 chan = LLVMBuildShl(builder, chan,
891 lp_build_const_int_vec(gallivm, type, start), "");
892 if (!*output)
893 *output = chan;
894 else
895 *output = LLVMBuildOr(builder, *output, chan, "");
896 break;
897 case UTIL_FORMAT_TYPE_SIGNED:
898 if (chan_desc.pure_integer)
899 chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
900 else if (type.floating) {
901 uint32_t mask_val = (1UL << chan_desc.size) - 1;
902 if (chan_desc.normalized) {
903 char intrin[32];
904 double scale = ((1 << (chan_desc.size - 1)) - 1);
905 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
906 rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);
907 rgba = LLVMBuildFMul(builder, rgba, scale_val, "");
908 lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);
909 rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);
910 }
911 chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
912 chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, mask_val), "");
913 }
914 if (start)
915 chan = LLVMBuildShl(builder, chan,
916 lp_build_const_int_vec(gallivm, type, start), "");
917 if (!*output)
918 *output = chan;
919 else
920 *output = LLVMBuildOr(builder, *output, chan, "");
921 break;
922 case UTIL_FORMAT_TYPE_FLOAT:
923 if (type.floating) {
924 if (chan_desc.size == 16) {
925 chan = lp_build_float_to_half(gallivm, rgba);
926 chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");
927 if (start)
928 chan = LLVMBuildShl(builder, chan,
929 lp_build_const_int_vec(gallivm, type, start), "");
930 if (!*output)
931 *output = chan;
932 else
933 *output = LLVMBuildOr(builder, *output, chan, "");
934 } else {
935 assert(start == 0);
936 assert(stop == 32);
937 assert(type.width == 32);
938 *output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
939 }
940 } else
941 assert(0);
942 break;
943 default:
944 assert(0);
945 *output = bld->undef;
946 }
947 }
948
949 static void
950 lp_build_pack_rgba_soa(struct gallivm_state *gallivm,
951 const struct util_format_description *format_desc,
952 struct lp_type type,
953 const LLVMValueRef rgba_in[4],
954 LLVMValueRef *packed)
955 {
956 unsigned chan;
957 struct lp_build_context bld;
958 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
959 assert(format_desc->block.width == 1);
960 assert(format_desc->block.height == 1);
961 assert(format_desc->block.bits <= type.width);
962 /* FIXME: Support more output types */
963 assert(type.width == 32);
964
965 lp_build_context_init(&bld, gallivm, type);
966 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
967 struct util_format_channel_description chan_desc = format_desc->channel[chan];
968
969 lp_build_insert_soa_chan(&bld, format_desc->block.bits,
970 chan_desc,
971 packed,
972 rgba_in[chan]);
973 }
974 }
975
976 void
977 lp_build_store_rgba_soa(struct gallivm_state *gallivm,
978 const struct util_format_description *format_desc,
979 struct lp_type type,
980 LLVMValueRef exec_mask,
981 LLVMValueRef base_ptr,
982 LLVMValueRef offset,
983 LLVMValueRef out_of_bounds,
984 const LLVMValueRef rgba_in[4])
985 {
986 enum pipe_format format = format_desc->format;
987 LLVMValueRef packed[4];
988 unsigned num_stores;
989
990 memset(packed, 0, sizeof(LLVMValueRef) * 4);
991 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
992 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
993 format_desc->block.width == 1 &&
994 format_desc->block.height == 1 &&
995 format_desc->block.bits <= type.width &&
996 (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
997 format_desc->channel[0].size == 32 ||
998 format_desc->channel[0].size == 16))
999 {
1000 lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);
1001
1002 num_stores = 1;
1003 } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
1004 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
1005 format_desc->block.width == 1 &&
1006 format_desc->block.height == 1 &&
1007 format_desc->block.bits > type.width &&
1008 ((format_desc->block.bits <= type.width * type.length &&
1009 format_desc->channel[0].size <= type.width) ||
1010 (format_desc->channel[0].size == 64 &&
1011 format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
1012 type.floating)))
1013 {
1014 /*
1015 * Similar to above, but the packed pixel is larger than what fits
1016 * into an element of the destination format. The packed pixels will be
1017 * shuffled into SoA vectors appropriately, and then the extraction will
1018 * be done in parallel as much as possible.
1019 * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
1020 * the gathered vectors can be shuffled easily (even with avx).
1021 * 64xn float -> 32xn float is handled too but it's a bit special as
1022 * it does the conversion pre-shuffle.
1023 */
1024 struct lp_build_context bld;
1025
1026 lp_build_context_init(&bld, gallivm, type);
1027 assert(type.width == 32);
1028 assert(format_desc->block.bits > type.width);
1029
1030 unsigned store_width = util_next_power_of_two(format_desc->block.bits);
1031 num_stores = store_width / type.width;
1032 for (unsigned i = 0; i < format_desc->nr_channels; i++) {
1033 struct util_format_channel_description chan_desc = format_desc->channel[i];
1034 unsigned blockbits = type.width;
1035 unsigned vec_nr;
1036
1037 vec_nr = chan_desc.shift / type.width;
1038 chan_desc.shift %= type.width;
1039
1040 lp_build_insert_soa_chan(&bld, blockbits,
1041 chan_desc,
1042 &packed[vec_nr],
1043 rgba_in[i]);
1044 }
1045
1046 assert(num_stores == 4 || num_stores == 2);
1047 /* we can transpose and store at the same time */
1048 } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
1049 packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);
1050 num_stores = 1;
1051 } else
1052 assert(0);
1053
1054 assert(exec_mask);
1055
1056 LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
1057 LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);
1058 LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
1059
1060 LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
1061 should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
1062 for (unsigned i = 0; i < num_stores; i++) {
1063 struct lp_build_loop_state loop_state;
1064
1065 LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");
1066 store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");
1067
1068 lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
1069
1070 struct lp_build_if_state ifthen;
1071 LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");
1072 lp_build_if(&ifthen, gallivm, cond);
1073
1074 LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");
1075 LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");
1076
1077 if (format_desc->block.bits == 8) {
1078 this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");
1079 data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");
1080 } else if (format_desc->block.bits == 16) {
1081 this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");
1082 data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");
1083 } else
1084 this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");
1085 LLVMBuildStore(gallivm->builder, data, this_offset);
1086 lp_build_endif(&ifthen);
1087 lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
1088 NULL, LLVMIntUGE);
1089 }
1090 }