gallivm: optimize SoA AoS fallback fetch path a little
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include "pipe/p_defines.h"
30
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34
35 #include "lp_bld_type.h"
36 #include "lp_bld_const.h"
37 #include "lp_bld_conv.h"
38 #include "lp_bld_swizzle.h"
39 #include "lp_bld_gather.h"
40 #include "lp_bld_debug.h"
41 #include "lp_bld_format.h"
42 #include "lp_bld_arit.h"
43 #include "lp_bld_pack.h"
44
45
46 static void
47 convert_to_soa(struct gallivm_state *gallivm,
48 LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
49 LLVMValueRef dst_soa[4],
50 const struct lp_type soa_type)
51 {
52 unsigned j, k;
53 struct lp_type aos_channel_type = soa_type;
54
55 LLVMValueRef aos_channels[4];
56 unsigned pixels_per_channel = soa_type.length / 4;
57
58 debug_assert((soa_type.length % 4) == 0);
59
60 aos_channel_type.length >>= 1;
61
62 for (j = 0; j < 4; ++j) {
63 LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
64
65 assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
66
67 for (k = 0; k < pixels_per_channel; ++k) {
68 channel[k] = src_aos[j + 4 * k];
69 }
70
71 aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
72 }
73
74 lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
75 }
76
77
78 void
79 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
80 struct lp_build_context *bld,
81 const LLVMValueRef *unswizzled,
82 LLVMValueRef swizzled_out[4])
83 {
84 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
85 enum pipe_swizzle swizzle;
86 LLVMValueRef depth_or_stencil;
87
88 if (util_format_has_stencil(format_desc) &&
89 !util_format_has_depth(format_desc)) {
90 assert(!bld->type.floating);
91 swizzle = format_desc->swizzle[1];
92 }
93 else {
94 assert(bld->type.floating);
95 swizzle = format_desc->swizzle[0];
96 }
97 /*
98 * Return zzz1 or sss1 for depth-stencil formats here.
99 * Correct swizzling will be handled by apply_sampler_swizzle() later.
100 */
101 depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
102
103 swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
104 swizzled_out[3] = bld->one;
105 }
106 else {
107 unsigned chan;
108 for (chan = 0; chan < 4; ++chan) {
109 enum pipe_swizzle swizzle = format_desc->swizzle[chan];
110 swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
111 }
112 }
113 }
114
115
116 /**
117 * Unpack several pixels in SoA.
118 *
119 * It takes a vector of packed pixels:
120 *
121 * packed = {P0, P1, P2, P3, ..., Pn}
122 *
123 * And will produce four vectors:
124 *
125 * red = {R0, R1, R2, R3, ..., Rn}
126 * green = {G0, G1, G2, G3, ..., Gn}
127 * blue = {B0, B1, B2, B3, ..., Bn}
128 * alpha = {A0, A1, A2, A3, ..., An}
129 *
130 * It requires that a packed pixel fits into an element of the output
131 * channels. The common case is when converting pixel with a depth of 32 bit or
132 * less into floats.
133 *
134 * \param format_desc the format of the 'packed' incoming pixel vector
135 * \param type the desired type for rgba_out (type.length = n, above)
136 * \param packed the incoming vector of packed pixels
137 * \param rgba_out returns the SoA R,G,B,A vectors
138 */
139 void
140 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
141 const struct util_format_description *format_desc,
142 struct lp_type type,
143 LLVMValueRef packed,
144 LLVMValueRef rgba_out[4])
145 {
146 LLVMBuilderRef builder = gallivm->builder;
147 struct lp_build_context bld;
148 LLVMValueRef inputs[4];
149 unsigned chan;
150
151 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
152 assert(format_desc->block.width == 1);
153 assert(format_desc->block.height == 1);
154 assert(format_desc->block.bits <= type.width);
155 /* FIXME: Support more output types */
156 assert(type.width == 32);
157
158 lp_build_context_init(&bld, gallivm, type);
159
160 /* Decode the input vector components */
161 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
162 const unsigned width = format_desc->channel[chan].size;
163 const unsigned start = format_desc->channel[chan].shift;
164 const unsigned stop = start + width;
165 LLVMValueRef input;
166
167 input = packed;
168
169 switch(format_desc->channel[chan].type) {
170 case UTIL_FORMAT_TYPE_VOID:
171 input = lp_build_undef(gallivm, type);
172 break;
173
174 case UTIL_FORMAT_TYPE_UNSIGNED:
175 /*
176 * Align the LSB
177 */
178
179 if (start) {
180 input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
181 }
182
183 /*
184 * Zero the MSBs
185 */
186
187 if (stop < format_desc->block.bits) {
188 unsigned mask = ((unsigned long long)1 << width) - 1;
189 input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
190 }
191
192 /*
193 * Type conversion
194 */
195
196 if (type.floating) {
197 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
198 if (format_desc->swizzle[3] == chan) {
199 input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
200 }
201 else {
202 struct lp_type conv_type = lp_uint_type(type);
203 input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
204 }
205 }
206 else {
207 if(format_desc->channel[chan].normalized)
208 input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
209 else
210 input = LLVMBuildSIToFP(builder, input,
211 lp_build_vec_type(gallivm, type), "");
212 }
213 }
214 else if (format_desc->channel[chan].pure_integer) {
215 /* Nothing to do */
216 } else {
217 /* FIXME */
218 assert(0);
219 }
220
221 break;
222
223 case UTIL_FORMAT_TYPE_SIGNED:
224 /*
225 * Align the sign bit first.
226 */
227
228 if (stop < type.width) {
229 unsigned bits = type.width - stop;
230 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
231 input = LLVMBuildShl(builder, input, bits_val, "");
232 }
233
234 /*
235 * Align the LSB (with an arithmetic shift to preserve the sign)
236 */
237
238 if (format_desc->channel[chan].size < type.width) {
239 unsigned bits = type.width - format_desc->channel[chan].size;
240 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
241 input = LLVMBuildAShr(builder, input, bits_val, "");
242 }
243
244 /*
245 * Type conversion
246 */
247
248 if (type.floating) {
249 input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
250 if (format_desc->channel[chan].normalized) {
251 double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
252 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
253 input = LLVMBuildFMul(builder, input, scale_val, "");
254 /* the formula above will produce value below -1.0 for most negative
255 * value but everything seems happy with that hence disable for now */
256 if (0)
257 input = lp_build_max(&bld, input,
258 lp_build_const_vec(gallivm, type, -1.0f));
259 }
260 }
261 else if (format_desc->channel[chan].pure_integer) {
262 /* Nothing to do */
263 } else {
264 /* FIXME */
265 assert(0);
266 }
267
268 break;
269
270 case UTIL_FORMAT_TYPE_FLOAT:
271 if (type.floating) {
272 if (format_desc->channel[chan].size == 16) {
273 struct lp_type f16i_type = type;
274 f16i_type.width /= 2;
275 f16i_type.floating = 0;
276 if (start) {
277 input = LLVMBuildLShr(builder, input,
278 lp_build_const_int_vec(gallivm, type, start), "");
279 }
280 input = LLVMBuildTrunc(builder, input,
281 lp_build_vec_type(gallivm, f16i_type), "");
282 input = lp_build_half_to_float(gallivm, input);
283 } else {
284 assert(start == 0);
285 assert(stop == 32);
286 assert(type.width == 32);
287 }
288 input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
289 }
290 else {
291 /* FIXME */
292 assert(0);
293 input = lp_build_undef(gallivm, type);
294 }
295 break;
296
297 case UTIL_FORMAT_TYPE_FIXED:
298 if (type.floating) {
299 double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
300 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
301 input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
302 input = LLVMBuildFMul(builder, input, scale_val, "");
303 }
304 else {
305 /* FIXME */
306 assert(0);
307 input = lp_build_undef(gallivm, type);
308 }
309 break;
310
311 default:
312 assert(0);
313 input = lp_build_undef(gallivm, type);
314 break;
315 }
316
317 inputs[chan] = input;
318 }
319
320 lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
321 }
322
323
324 /**
325 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
326 *
327 * \param dst_type The desired return type. For pure integer formats
328 * this should be a 32bit wide int or uint vector type,
329 * otherwise a float vector type.
330 *
331 * \param packed The rgba8 values to pack.
332 *
333 * \param rgba The 4 SoA return vectors.
334 */
335 void
336 lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
337 struct lp_type dst_type,
338 LLVMValueRef packed,
339 LLVMValueRef *rgba)
340 {
341 LLVMBuilderRef builder = gallivm->builder;
342 LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
343 unsigned chan;
344
345 /* XXX technically shouldn't use that for uint dst_type */
346 packed = LLVMBuildBitCast(builder, packed,
347 lp_build_int_vec_type(gallivm, dst_type), "");
348
349 /* Decode the input vector components */
350 for (chan = 0; chan < 4; ++chan) {
351 #ifdef PIPE_ARCH_LITTLE_ENDIAN
352 unsigned start = chan*8;
353 #else
354 unsigned start = (3-chan)*8;
355 #endif
356 unsigned stop = start + 8;
357 LLVMValueRef input;
358
359 input = packed;
360
361 if (start)
362 input = LLVMBuildLShr(builder, input,
363 lp_build_const_int_vec(gallivm, dst_type, start), "");
364
365 if (stop < 32)
366 input = LLVMBuildAnd(builder, input, mask, "");
367
368 if (dst_type.floating)
369 input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
370
371 rgba[chan] = input;
372 }
373 }
374
375
376
377 /**
378 * Fetch a texels from a texture, returning them in SoA layout.
379 *
380 * \param type the desired return type for 'rgba'. The vector length
381 * is the number of texels to fetch
382 * \param aligned if the offset is guaranteed to be aligned to element width
383 *
384 * \param base_ptr points to the base of the texture mip tree.
385 * \param offset offset to start of the texture image block. For non-
386 * compressed formats, this simply is an offset to the texel.
387 * For compressed formats, it is an offset to the start of the
388 * compressed data block.
389 *
390 * \param i, j the sub-block pixel coordinates. For non-compressed formats
391 * these will always be (0,0). For compressed formats, i will
392 * be in [0, block_width-1] and j will be in [0, block_height-1].
393 * \param cache optional value pointing to a lp_build_format_cache structure
394 */
395 void
396 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
397 const struct util_format_description *format_desc,
398 struct lp_type type,
399 boolean aligned,
400 LLVMValueRef base_ptr,
401 LLVMValueRef offset,
402 LLVMValueRef i,
403 LLVMValueRef j,
404 LLVMValueRef cache,
405 LLVMValueRef rgba_out[4])
406 {
407 LLVMBuilderRef builder = gallivm->builder;
408
409 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
410 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
411 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
412 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
413 format_desc->block.width == 1 &&
414 format_desc->block.height == 1 &&
415 format_desc->block.bits <= type.width &&
416 (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
417 format_desc->channel[0].size == 32 ||
418 format_desc->channel[0].size == 16))
419 {
420 /*
421 * The packed pixel fits into an element of the destination format. Put
422 * the packed pixels into a vector and extract each component for all
423 * vector elements in parallel.
424 */
425
426 LLVMValueRef packed;
427
428 /*
429 * gather the texels from the texture
430 * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
431 */
432 assert(format_desc->block.bits <= type.width);
433 packed = lp_build_gather(gallivm,
434 type.length,
435 format_desc->block.bits,
436 type.width,
437 aligned,
438 base_ptr, offset, FALSE);
439
440 /*
441 * convert texels to float rgba
442 */
443 lp_build_unpack_rgba_soa(gallivm,
444 format_desc,
445 type,
446 packed, rgba_out);
447 return;
448 }
449
450 if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
451 format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
452 /*
453 * similar conceptually to above but requiring special
454 * AoS packed -> SoA float conversion code.
455 */
456 LLVMValueRef packed;
457
458 assert(type.floating);
459 assert(type.width == 32);
460
461 packed = lp_build_gather(gallivm, type.length,
462 format_desc->block.bits,
463 type.width, aligned,
464 base_ptr, offset, FALSE);
465 if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
466 lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
467 }
468 else {
469 lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
470 }
471 return;
472 }
473
474 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
475 format_desc->block.bits == 64) {
476 /*
477 * special case the format is 64 bits but we only require
478 * 32bit (or 8bit) from each block.
479 */
480 LLVMValueRef packed;
481
482 if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
483 /*
484 * for stencil simply fix up offsets - could in fact change
485 * base_ptr instead even outside the shader.
486 */
487 unsigned mask = (1 << 8) - 1;
488 LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
489 offset = LLVMBuildAdd(builder, offset, s_offset, "");
490 packed = lp_build_gather(gallivm, type.length, 32, type.width,
491 aligned, base_ptr, offset, FALSE);
492 packed = LLVMBuildAnd(builder, packed,
493 lp_build_const_int_vec(gallivm, type, mask), "");
494 }
495 else {
496 assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
497 packed = lp_build_gather(gallivm, type.length, 32, type.width,
498 aligned, base_ptr, offset, TRUE);
499 packed = LLVMBuildBitCast(builder, packed,
500 lp_build_vec_type(gallivm, type), "");
501 }
502 /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
503 rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
504 rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
505 return;
506 }
507
508 /*
509 * Try calling lp_build_fetch_rgba_aos for all pixels.
510 */
511
512 if (util_format_fits_8unorm(format_desc) &&
513 type.floating && type.width == 32 &&
514 (type.length == 1 || (type.length % 4 == 0))) {
515 struct lp_type tmp_type;
516 LLVMValueRef tmp;
517
518 memset(&tmp_type, 0, sizeof tmp_type);
519 tmp_type.width = 8;
520 tmp_type.length = type.length * 4;
521 tmp_type.norm = TRUE;
522
523 tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
524 aligned, base_ptr, offset, i, j, cache);
525
526 lp_build_rgba8_to_fi32_soa(gallivm,
527 type,
528 tmp,
529 rgba_out);
530
531 return;
532 }
533
534 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
535 /* non-srgb case is already handled above */
536 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
537 type.floating && type.width == 32 &&
538 (type.length == 1 || (type.length % 4 == 0)) &&
539 cache) {
540 const struct util_format_description *format_decompressed;
541 const struct util_format_description *flinear_desc;
542 LLVMValueRef packed;
543 flinear_desc = util_format_description(util_format_linear(format_desc->format));
544 /* This probably only works with aligned data */
545 packed = lp_build_fetch_cached_texels(gallivm,
546 flinear_desc,
547 type.length,
548 base_ptr,
549 offset,
550 i, j,
551 cache);
552 packed = LLVMBuildBitCast(builder, packed,
553 lp_build_int_vec_type(gallivm, type), "");
554 /*
555 * The values are now packed so they match ordinary srgb RGBA8 format,
556 * hence need to use matching format for unpack.
557 */
558 format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
559
560 lp_build_unpack_rgba_soa(gallivm,
561 format_decompressed,
562 type,
563 packed, rgba_out);
564
565 return;
566 }
567
568 /*
569 * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
570 *
571 * This is not the most efficient way of fetching pixels, as we
572 * miss some opportunities to do vectorization, but this is
573 * convenient for formats or scenarios for which there was no
574 * opportunity or incentive to optimize.
575 */
576
577 {
578 unsigned k;
579 struct lp_type tmp_type;
580 LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
581
582 if (gallivm_debug & GALLIVM_DEBUG_PERF) {
583 debug_printf("%s: AoS fetch fallback for %s\n",
584 __FUNCTION__, format_desc->short_name);
585 }
586
587 tmp_type = type;
588 tmp_type.length = 4;
589
590 /*
591 * Note that vector transpose can be worse compared to insert/extract
592 * for aos->soa conversion (for formats with 1 or 2 channels). However,
593 * we should try to avoid getting here for just about all formats, so
594 * don't bother.
595 */
596
597 /* loop over number of pixels */
598 for(k = 0; k < type.length; ++k) {
599 LLVMValueRef index = lp_build_const_int32(gallivm, k);
600 LLVMValueRef offset_elem;
601 LLVMValueRef i_elem, j_elem;
602
603 offset_elem = LLVMBuildExtractElement(builder, offset,
604 index, "");
605
606 i_elem = LLVMBuildExtractElement(builder, i, index, "");
607 j_elem = LLVMBuildExtractElement(builder, j, index, "");
608
609 /* Get a single float[4]={R,G,B,A} pixel */
610 aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
611 aligned, base_ptr, offset_elem,
612 i_elem, j_elem, cache);
613
614 }
615 convert_to_soa(gallivm, aos_fetch, rgba_out, type);
616 }
617 }