1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
29 #include "pipe/p_defines.h"
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34 #include "util/u_math.h"
36 #include "lp_bld_type.h"
37 #include "lp_bld_const.h"
38 #include "lp_bld_conv.h"
39 #include "lp_bld_swizzle.h"
40 #include "lp_bld_gather.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_format.h"
43 #include "lp_bld_arit.h"
44 #include "lp_bld_pack.h"
48 convert_to_soa(struct gallivm_state
*gallivm
,
49 LLVMValueRef src_aos
[LP_MAX_VECTOR_WIDTH
/ 32],
50 LLVMValueRef dst_soa
[4],
51 const struct lp_type soa_type
)
54 struct lp_type aos_channel_type
= soa_type
;
56 LLVMValueRef aos_channels
[4];
57 unsigned pixels_per_channel
= soa_type
.length
/ 4;
59 debug_assert((soa_type
.length
% 4) == 0);
61 aos_channel_type
.length
>>= 1;
63 for (j
= 0; j
< 4; ++j
) {
64 LLVMValueRef channel
[LP_MAX_VECTOR_LENGTH
] = { 0 };
66 assert(pixels_per_channel
<= LP_MAX_VECTOR_LENGTH
);
68 for (k
= 0; k
< pixels_per_channel
; ++k
) {
69 channel
[k
] = src_aos
[j
+ 4 * k
];
72 aos_channels
[j
] = lp_build_concat(gallivm
, channel
, aos_channel_type
, pixels_per_channel
);
75 lp_build_transpose_aos(gallivm
, soa_type
, aos_channels
, dst_soa
);
80 lp_build_format_swizzle_soa(const struct util_format_description
*format_desc
,
81 struct lp_build_context
*bld
,
82 const LLVMValueRef
*unswizzled
,
83 LLVMValueRef swizzled_out
[4])
85 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_ZS
) {
86 enum pipe_swizzle swizzle
;
87 LLVMValueRef depth_or_stencil
;
89 if (util_format_has_stencil(format_desc
) &&
90 !util_format_has_depth(format_desc
)) {
91 assert(!bld
->type
.floating
);
92 swizzle
= format_desc
->swizzle
[1];
95 assert(bld
->type
.floating
);
96 swizzle
= format_desc
->swizzle
[0];
99 * Return zzz1 or sss1 for depth-stencil formats here.
100 * Correct swizzling will be handled by apply_sampler_swizzle() later.
102 depth_or_stencil
= lp_build_swizzle_soa_channel(bld
, unswizzled
, swizzle
);
104 swizzled_out
[2] = swizzled_out
[1] = swizzled_out
[0] = depth_or_stencil
;
105 swizzled_out
[3] = bld
->one
;
109 for (chan
= 0; chan
< 4; ++chan
) {
110 enum pipe_swizzle swizzle
= format_desc
->swizzle
[chan
];
111 swizzled_out
[chan
] = lp_build_swizzle_soa_channel(bld
, unswizzled
, swizzle
);
119 lp_build_extract_soa_chan(struct lp_build_context
*bld
,
122 struct util_format_channel_description chan_desc
,
125 struct gallivm_state
*gallivm
= bld
->gallivm
;
126 LLVMBuilderRef builder
= gallivm
->builder
;
127 struct lp_type type
= bld
->type
;
128 LLVMValueRef input
= packed
;
129 const unsigned width
= chan_desc
.size
;
130 const unsigned start
= chan_desc
.shift
;
131 const unsigned stop
= start
+ width
;
133 /* Decode the input vector component */
135 switch(chan_desc
.type
) {
136 case UTIL_FORMAT_TYPE_VOID
:
140 case UTIL_FORMAT_TYPE_UNSIGNED
:
145 input
= LLVMBuildLShr(builder
, input
,
146 lp_build_const_int_vec(gallivm
, type
, start
), "");
152 if (stop
< blockbits
) {
153 unsigned mask
= ((unsigned long long)1 << width
) - 1;
154 input
= LLVMBuildAnd(builder
, input
,
155 lp_build_const_int_vec(gallivm
, type
, mask
), "");
163 struct lp_type conv_type
= lp_uint_type(type
);
164 input
= lp_build_srgb_to_linear(gallivm
, conv_type
, width
, input
);
167 if(chan_desc
.normalized
)
168 input
= lp_build_unsigned_norm_to_float(gallivm
, width
, type
, input
);
170 input
= LLVMBuildSIToFP(builder
, input
, bld
->vec_type
, "");
173 else if (chan_desc
.pure_integer
) {
181 case UTIL_FORMAT_TYPE_SIGNED
:
183 * Align the sign bit first.
185 if (stop
< type
.width
) {
186 unsigned bits
= type
.width
- stop
;
187 LLVMValueRef bits_val
= lp_build_const_int_vec(gallivm
, type
, bits
);
188 input
= LLVMBuildShl(builder
, input
, bits_val
, "");
192 * Align the LSB (with an arithmetic shift to preserve the sign)
194 if (chan_desc
.size
< type
.width
) {
195 unsigned bits
= type
.width
- chan_desc
.size
;
196 LLVMValueRef bits_val
= lp_build_const_int_vec(gallivm
, type
, bits
);
197 input
= LLVMBuildAShr(builder
, input
, bits_val
, "");
204 input
= LLVMBuildSIToFP(builder
, input
, bld
->vec_type
, "");
205 if (chan_desc
.normalized
) {
206 double scale
= 1.0 / ((1 << (chan_desc
.size
- 1)) - 1);
207 LLVMValueRef scale_val
= lp_build_const_vec(gallivm
, type
, scale
);
208 input
= LLVMBuildFMul(builder
, input
, scale_val
, "");
210 * The formula above will produce value below -1.0 for most negative
211 * value but everything seems happy with that hence disable for now.
214 input
= lp_build_max(bld
, input
,
215 lp_build_const_vec(gallivm
, type
, -1.0f
));
218 else if (chan_desc
.pure_integer
) {
226 case UTIL_FORMAT_TYPE_FLOAT
:
228 if (chan_desc
.size
== 16) {
229 struct lp_type f16i_type
= type
;
230 f16i_type
.width
/= 2;
231 f16i_type
.floating
= 0;
233 input
= LLVMBuildLShr(builder
, input
,
234 lp_build_const_int_vec(gallivm
, type
, start
), "");
236 input
= LLVMBuildTrunc(builder
, input
,
237 lp_build_vec_type(gallivm
, f16i_type
), "");
238 input
= lp_build_half_to_float(gallivm
, input
);
242 assert(type
.width
== 32);
244 input
= LLVMBuildBitCast(builder
, input
, bld
->vec_type
, "");
253 case UTIL_FORMAT_TYPE_FIXED
:
255 double scale
= 1.0 / ((1 << (chan_desc
.size
/2)) - 1);
256 LLVMValueRef scale_val
= lp_build_const_vec(gallivm
, type
, scale
);
257 input
= LLVMBuildSIToFP(builder
, input
, bld
->vec_type
, "");
258 input
= LLVMBuildFMul(builder
, input
, scale_val
, "");
278 * Unpack several pixels in SoA.
280 * It takes a vector of packed pixels:
282 * packed = {P0, P1, P2, P3, ..., Pn}
284 * And will produce four vectors:
286 * red = {R0, R1, R2, R3, ..., Rn}
287 * green = {G0, G1, G2, G3, ..., Gn}
288 * blue = {B0, B1, B2, B3, ..., Bn}
289 * alpha = {A0, A1, A2, A3, ..., An}
291 * It requires that a packed pixel fits into an element of the output
292 * channels. The common case is when converting pixel with a depth of 32 bit or
295 * \param format_desc the format of the 'packed' incoming pixel vector
296 * \param type the desired type for rgba_out (type.length = n, above)
297 * \param packed the incoming vector of packed pixels
298 * \param rgba_out returns the SoA R,G,B,A vectors
301 lp_build_unpack_rgba_soa(struct gallivm_state
*gallivm
,
302 const struct util_format_description
*format_desc
,
305 LLVMValueRef rgba_out
[4])
307 struct lp_build_context bld
;
308 LLVMValueRef inputs
[4];
311 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
);
312 assert(format_desc
->block
.width
== 1);
313 assert(format_desc
->block
.height
== 1);
314 assert(format_desc
->block
.bits
<= type
.width
);
315 /* FIXME: Support more output types */
316 assert(type
.width
== 32);
318 lp_build_context_init(&bld
, gallivm
, type
);
320 /* Decode the input vector components */
321 for (chan
= 0; chan
< format_desc
->nr_channels
; ++chan
) {
322 struct util_format_channel_description chan_desc
= format_desc
->channel
[chan
];
323 boolean srgb_chan
= FALSE
;
325 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_SRGB
&&
326 format_desc
->swizzle
[3] != chan
) {
330 inputs
[chan
] = lp_build_extract_soa_chan(&bld
,
331 format_desc
->block
.bits
,
337 lp_build_format_swizzle_soa(format_desc
, &bld
, inputs
, rgba_out
);
342 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
344 * \param dst_type The desired return type. For pure integer formats
345 * this should be a 32bit wide int or uint vector type,
346 * otherwise a float vector type.
348 * \param packed The packed rgba8 values to unpack.
350 * \param rgba The 4 SoA return vectors.
353 lp_build_rgba8_to_fi32_soa(struct gallivm_state
*gallivm
,
354 struct lp_type dst_type
,
358 LLVMBuilderRef builder
= gallivm
->builder
;
359 LLVMValueRef mask
= lp_build_const_int_vec(gallivm
, dst_type
, 0xff);
362 /* XXX technically shouldn't use that for uint dst_type */
363 packed
= LLVMBuildBitCast(builder
, packed
,
364 lp_build_int_vec_type(gallivm
, dst_type
), "");
366 /* Decode the input vector components */
367 for (chan
= 0; chan
< 4; ++chan
) {
368 #ifdef PIPE_ARCH_LITTLE_ENDIAN
369 unsigned start
= chan
*8;
371 unsigned start
= (3-chan
)*8;
373 unsigned stop
= start
+ 8;
379 input
= LLVMBuildLShr(builder
, input
,
380 lp_build_const_int_vec(gallivm
, dst_type
, start
), "");
383 input
= LLVMBuildAnd(builder
, input
, mask
, "");
385 if (dst_type
.floating
)
386 input
= lp_build_unsigned_norm_to_float(gallivm
, 8, dst_type
, input
);
395 * Fetch texels from a texture, returning them in SoA layout.
397 * \param type the desired return type for 'rgba'. The vector length
398 * is the number of texels to fetch
399 * \param aligned if the offset is guaranteed to be aligned to element width
401 * \param base_ptr points to the base of the texture mip tree.
402 * \param offset offset to start of the texture image block. For non-
403 * compressed formats, this simply is an offset to the texel.
404 * For compressed formats, it is an offset to the start of the
405 * compressed data block.
407 * \param i, j the sub-block pixel coordinates. For non-compressed formats
408 * these will always be (0,0). For compressed formats, i will
409 * be in [0, block_width-1] and j will be in [0, block_height-1].
410 * \param cache optional value pointing to a lp_build_format_cache structure
413 lp_build_fetch_rgba_soa(struct gallivm_state
*gallivm
,
414 const struct util_format_description
*format_desc
,
417 LLVMValueRef base_ptr
,
422 LLVMValueRef rgba_out
[4])
424 LLVMBuilderRef builder
= gallivm
->builder
;
425 enum pipe_format format
= format_desc
->format
;
426 struct lp_type fetch_type
;
428 if (format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
&&
429 (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_RGB
||
430 format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_SRGB
||
431 format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_ZS
) &&
432 format_desc
->block
.width
== 1 &&
433 format_desc
->block
.height
== 1 &&
434 format_desc
->block
.bits
<= type
.width
&&
435 (format_desc
->channel
[0].type
!= UTIL_FORMAT_TYPE_FLOAT
||
436 format_desc
->channel
[0].size
== 32 ||
437 format_desc
->channel
[0].size
== 16))
440 * The packed pixel fits into an element of the destination format. Put
441 * the packed pixels into a vector and extract each component for all
442 * vector elements in parallel.
448 * gather the texels from the texture
449 * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
451 assert(format_desc
->block
.bits
<= type
.width
);
452 fetch_type
= lp_type_uint(type
.width
);
453 packed
= lp_build_gather(gallivm
,
455 format_desc
->block
.bits
,
458 base_ptr
, offset
, FALSE
);
461 * convert texels to float rgba
463 lp_build_unpack_rgba_soa(gallivm
,
471 if (format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
&&
472 (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_RGB
) &&
473 format_desc
->block
.width
== 1 &&
474 format_desc
->block
.height
== 1 &&
475 format_desc
->block
.bits
> type
.width
&&
476 ((format_desc
->block
.bits
<= type
.width
* type
.length
&&
477 format_desc
->channel
[0].size
<= type
.width
) ||
478 (format_desc
->channel
[0].size
== 64 &&
479 format_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_FLOAT
&&
483 * Similar to above, but the packed pixel is larger than what fits
484 * into an element of the destination format. The packed pixels will be
485 * shuffled into SoA vectors appropriately, and then the extraction will
486 * be done in parallel as much as possible.
487 * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
488 * the gathered vectors can be shuffled easily (even with avx).
489 * 64xn float -> 32xn float is handled too but it's a bit special as
490 * it does the conversion pre-shuffle.
493 LLVMValueRef packed
[4], dst
[4], output
[4], shuffles
[LP_MAX_VECTOR_WIDTH
/32];
494 struct lp_type fetch_type
, gather_type
= type
;
495 unsigned num_gather
, fetch_width
, i
, j
;
496 struct lp_build_context bld
;
497 boolean fp64
= format_desc
->channel
[0].size
== 64;
499 lp_build_context_init(&bld
, gallivm
, type
);
501 assert(type
.width
== 32);
502 assert(format_desc
->block
.bits
> type
.width
);
505 * First, figure out fetch order.
507 fetch_width
= util_next_power_of_two(format_desc
->block
.bits
);
509 * fp64 are treated like fp32 except we fetch twice wide values
510 * (as we shuffle after trunc). The shuffles for that work out
511 * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
512 * albeit we miss the potential opportunity for hw gather (as it
513 * only handles native size).
515 num_gather
= fetch_width
/ type
.width
;
516 gather_type
.width
*= num_gather
;
520 gather_type
.length
/= num_gather
;
522 for (i
= 0; i
< num_gather
; i
++) {
523 LLVMValueRef offsetr
, shuf_vec
;
524 if(num_gather
== 4) {
525 for (j
= 0; j
< gather_type
.length
; j
++) {
526 unsigned idx
= i
+ 4*j
;
527 shuffles
[j
] = lp_build_const_int32(gallivm
, idx
);
529 shuf_vec
= LLVMConstVector(shuffles
, gather_type
.length
);
530 offsetr
= LLVMBuildShuffleVector(builder
, offset
, offset
, shuf_vec
, "");
533 else if (num_gather
== 2) {
534 assert(num_gather
== 2);
535 for (j
= 0; j
< gather_type
.length
; j
++) {
536 unsigned idx
= i
*2 + (j
%2) + (j
/2)*4;
537 shuffles
[j
] = lp_build_const_int32(gallivm
, idx
);
539 shuf_vec
= LLVMConstVector(shuffles
, gather_type
.length
);
540 offsetr
= LLVMBuildShuffleVector(builder
, offset
, offset
, shuf_vec
, "");
543 assert(num_gather
== 1);
546 if (gather_type
.length
== 1) {
547 LLVMValueRef zero
= lp_build_const_int32(gallivm
, 0);
548 offsetr
= LLVMBuildExtractElement(builder
, offsetr
, zero
, "");
552 * Determine whether to use float or int loads. This is mostly
553 * to outsmart the (stupid) llvm int/float shuffle logic, we
554 * don't really care much if the data is floats or ints...
555 * But llvm will refuse to use single float shuffle with int data
556 * and instead use 3 int shuffles instead, the code looks atrocious.
557 * (Note bitcasts often won't help, as llvm is too smart to be
559 * Nobody cares about simd float<->int domain transition penalties,
560 * which usually don't even exist for shuffles anyway.
561 * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
562 * going into transpose, which is unpacks, so doesn't really matter
564 * With 2x32bit or 4x16bit fetch, we use float vec, since those
565 * go into the weird channel separation shuffle. With floats,
566 * this is (with 128bit vectors):
567 * - 2 movq, 2 movhpd, 2 shufps
568 * With ints it would be:
569 * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
570 * I've seen texture functions increase in code size by 15% just due
571 * to that (there's lots of such fetches in them...)
572 * (We could choose a different gather order to improve this somewhat
573 * for the int path, but it would basically just drop the blends,
574 * so the float path with this order really is optimal.)
575 * Albeit it is tricky sometimes llvm doesn't ignore the float->int
576 * casts so must avoid them until we're done with the float shuffle...
577 * 3x16bit formats (the same is also true for 3x8) are pretty bad but
578 * there's nothing we can do about them (we could overallocate by
579 * those couple bytes and use unaligned but pot sized load).
580 * Note that this is very much x86 specific. I don't know if this
581 * affects other archs at all.
583 if (num_gather
> 1) {
585 * We always want some float type here (with x86)
586 * due to shuffles being float ones afterwards (albeit for
587 * the num_gather == 4 case int should work fine too
588 * (unless there's some problems with avx but not avx2).
590 if (format_desc
->channel
[0].size
== 64) {
591 fetch_type
= lp_type_float_vec(64, gather_type
.width
);
593 fetch_type
= lp_type_int_vec(32, gather_type
.width
);
597 /* type doesn't matter much */
598 if (format_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_FLOAT
&&
599 (format_desc
->channel
[0].size
== 32 ||
600 format_desc
->channel
[0].size
== 64)) {
601 fetch_type
= lp_type_float(gather_type
.width
);
603 fetch_type
= lp_type_uint(gather_type
.width
);
607 /* Now finally gather the values */
608 packed
[i
] = lp_build_gather(gallivm
, gather_type
.length
,
609 format_desc
->block
.bits
,
611 base_ptr
, offsetr
, FALSE
);
613 struct lp_type conv_type
= type
;
614 conv_type
.width
*= 2;
615 packed
[i
] = LLVMBuildBitCast(builder
, packed
[i
],
616 lp_build_vec_type(gallivm
, conv_type
), "");
617 packed
[i
] = LLVMBuildFPTrunc(builder
, packed
[i
], bld
.vec_type
, "");
621 /* shuffle the gathered values to SoA */
622 if (num_gather
== 2) {
623 for (i
= 0; i
< num_gather
; i
++) {
624 for (j
= 0; j
< type
.length
; j
++) {
625 unsigned idx
= (j
%2)*2 + (j
/4)*4 + i
;
628 shuffles
[j
] = lp_build_const_int32(gallivm
, idx
);
630 dst
[i
] = LLVMBuildShuffleVector(builder
, packed
[0], packed
[1],
631 LLVMConstVector(shuffles
, type
.length
), "");
634 else if (num_gather
== 4) {
635 lp_build_transpose_aos(gallivm
, lp_int_type(type
), packed
, dst
);
638 assert(num_gather
== 1);
643 * And finally unpack exactly as above, except that
644 * chan shift is adjusted and the right vector selected.
647 for (i
= 0; i
< num_gather
; i
++) {
648 dst
[i
] = LLVMBuildBitCast(builder
, dst
[i
], bld
.int_vec_type
, "");
650 for (i
= 0; i
< format_desc
->nr_channels
; i
++) {
651 struct util_format_channel_description chan_desc
= format_desc
->channel
[i
];
652 unsigned blockbits
= type
.width
;
655 #ifdef PIPE_ARCH_BIG_ENDIAN
656 vec_nr
= (format_desc
->block
.bits
- (chan_desc
.shift
+ chan_desc
.size
)) / type
.width
;
658 vec_nr
= chan_desc
.shift
/ type
.width
;
660 chan_desc
.shift
%= type
.width
;
662 output
[i
] = lp_build_extract_soa_chan(&bld
,
670 for (i
= 0; i
< format_desc
->nr_channels
; i
++) {
675 lp_build_format_swizzle_soa(format_desc
, &bld
, output
, rgba_out
);
679 if (format
== PIPE_FORMAT_R11G11B10_FLOAT
||
680 format
== PIPE_FORMAT_R9G9B9E5_FLOAT
) {
682 * similar conceptually to above but requiring special
683 * AoS packed -> SoA float conversion code.
686 struct lp_type fetch_type
= lp_type_uint(type
.width
);
688 assert(type
.floating
);
689 assert(type
.width
== 32);
691 packed
= lp_build_gather(gallivm
, type
.length
,
692 format_desc
->block
.bits
,
694 base_ptr
, offset
, FALSE
);
695 if (format
== PIPE_FORMAT_R11G11B10_FLOAT
) {
696 lp_build_r11g11b10_to_float(gallivm
, packed
, rgba_out
);
699 lp_build_rgb9e5_to_float(gallivm
, packed
, rgba_out
);
704 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_ZS
&&
705 format_desc
->block
.bits
== 64) {
707 * special case the format is 64 bits but we only require
708 * 32bit (or 8bit) from each block.
711 struct lp_type fetch_type
= lp_type_uint(type
.width
);
713 if (format
== PIPE_FORMAT_X32_S8X24_UINT
) {
715 * for stencil simply fix up offsets - could in fact change
716 * base_ptr instead even outside the shader.
718 unsigned mask
= (1 << 8) - 1;
719 LLVMValueRef s_offset
= lp_build_const_int_vec(gallivm
, type
, 4);
720 offset
= LLVMBuildAdd(builder
, offset
, s_offset
, "");
721 packed
= lp_build_gather(gallivm
, type
.length
, 32, fetch_type
,
722 aligned
, base_ptr
, offset
, FALSE
);
723 packed
= LLVMBuildAnd(builder
, packed
,
724 lp_build_const_int_vec(gallivm
, type
, mask
), "");
727 assert (format
== PIPE_FORMAT_Z32_FLOAT_S8X24_UINT
);
728 packed
= lp_build_gather(gallivm
, type
.length
, 32, fetch_type
,
729 aligned
, base_ptr
, offset
, TRUE
);
730 packed
= LLVMBuildBitCast(builder
, packed
,
731 lp_build_vec_type(gallivm
, type
), "");
733 /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
734 rgba_out
[0] = rgba_out
[1] = rgba_out
[2] = packed
;
735 rgba_out
[3] = lp_build_const_vec(gallivm
, type
, 1.0f
);
740 * Try calling lp_build_fetch_rgba_aos for all pixels.
741 * Should only really hit subsampled, compressed
742 * (for s3tc srgb too, for rgtc the unorm ones only) by now.
743 * (This is invalid for plain 8unorm formats because we're lazy with
744 * the swizzle since some results would arrive swizzled, some not.)
747 if ((format_desc
->layout
!= UTIL_FORMAT_LAYOUT_PLAIN
) &&
748 (util_format_fits_8unorm(format_desc
) ||
749 format_desc
->layout
== UTIL_FORMAT_LAYOUT_S3TC
) &&
750 type
.floating
&& type
.width
== 32 &&
751 (type
.length
== 1 || (type
.length
% 4 == 0))) {
752 struct lp_type tmp_type
;
753 struct lp_build_context bld
;
754 LLVMValueRef packed
, rgba
[4];
755 const struct util_format_description
*flinear_desc
;
756 const struct util_format_description
*frgba8_desc
;
759 lp_build_context_init(&bld
, gallivm
, type
);
762 * Make sure the conversion in aos really only does convert to rgba8
763 * and not anything more (so use linear format, adjust type).
765 flinear_desc
= util_format_description(util_format_linear(format
));
766 memset(&tmp_type
, 0, sizeof tmp_type
);
768 tmp_type
.length
= type
.length
* 4;
769 tmp_type
.norm
= TRUE
;
771 packed
= lp_build_fetch_rgba_aos(gallivm
, flinear_desc
, tmp_type
,
772 aligned
, base_ptr
, offset
, i
, j
, cache
);
773 packed
= LLVMBuildBitCast(builder
, packed
, bld
.int_vec_type
, "");
776 * The values are now packed so they match ordinary (srgb) RGBA8 format,
777 * hence need to use matching format for unpack.
779 frgba8_desc
= util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM
);
780 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_SRGB
) {
781 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_S3TC
);
782 frgba8_desc
= util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB
);
784 lp_build_unpack_rgba_soa(gallivm
,
790 * We converted 4 channels. Make sure llvm can drop unneeded ones
791 * (luckily the rgba order is fixed, only LA needs special case).
793 for (chan
= 0; chan
< 4; chan
++) {
794 enum pipe_swizzle swizzle
= format_desc
->swizzle
[chan
];
795 if (chan
== 3 && util_format_is_luminance_alpha(format
)) {
796 swizzle
= PIPE_SWIZZLE_W
;
798 rgba_out
[chan
] = lp_build_swizzle_soa_channel(&bld
, rgba
, swizzle
);
805 * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
807 * This is not the most efficient way of fetching pixels, as we
808 * miss some opportunities to do vectorization, but this is
809 * convenient for formats or scenarios for which there was no
810 * opportunity or incentive to optimize.
812 * We do NOT want to end up here, this typically is quite terrible,
813 * in particular if the formats have less than 4 channels.
815 * Right now, this should only be hit for:
816 * - RGTC snorm formats
817 * (those miss fast fetch functions hence they are terrible anyway)
822 struct lp_type tmp_type
;
823 LLVMValueRef aos_fetch
[LP_MAX_VECTOR_WIDTH
/ 32];
825 if (gallivm_debug
& GALLIVM_DEBUG_PERF
) {
826 debug_printf("%s: AoS fetch fallback for %s\n",
827 __FUNCTION__
, format_desc
->short_name
);
834 * Note that vector transpose can be worse compared to insert/extract
835 * for aos->soa conversion (for formats with 1 or 2 channels). However,
836 * we should try to avoid getting here for just about all formats, so
840 /* loop over number of pixels */
841 for(k
= 0; k
< type
.length
; ++k
) {
842 LLVMValueRef index
= lp_build_const_int32(gallivm
, k
);
843 LLVMValueRef offset_elem
;
844 LLVMValueRef i_elem
, j_elem
;
846 offset_elem
= LLVMBuildExtractElement(builder
, offset
,
849 i_elem
= LLVMBuildExtractElement(builder
, i
, index
, "");
850 j_elem
= LLVMBuildExtractElement(builder
, j
, index
, "");
852 /* Get a single float[4]={R,G,B,A} pixel */
853 aos_fetch
[k
] = lp_build_fetch_rgba_aos(gallivm
, format_desc
, tmp_type
,
854 aligned
, base_ptr
, offset_elem
,
855 i_elem
, j_elem
, cache
);
858 convert_to_soa(gallivm
, aos_fetch
, rgba_out
, type
);