1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
29 #include "pipe/p_defines.h"
31 #include "util/format/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34 #include "util/u_math.h"
36 #include "lp_bld_type.h"
37 #include "lp_bld_const.h"
38 #include "lp_bld_conv.h"
39 #include "lp_bld_swizzle.h"
40 #include "lp_bld_gather.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_format.h"
43 #include "lp_bld_arit.h"
44 #include "lp_bld_pack.h"
45 #include "lp_bld_flow.h"
46 #include "lp_bld_printf.h"
47 #include "lp_bld_intr.h"
/*
 * convert_to_soa: turn the per-pixel AoS values in src_aos into the four
 * per-channel vectors written to dst_soa.
 *
 * gallivm   code generation state, forwarded to the helpers called below.
 * src_aos   input values, read at index j + 4 * k.
 * dst_soa   four output vectors, filled by lp_build_transpose_aos().
 * soa_type  type of the output vectors; soa_type.length is asserted to be a
 *           multiple of 4.
 *
 * NOTE(review): this listing lacks some lines of the original definition
 * (no return type, no brace-only lines, no declarations of j and k are
 * visible), so it is documented as-is rather than rewritten.
 */
50 convert_to_soa(struct gallivm_state
*gallivm
,
51 LLVMValueRef src_aos
[LP_MAX_VECTOR_WIDTH
/ 32],
52 LLVMValueRef dst_soa
[4],
53 const struct lp_type soa_type
)
56 struct lp_type aos_channel_type
= soa_type
;
58 LLVMValueRef aos_channels
[4];
/* Each of the four intermediate vectors is built from length / 4 inputs. */
59 unsigned pixels_per_channel
= soa_type
.length
/ 4;
61 debug_assert((soa_type
.length
% 4) == 0);
/* The concatenation below is done with half of soa_type's length. */
63 aos_channel_type
.length
>>= 1;
65 for (j
= 0; j
< 4; ++j
) {
66 LLVMValueRef channel
[LP_MAX_VECTOR_LENGTH
] = { 0 };
68 assert(pixels_per_channel
<= LP_MAX_VECTOR_LENGTH
);
/* Collect every fourth input starting at j, then concatenate them. */
70 for (k
= 0; k
< pixels_per_channel
; ++k
) {
71 channel
[k
] = src_aos
[j
+ 4 * k
];
74 aos_channels
[j
] = lp_build_concat(gallivm
, channel
, aos_channel_type
, pixels_per_channel
);
/* The final transpose of the four concatenated vectors produces dst_soa. */
77 lp_build_transpose_aos(gallivm
, soa_type
, aos_channels
, dst_soa
);
/*
 * lp_build_format_swizzle_soa: apply the format's channel swizzle to a set of
 * unswizzled SoA channel vectors.
 *
 * format_desc   format description; its colorspace and swizzle[] are read.
 * bld           build context; bld->type.floating is asserted on and
 *               bld->one is used for the fourth output of ZS formats.
 * unswizzled    input channel vectors, passed to
 *               lp_build_swizzle_soa_channel().
 * swizzled_out  the four output vectors.
 *
 * For UTIL_FORMAT_COLORSPACE_ZS a single depth-or-stencil value is replicated
 * into outputs 0..2 and output 3 is bld->one.  The loop at the end applies
 * format_desc->swizzle[chan] to each of the four channels.
 *
 * NOTE(review): brace-only and `else` lines are not present in this listing;
 * the swizzle[1] / swizzle[0] assignments are presumably the two arms of the
 * stencil-without-depth test, and the channel loop presumably the non-ZS
 * arm -- confirm against the real file.
 */
82 lp_build_format_swizzle_soa(const struct util_format_description
*format_desc
,
83 struct lp_build_context
*bld
,
84 const LLVMValueRef
*unswizzled
,
85 LLVMValueRef swizzled_out
[4])
87 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_ZS
) {
88 enum pipe_swizzle swizzle
;
89 LLVMValueRef depth_or_stencil
;
/* Stencil-only formats: integer build type, source channel is swizzle[1]. */
91 if (util_format_has_stencil(format_desc
) &&
92 !util_format_has_depth(format_desc
)) {
93 assert(!bld
->type
.floating
);
94 swizzle
= format_desc
->swizzle
[1];
/* Other case: floating build type, source channel is swizzle[0]. */
97 assert(bld
->type
.floating
);
98 swizzle
= format_desc
->swizzle
[0];
101 * Return zzz1 or sss1 for depth-stencil formats here.
102 * Correct swizzling will be handled by apply_sampler_swizzle() later.
104 depth_or_stencil
= lp_build_swizzle_soa_channel(bld
, unswizzled
, swizzle
);
106 swizzled_out
[2] = swizzled_out
[1] = swizzled_out
[0] = depth_or_stencil
;
107 swizzled_out
[3] = bld
->one
;
/* Per-channel swizzle using the format's own swizzle table. */
111 for (chan
= 0; chan
< 4; ++chan
) {
112 enum pipe_swizzle swizzle
= format_desc
->swizzle
[chan
];
113 swizzled_out
[chan
] = lp_build_swizzle_soa_channel(bld
, unswizzled
, swizzle
);
/*
 * lp_build_extract_soa_chan: decode one channel out of a vector of packed
 * pixels, dispatching on chan_desc.type.
 *
 * bld        build context (gallivm state, builder, type and vec_type are
 *            taken from it).
 * chan_desc  channel description; size, shift, type, normalized and
 *            pure_integer are read.
 *
 * The body also reads `blockbits`, `srgb_chan`-related conversion code and
 * `packed`; their parameter lines are not present in this listing.
 *
 * Visible handling per channel type:
 *   UNSIGNED  logical shift right by the channel shift, mask to the channel
 *             width when the channel does not end at blockbits, then either
 *             lp_build_srgb_to_linear(), lp_build_unsigned_norm_to_float()
 *             or LLVMBuildUIToFP().
 *   SIGNED    shift left to align the sign bit, arithmetic shift right to
 *             align the LSB, then LLVMBuildSIToFP(); normalized channels are
 *             scaled and clamped from below at -1.0.
 *   FLOAT     16-bit channels are shifted, truncated and passed to
 *             lp_build_half_to_float(); otherwise type.width == 32 is
 *             asserted and the value is bitcast to bld->vec_type.
 *   FIXED     LLVMBuildSIToFP() followed by a multiply with a constant scale.
 *
 * NOTE(review): `break`, `else`, `return` and brace-only lines are missing
 * from this listing, so the exact nesting of the branches below cannot be
 * read off it -- confirm against the real file.
 */
121 lp_build_extract_soa_chan(struct lp_build_context
*bld
,
124 struct util_format_channel_description chan_desc
,
127 struct gallivm_state
*gallivm
= bld
->gallivm
;
128 LLVMBuilderRef builder
= gallivm
->builder
;
129 struct lp_type type
= bld
->type
;
130 LLVMValueRef input
= packed
;
/* The channel occupies bits [start, stop) of the packed element. */
131 const unsigned width
= chan_desc
.size
;
132 const unsigned start
= chan_desc
.shift
;
133 const unsigned stop
= start
+ width
;
135 /* Decode the input vector component */
137 switch(chan_desc
.type
) {
138 case UTIL_FORMAT_TYPE_VOID
:
142 case UTIL_FORMAT_TYPE_UNSIGNED
:
147 input
= LLVMBuildLShr(builder
, input
,
148 lp_build_const_int_vec(gallivm
, type
, start
), "");
/* Masking is only done when the channel does not end at blockbits. */
154 if (stop
< blockbits
) {
/* The shift is done in unsigned long long, then narrowed to unsigned. */
155 unsigned mask
= ((unsigned long long)1 << width
) - 1;
156 input
= LLVMBuildAnd(builder
, input
,
157 lp_build_const_int_vec(gallivm
, type
, mask
), "");
165 struct lp_type conv_type
= lp_uint_type(type
);
166 input
= lp_build_srgb_to_linear(gallivm
, conv_type
, width
, input
);
169 if(chan_desc
.normalized
)
170 input
= lp_build_unsigned_norm_to_float(gallivm
, width
, type
, input
);
172 input
= LLVMBuildUIToFP(builder
, input
, bld
->vec_type
, "");
175 else if (chan_desc
.pure_integer
) {
183 case UTIL_FORMAT_TYPE_SIGNED
:
185 * Align the sign bit first.
187 if (stop
< type
.width
) {
188 unsigned bits
= type
.width
- stop
;
189 LLVMValueRef bits_val
= lp_build_const_int_vec(gallivm
, type
, bits
);
190 input
= LLVMBuildShl(builder
, input
, bits_val
, "");
194 * Align the LSB (with an arithmetic shift to preserve the sign)
196 if (chan_desc
.size
< type
.width
) {
197 unsigned bits
= type
.width
- chan_desc
.size
;
198 LLVMValueRef bits_val
= lp_build_const_int_vec(gallivm
, type
, bits
);
199 input
= LLVMBuildAShr(builder
, input
, bits_val
, "");
206 input
= LLVMBuildSIToFP(builder
, input
, bld
->vec_type
, "");
207 if (chan_desc
.normalized
) {
/*
 * NOTE(review): `1 << (chan_desc.size - 1)` is evaluated in int; for a
 * 32-bit channel that would be `1 << 31`, which overflows a signed int.
 * Confirm whether 32-bit normalized signed channels can reach this line.
 */
208 double scale
= 1.0 / ((1 << (chan_desc
.size
- 1)) - 1);
209 LLVMValueRef scale_val
= lp_build_const_vec(gallivm
, type
, scale
);
210 input
= LLVMBuildFMul(builder
, input
, scale_val
, "");
212 * The formula above will produce value below -1.0 for most negative
213 * value but everything seems happy with that hence disable for now.
216 input
= lp_build_max(bld
, input
,
217 lp_build_const_vec(gallivm
, type
, -1.0f
));
220 else if (chan_desc
.pure_integer
) {
228 case UTIL_FORMAT_TYPE_FLOAT
:
230 if (chan_desc
.size
== 16) {
/* Integer type with half the element width, used as the truncation target. */
231 struct lp_type f16i_type
= type
;
232 f16i_type
.width
/= 2;
233 f16i_type
.floating
= 0;
235 input
= LLVMBuildLShr(builder
, input
,
236 lp_build_const_int_vec(gallivm
, type
, start
), "");
238 input
= LLVMBuildTrunc(builder
, input
,
239 lp_build_vec_type(gallivm
, f16i_type
), "");
240 input
= lp_build_half_to_float(gallivm
, input
);
244 assert(type
.width
== 32);
246 input
= LLVMBuildBitCast(builder
, input
, bld
->vec_type
, "");
255 case UTIL_FORMAT_TYPE_FIXED
:
/* Scale factor is 1 / (2^(size / 2) - 1), applied after the int->float cast. */
257 double scale
= 1.0 / ((1 << (chan_desc
.size
/2)) - 1);
258 LLVMValueRef scale_val
= lp_build_const_vec(gallivm
, type
, scale
);
259 input
= LLVMBuildSIToFP(builder
, input
, bld
->vec_type
, "");
260 input
= LLVMBuildFMul(builder
, input
, scale_val
, "");
280 * Unpack several pixels in SoA.
282 * It takes a vector of packed pixels:
284 * packed = {P0, P1, P2, P3, ..., Pn}
286 * And will produce four vectors:
288 * red = {R0, R1, R2, R3, ..., Rn}
289 * green = {G0, G1, G2, G3, ..., Gn}
290 * blue = {B0, B1, B2, B3, ..., Bn}
291 * alpha = {A0, A1, A2, A3, ..., An}
293 * It requires that a packed pixel fits into an element of the output
294 * channels. The common case is when converting pixel with a depth of 32 bit or
297 * \param format_desc the format of the 'packed' incoming pixel vector
298 * \param type the desired type for rgba_out (type.length = n, above)
299 * \param packed the incoming vector of packed pixels
300 * \param rgba_out returns the SoA R,G,B,A vectors
303 lp_build_unpack_rgba_soa(struct gallivm_state
*gallivm
,
304 const struct util_format_description
*format_desc
,
307 LLVMValueRef rgba_out
[4])
309 struct lp_build_context bld
;
310 LLVMValueRef inputs
[4];
313 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
);
314 assert(format_desc
->block
.width
== 1);
315 assert(format_desc
->block
.height
== 1);
316 assert(format_desc
->block
.bits
<= type
.width
);
317 /* FIXME: Support more output types */
318 assert(type
.width
== 32);
320 lp_build_context_init(&bld
, gallivm
, type
);
322 /* Decode the input vector components */
323 for (chan
= 0; chan
< format_desc
->nr_channels
; ++chan
) {
324 struct util_format_channel_description chan_desc
= format_desc
->channel
[chan
];
325 boolean srgb_chan
= FALSE
;
327 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_SRGB
&&
328 format_desc
->swizzle
[3] != chan
) {
332 inputs
[chan
] = lp_build_extract_soa_chan(&bld
,
333 format_desc
->block
.bits
,
339 lp_build_format_swizzle_soa(format_desc
, &bld
, inputs
, rgba_out
);
344 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
346 * \param dst_type The desired return type. For pure integer formats
347 * this should be a 32bit wide int or uint vector type,
348 * otherwise a float vector type.
350 * \param packed The rgba8 values to pack.
352 * \param rgba The 4 SoA return vectors.
355 lp_build_rgba8_to_fi32_soa(struct gallivm_state
*gallivm
,
356 struct lp_type dst_type
,
360 LLVMBuilderRef builder
= gallivm
->builder
;
361 LLVMValueRef mask
= lp_build_const_int_vec(gallivm
, dst_type
, 0xff);
364 /* XXX technically shouldn't use that for uint dst_type */
365 packed
= LLVMBuildBitCast(builder
, packed
,
366 lp_build_int_vec_type(gallivm
, dst_type
), "");
368 /* Decode the input vector components */
369 for (chan
= 0; chan
< 4; ++chan
) {
370 #if UTIL_ARCH_LITTLE_ENDIAN
371 unsigned start
= chan
*8;
373 unsigned start
= (3-chan
)*8;
375 unsigned stop
= start
+ 8;
381 input
= LLVMBuildLShr(builder
, input
,
382 lp_build_const_int_vec(gallivm
, dst_type
, start
), "");
385 input
= LLVMBuildAnd(builder
, input
, mask
, "");
387 if (dst_type
.floating
)
388 input
= lp_build_unsigned_norm_to_float(gallivm
, 8, dst_type
, input
);
280 * Unpack several pixels in SoA.
282 * It takes a vector of packed pixels:
284 * packed = {P0, P1, P2, P3, ..., Pn}
286 * And will produce four vectors:
288 * red = {R0, R1, R2, R3, ..., Rn}
289 * green = {G0, G1, G2, G3, ..., Gn}
290 * blue = {B0, B1, B2, B3, ..., Bn}
291 * alpha = {A0, A1, A2, A3, ..., An}
293 * It requires that a packed pixel fits into an element of the output
294 * channels. The common case is when converting pixel with a depth of 32 bit or
297 * \param format_desc the format of the 'packed' incoming pixel vector
298 * \param type the desired type for rgba_out (type.length = n, above)
299 * \param packed the incoming vector of packed pixels
300 * \param rgba_out returns the SoA R,G,B,A vectors
303 lp_build_unpack_rgba_soa(struct gallivm_state
*gallivm
,
304 const struct util_format_description
*format_desc
,
307 LLVMValueRef rgba_out
[4])
309 struct lp_build_context bld
;
310 LLVMValueRef inputs
[4];
313 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
);
314 assert(format_desc
->block
.width
== 1);
315 assert(format_desc
->block
.height
== 1);
316 assert(format_desc
->block
.bits
<= type
.width
);
317 /* FIXME: Support more output types */
318 assert(type
.width
== 32);
320 lp_build_context_init(&bld
, gallivm
, type
);
322 /* Decode the input vector components */
323 for (chan
= 0; chan
< format_desc
->nr_channels
; ++chan
) {
324 struct util_format_channel_description chan_desc
= format_desc
->channel
[chan
];
325 boolean srgb_chan
= FALSE
;
327 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_SRGB
&&
328 format_desc
->swizzle
[3] != chan
) {
332 inputs
[chan
] = lp_build_extract_soa_chan(&bld
,
333 format_desc
->block
.bits
,
339 lp_build_format_swizzle_soa(format_desc
, &bld
, inputs
, rgba_out
);
344 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
346 * \param dst_type The desired return type. For pure integer formats
347 * this should be a 32bit wide int or uint vector type,
348 * otherwise a float vector type.
350 * \param packed The rgba8 values to pack.
352 * \param rgba The 4 SoA return vectors.
355 lp_build_rgba8_to_fi32_soa(struct gallivm_state
*gallivm
,
356 struct lp_type dst_type
,
360 LLVMBuilderRef builder
= gallivm
->builder
;
361 LLVMValueRef mask
= lp_build_const_int_vec(gallivm
, dst_type
, 0xff);
364 /* XXX technically shouldn't use that for uint dst_type */
365 packed
= LLVMBuildBitCast(builder
, packed
,
366 lp_build_int_vec_type(gallivm
, dst_type
), "");
368 /* Decode the input vector components */
369 for (chan
= 0; chan
< 4; ++chan
) {
370 #if UTIL_ARCH_LITTLE_ENDIAN
371 unsigned start
= chan
*8;
373 unsigned start
= (3-chan
)*8;
375 unsigned stop
= start
+ 8;
381 input
= LLVMBuildLShr(builder
, input
,
382 lp_build_const_int_vec(gallivm
, dst_type
, start
), "");
385 input
= LLVMBuildAnd(builder
, input
, mask
, "");
387 if (dst_type
.floating
)
388 input
= lp_build_unsigned_norm_to_float(gallivm
, 8, dst_type
, input
);
397 * Fetch a texels from a texture, returning them in SoA layout.
399 * \param type the desired return type for 'rgba'. The vector length
400 * is the number of texels to fetch
401 * \param aligned if the offset is guaranteed to be aligned to element width
403 * \param base_ptr points to the base of the texture mip tree.
404 * \param offset offset to start of the texture image block. For non-
405 * compressed formats, this simply is an offset to the texel.
406 * For compressed formats, it is an offset to the start of the
407 * compressed data block.
409 * \param i, j the sub-block pixel coordinates. For non-compressed formats
410 * these will always be (0,0). For compressed formats, i will
411 * be in [0, block_width-1] and j will be in [0, block_height-1].
412 * \param cache optional value pointing to a lp_build_format_cache structure
/*
 * lp_build_fetch_rgba_soa: fetch texels from memory and return them as four
 * SoA vectors in rgba_out, choosing one of several strategies by format.
 *
 * The body also uses `type`, `aligned`, `offset`, `i`, `j` and `cache`; the
 * parameter lines declaring them are not present in this listing.
 *
 * Strategies visible below, in order:
 *   1. Plain RGB/SRGB/ZS formats with 1x1 blocks whose block.bits fit in
 *      type.width: one lp_build_gather(), then lp_build_unpack_rgba_soa().
 *   2. Plain RGB formats with 1x1 blocks wider than type.width: several
 *      gathers with shuffled offsets, shuffle/transpose into per-dword
 *      vectors, then lp_build_extract_soa_chan() per channel and
 *      lp_build_format_swizzle_soa().
 *   3. PIPE_FORMAT_R11G11B10_FLOAT / PIPE_FORMAT_R9G9B9E5_FLOAT: one gather
 *      followed by the dedicated float conversion helper.
 *   4. 64-bit ZS formats: a 32-bit gather of either the stencil or the depth
 *      part, replicated into rgba_out[0..2] with 1.0 in rgba_out[3].
 *   5. Non-plain formats that fit 8unorm, or RGTC/S3TC layouts, with a 32-bit
 *      float type: lp_build_fetch_rgba_aos() for all pixels, then
 *      lp_build_unpack_rgba_soa() and a per-channel swizzle.
 *   6. Fallback: lp_build_fetch_rgba_aos() once per pixel, then
 *      convert_to_soa().
 *
 * NOTE(review): `else`, `return`, brace-only and several argument lines are
 * missing from this listing, so how the strategies are chained (early return
 * versus else-if) cannot be read off it -- confirm against the real file.
 */
415 lp_build_fetch_rgba_soa(struct gallivm_state
*gallivm
,
416 const struct util_format_description
*format_desc
,
419 LLVMValueRef base_ptr
,
424 LLVMValueRef rgba_out
[4])
426 LLVMBuilderRef builder
= gallivm
->builder
;
427 enum pipe_format format
= format_desc
->format
;
428 struct lp_type fetch_type
;
/* Strategy 1: the packed pixel fits into one element of `type`. */
430 if (format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
&&
431 (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_RGB
||
432 format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_SRGB
||
433 format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_ZS
) &&
434 format_desc
->block
.width
== 1 &&
435 format_desc
->block
.height
== 1 &&
436 format_desc
->block
.bits
<= type
.width
&&
437 (format_desc
->channel
[0].type
!= UTIL_FORMAT_TYPE_FLOAT
||
438 format_desc
->channel
[0].size
== 32 ||
439 format_desc
->channel
[0].size
== 16))
442 * The packed pixel fits into an element of the destination format. Put
443 * the packed pixels into a vector and extract each component for all
444 * vector elements in parallel.
450 * gather the texels from the texture
451 * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
453 assert(format_desc
->block
.bits
<= type
.width
);
454 fetch_type
= lp_type_uint(type
.width
);
455 packed
= lp_build_gather(gallivm
,
457 format_desc
->block
.bits
,
460 base_ptr
, offset
, FALSE
);
463 * convert texels to float rgba
465 lp_build_unpack_rgba_soa(gallivm
,
/* Strategy 2: the packed pixel is wider than one element of `type`. */
473 if (format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
&&
474 (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_RGB
) &&
475 format_desc
->block
.width
== 1 &&
476 format_desc
->block
.height
== 1 &&
477 format_desc
->block
.bits
> type
.width
&&
478 ((format_desc
->block
.bits
<= type
.width
* type
.length
&&
479 format_desc
->channel
[0].size
<= type
.width
) ||
480 (format_desc
->channel
[0].size
== 64 &&
481 format_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_FLOAT
&&
485 * Similar to above, but the packed pixel is larger than what fits
486 * into an element of the destination format. The packed pixels will be
487 * shuffled into SoA vectors appropriately, and then the extraction will
488 * be done in parallel as much as possible.
489 * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
490 * the gathered vectors can be shuffled easily (even with avx).
491 * 64xn float -> 32xn float is handled too but it's a bit special as
492 * it does the conversion pre-shuffle.
495 LLVMValueRef packed
[4], dst
[4], output
[4], shuffles
[LP_MAX_VECTOR_WIDTH
/32];
496 struct lp_type fetch_type
, gather_type
= type
;
497 unsigned num_gather
, fetch_width
, i
, j
;
498 struct lp_build_context bld
;
499 boolean fp64
= format_desc
->channel
[0].size
== 64;
501 lp_build_context_init(&bld
, gallivm
, type
);
503 assert(type
.width
== 32);
504 assert(format_desc
->block
.bits
> type
.width
);
507 * First, figure out fetch order.
509 fetch_width
= util_next_power_of_two(format_desc
->block
.bits
);
511 * fp64 are treated like fp32 except we fetch twice wide values
512 * (as we shuffle after trunc). The shuffles for that work out
513 * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
514 * albeit we miss the potential opportunity for hw gather (as it
515 * only handles native size).
/* Each gather fetches num_gather dwords for length / num_gather pixels. */
517 num_gather
= fetch_width
/ type
.width
;
518 gather_type
.width
*= num_gather
;
522 gather_type
.length
/= num_gather
;
524 for (i
= 0; i
< num_gather
; i
++) {
525 LLVMValueRef offsetr
, shuf_vec
;
/* Four gathers: gather i takes offsets i, i + 4, i + 8, ... */
526 if(num_gather
== 4) {
527 for (j
= 0; j
< gather_type
.length
; j
++) {
528 unsigned idx
= i
+ 4*j
;
529 shuffles
[j
] = lp_build_const_int32(gallivm
, idx
);
531 shuf_vec
= LLVMConstVector(shuffles
, gather_type
.length
);
532 offsetr
= LLVMBuildShuffleVector(builder
, offset
, offset
, shuf_vec
, "");
/* Two gathers: gather i takes pairs of offsets out of every group of four. */
535 else if (num_gather
== 2) {
536 assert(num_gather
== 2);
537 for (j
= 0; j
< gather_type
.length
; j
++) {
538 unsigned idx
= i
*2 + (j
%2) + (j
/2)*4;
539 shuffles
[j
] = lp_build_const_int32(gallivm
, idx
);
541 shuf_vec
= LLVMConstVector(shuffles
, gather_type
.length
);
542 offsetr
= LLVMBuildShuffleVector(builder
, offset
, offset
, shuf_vec
, "");
545 assert(num_gather
== 1);
/* A one-element gather uses a scalar offset instead of a vector. */
548 if (gather_type
.length
== 1) {
549 LLVMValueRef zero
= lp_build_const_int32(gallivm
, 0);
550 offsetr
= LLVMBuildExtractElement(builder
, offsetr
, zero
, "");
554 * Determine whether to use float or int loads. This is mostly
555 * to outsmart the (stupid) llvm int/float shuffle logic, we
556 * don't really care much if the data is floats or ints...
557 * But llvm will refuse to use single float shuffle with int data
558 * and instead use 3 int shuffles instead, the code looks atrocious.
559 * (Note bitcasts often won't help, as llvm is too smart to be
561 * Nobody cares about simd float<->int domain transition penalties,
562 * which usually don't even exist for shuffles anyway.
563 * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
564 * going into transpose, which is unpacks, so doesn't really matter
566 * With 2x32bit or 4x16bit fetch, we use float vec, since those
567 * go into the weird channel separation shuffle. With floats,
568 * this is (with 128bit vectors):
569 * - 2 movq, 2 movhpd, 2 shufps
570 * With ints it would be:
571 * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
572 * I've seen texture functions increase in code size by 15% just due
573 * to that (there's lots of such fetches in them...)
574 * (We could chose a different gather order to improve this somewhat
575 * for the int path, but it would basically just drop the blends,
576 * so the float path with this order really is optimal.)
577 * Albeit it is tricky sometimes llvm doesn't ignore the float->int
578 * casts so must avoid them until we're done with the float shuffle...
579 * 3x16bit formats (the same is also true for 3x8) are pretty bad but
580 * there's nothing we can do about them (we could overallocate by
581 * those couple bytes and use unaligned but pot sized load).
582 * Note that this is very much x86 specific. I don't know if this
583 * affect other archs at all.
585 if (num_gather
> 1) {
587 * We always want some float type here (with x86)
588 * due to shuffles being float ones afterwards (albeit for
589 * the num_gather == 4 case int should work fine too
590 * (unless there's some problems with avx but not avx2).
/*
 * NOTE(review): the text above says a float type is always wanted here, yet
 * the second assignment below selects lp_type_int_vec(32, ...).  Confirm
 * which of the two is intended and bring comment and code into agreement.
 */
592 if (format_desc
->channel
[0].size
== 64) {
593 fetch_type
= lp_type_float_vec(64, gather_type
.width
);
595 fetch_type
= lp_type_int_vec(32, gather_type
.width
);
599 /* type doesn't matter much */
600 if (format_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_FLOAT
&&
601 (format_desc
->channel
[0].size
== 32 ||
602 format_desc
->channel
[0].size
== 64)) {
603 fetch_type
= lp_type_float(gather_type
.width
);
605 fetch_type
= lp_type_uint(gather_type
.width
);
609 /* Now finally gather the values */
610 packed
[i
] = lp_build_gather(gallivm
, gather_type
.length
,
611 format_desc
->block
.bits
,
613 base_ptr
, offsetr
, FALSE
);
/* Reinterpret with doubled element width, then truncate to bld.vec_type. */
615 struct lp_type conv_type
= type
;
616 conv_type
.width
*= 2;
617 packed
[i
] = LLVMBuildBitCast(builder
, packed
[i
],
618 lp_build_vec_type(gallivm
, conv_type
), "");
619 packed
[i
] = LLVMBuildFPTrunc(builder
, packed
[i
], bld
.vec_type
, "");
623 /* shuffle the gathered values to SoA */
624 if (num_gather
== 2) {
625 for (i
= 0; i
< num_gather
; i
++) {
626 for (j
= 0; j
< type
.length
; j
++) {
627 unsigned idx
= (j
%2)*2 + (j
/4)*4 + i
;
630 shuffles
[j
] = lp_build_const_int32(gallivm
, idx
);
632 dst
[i
] = LLVMBuildShuffleVector(builder
, packed
[0], packed
[1],
633 LLVMConstVector(shuffles
, type
.length
), "");
636 else if (num_gather
== 4) {
637 lp_build_transpose_aos(gallivm
, lp_int_type(type
), packed
, dst
);
640 assert(num_gather
== 1);
645 * And finally unpack exactly as above, except that
646 * chan shift is adjusted and the right vector selected.
649 for (i
= 0; i
< num_gather
; i
++) {
650 dst
[i
] = LLVMBuildBitCast(builder
, dst
[i
], bld
.int_vec_type
, "");
652 for (i
= 0; i
< format_desc
->nr_channels
; i
++) {
653 struct util_format_channel_description chan_desc
= format_desc
->channel
[i
];
654 unsigned blockbits
= type
.width
;
/* vec_nr picks the dword vector holding the channel; shift becomes dword-relative. */
657 #if UTIL_ARCH_BIG_ENDIAN
658 vec_nr
= (format_desc
->block
.bits
- (chan_desc
.shift
+ chan_desc
.size
)) / type
.width
;
660 vec_nr
= chan_desc
.shift
/ type
.width
;
662 chan_desc
.shift
%= type
.width
;
664 output
[i
] = lp_build_extract_soa_chan(&bld
,
672 for (i
= 0; i
< format_desc
->nr_channels
; i
++) {
677 lp_build_format_swizzle_soa(format_desc
, &bld
, output
, rgba_out
);
/* Strategy 3: packed float formats with dedicated conversion helpers. */
681 if (format
== PIPE_FORMAT_R11G11B10_FLOAT
||
682 format
== PIPE_FORMAT_R9G9B9E5_FLOAT
) {
684 * similar conceptually to above but requiring special
685 * AoS packed -> SoA float conversion code.
688 struct lp_type fetch_type
= lp_type_uint(type
.width
);
690 assert(type
.floating
);
691 assert(type
.width
== 32);
693 packed
= lp_build_gather(gallivm
, type
.length
,
694 format_desc
->block
.bits
,
696 base_ptr
, offset
, FALSE
);
697 if (format
== PIPE_FORMAT_R11G11B10_FLOAT
) {
698 lp_build_r11g11b10_to_float(gallivm
, packed
, rgba_out
);
701 lp_build_rgb9e5_to_float(gallivm
, packed
, rgba_out
);
/* Strategy 4: 64-bit depth/stencil formats, only 32 bits fetched per pixel. */
706 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_ZS
&&
707 format_desc
->block
.bits
== 64) {
709 * special case the format is 64 bits but we only require
710 * 32bit (or 8bit) from each block.
713 struct lp_type fetch_type
= lp_type_uint(type
.width
);
715 if (format
== PIPE_FORMAT_X32_S8X24_UINT
) {
717 * for stencil simply fix up offsets - could in fact change
718 * base_ptr instead even outside the shader.
/* Stencil: add 4 to every offset, gather 32 bits, keep the low 8 bits. */
720 unsigned mask
= (1 << 8) - 1;
721 LLVMValueRef s_offset
= lp_build_const_int_vec(gallivm
, type
, 4);
722 offset
= LLVMBuildAdd(builder
, offset
, s_offset
, "");
723 packed
= lp_build_gather(gallivm
, type
.length
, 32, fetch_type
,
724 aligned
, base_ptr
, offset
, FALSE
);
725 packed
= LLVMBuildAnd(builder
, packed
,
726 lp_build_const_int_vec(gallivm
, type
, mask
), "");
/* Depth: gather 32 bits at the unmodified offset and bitcast to `type`. */
729 assert (format
== PIPE_FORMAT_Z32_FLOAT_S8X24_UINT
);
730 packed
= lp_build_gather(gallivm
, type
.length
, 32, fetch_type
,
731 aligned
, base_ptr
, offset
, TRUE
);
732 packed
= LLVMBuildBitCast(builder
, packed
,
733 lp_build_vec_type(gallivm
, type
), "");
735 /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
736 rgba_out
[0] = rgba_out
[1] = rgba_out
[2] = packed
;
737 rgba_out
[3] = lp_build_const_vec(gallivm
, type
, 1.0f
);
742 * Try calling lp_build_fetch_rgba_aos for all pixels.
743 * Should only really hit subsampled, compressed
744 * (for s3tc srgb and rgtc too).
745 * (This is invalid for plain 8unorm formats because we're lazy with
746 * the swizzle since some results would arrive swizzled, some not.)
/* Strategy 5: fetch all pixels as 8-bit AoS, then unpack as an RGBA8 format. */
749 if ((format_desc
->layout
!= UTIL_FORMAT_LAYOUT_PLAIN
) &&
750 (util_format_fits_8unorm(format_desc
) ||
751 format_desc
->layout
== UTIL_FORMAT_LAYOUT_RGTC
||
752 format_desc
->layout
== UTIL_FORMAT_LAYOUT_S3TC
) &&
753 type
.floating
&& type
.width
== 32 &&
754 (type
.length
== 1 || (type
.length
% 4 == 0))) {
755 struct lp_type tmp_type
;
756 struct lp_build_context bld
;
757 LLVMValueRef packed
, rgba
[4];
758 const struct util_format_description
*flinear_desc
;
759 const struct util_format_description
*frgba8_desc
;
/* Signed-normalized RGTC/LATC formats select a signed temporary type. */
761 bool is_signed
= (format_desc
->format
== PIPE_FORMAT_RGTC1_SNORM
||
762 format_desc
->format
== PIPE_FORMAT_RGTC2_SNORM
||
763 format_desc
->format
== PIPE_FORMAT_LATC1_SNORM
||
764 format_desc
->format
== PIPE_FORMAT_LATC2_SNORM
);
766 lp_build_context_init(&bld
, gallivm
, type
);
769 * Make sure the conversion in aos really only does convert to rgba8
770 * and not anything more (so use linear format, adjust type).
772 flinear_desc
= util_format_description(util_format_linear(format
));
773 memset(&tmp_type
, 0, sizeof tmp_type
);
775 tmp_type
.length
= type
.length
* 4;
776 tmp_type
.norm
= TRUE
;
777 tmp_type
.sign
= is_signed
;
779 packed
= lp_build_fetch_rgba_aos(gallivm
, flinear_desc
, tmp_type
,
780 aligned
, base_ptr
, offset
, i
, j
, cache
);
781 packed
= LLVMBuildBitCast(builder
, packed
, bld
.int_vec_type
, "");
784 * The values are now packed so they match ordinary (srgb) RGBA8 format,
785 * hence need to use matching format for unpack.
787 frgba8_desc
= util_format_description(is_signed
? PIPE_FORMAT_R8G8B8A8_SNORM
: PIPE_FORMAT_R8G8B8A8_UNORM
);
788 if (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_SRGB
) {
789 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_S3TC
);
790 frgba8_desc
= util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB
);
792 lp_build_unpack_rgba_soa(gallivm
,
798 * We converted 4 channels. Make sure llvm can drop unneeded ones
799 * (luckily the rgba order is fixed, only LA needs special case).
801 for (chan
= 0; chan
< 4; chan
++) {
802 enum pipe_swizzle swizzle
= format_desc
->swizzle
[chan
];
/* Luminance-alpha formats take their fourth channel from W. */
803 if (chan
== 3 && util_format_is_luminance_alpha(format
)) {
804 swizzle
= PIPE_SWIZZLE_W
;
806 rgba_out
[chan
] = lp_build_swizzle_soa_channel(&bld
, rgba
, swizzle
);
813 * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
815 * This is not the most efficient way of fetching pixels, as we
816 * miss some opportunities to do vectorization, but this is
817 * convenient for formats or scenarios for which there was no
818 * opportunity or incentive to optimize.
820 * We do NOT want to end up here, this typically is quite terrible,
821 * in particular if the formats have less than 4 channels.
823 * Right now, this should only be hit for:
825 * (those miss fast fetch functions hence they are terrible anyway)
/* Strategy 6: one AoS fetch per pixel, then a transpose to SoA. */
830 struct lp_type tmp_type
;
831 LLVMValueRef aos_fetch
[LP_MAX_VECTOR_WIDTH
/ 32];
833 if (gallivm_debug
& GALLIVM_DEBUG_PERF
) {
834 debug_printf("%s: AoS fetch fallback for %s\n",
835 __FUNCTION__
, format_desc
->short_name
);
842 * Note that vector transpose can be worse compared to insert/extract
843 * for aos->soa conversion (for formats with 1 or 2 channels). However,
844 * we should try to avoid getting here for just about all formats, so
848 /* loop over number of pixels */
849 for(k
= 0; k
< type
.length
; ++k
) {
850 LLVMValueRef index
= lp_build_const_int32(gallivm
, k
);
851 LLVMValueRef offset_elem
;
852 LLVMValueRef i_elem
, j_elem
;
854 offset_elem
= LLVMBuildExtractElement(builder
, offset
,
857 i_elem
= LLVMBuildExtractElement(builder
, i
, index
, "");
858 j_elem
= LLVMBuildExtractElement(builder
, j
, index
, "");
860 /* Get a single float[4]={R,G,B,A} pixel */
861 aos_fetch
[k
] = lp_build_fetch_rgba_aos(gallivm
, format_desc
, tmp_type
,
862 aligned
, base_ptr
, offset_elem
,
863 i_elem
, j_elem
, cache
);
866 convert_to_soa(gallivm
, aos_fetch
, rgba_out
, type
);
/*
 * lp_build_insert_soa_chan: encode one channel value and merge it into the
 * packed output vector, dispatching on chan_desc.type.
 *
 * bld        build context (gallivm state, builder and type come from it).
 * chan_desc  channel description; size, shift, type, normalized and
 *            pure_integer are read.
 * output     in/out packed vector; the encoded channel is OR-ed into it, or
 *            assigned directly in some branches.
 *
 * The body also reads `rgba` (the channel value) and an `intrin` buffer;
 * their declaration lines are not present in this listing.
 *
 * Visible handling per channel type:
 *   UNSIGNED  pure integers are clamped to the channel mask with an unsigned
 *             compare/select; normalized floats are clamped to [0, 1] and
 *             converted with lp_build_clamped_float_to_unsigned_norm();
 *             the result is shifted left by the channel shift.
 *   SIGNED    pure integers are masked; normalized floats are clamped to
 *             [-1, 1], scaled, rounded with llvm.rint, converted with
 *             LLVMBuildFPToSI() and masked; then shifted left.
 *   FLOAT     16-bit channels go through lp_build_float_to_half() and a
 *             zero-extend; otherwise type.width == 32 is asserted and the
 *             value is bitcast straight into *output.
 *
 * NOTE(review): `break`, `else`, `default` and brace-only lines are missing
 * from this listing, so the exact nesting and the condition under which
 * *output is set to bld->undef cannot be read off it -- confirm against the
 * real file.
 */
871 lp_build_insert_soa_chan(struct lp_build_context
*bld
,
873 struct util_format_channel_description chan_desc
,
874 LLVMValueRef
*output
,
877 struct gallivm_state
*gallivm
= bld
->gallivm
;
878 LLVMBuilderRef builder
= gallivm
->builder
;
879 struct lp_type type
= bld
->type
;
880 const unsigned width
= chan_desc
.size
;
881 const unsigned start
= chan_desc
.shift
;
/* Mask is computed in 64 bits and narrowed to uint32_t. */
882 const uint32_t chan_mask
= (1ULL << width
) - 1;
883 ASSERTED
const unsigned stop
= start
+ width
;
884 LLVMValueRef chan
= NULL
;
885 switch(chan_desc
.type
) {
886 case UTIL_FORMAT_TYPE_UNSIGNED
:
888 if (chan_desc
.pure_integer
) {
/* Saturate: values above the channel mask are replaced by the mask. */
889 chan
= LLVMBuildBitCast(builder
, rgba
, bld
->int_vec_type
, "");
890 LLVMValueRef mask_val
= lp_build_const_int_vec(gallivm
, type
, chan_mask
);
891 LLVMValueRef mask
= LLVMBuildICmp(builder
, LLVMIntUGT
, chan
, mask_val
, "");
892 chan
= LLVMBuildSelect(builder
, mask
, mask_val
, chan
, "");
894 else if (type
.floating
) {
895 if (chan_desc
.normalized
) {
896 rgba
= lp_build_clamp(bld
, rgba
, bld
->zero
, bld
->one
);
897 chan
= lp_build_clamped_float_to_unsigned_norm(gallivm
, type
, width
, rgba
);
/*
 * NOTE(review): this float-to-int conversion is given bld->vec_type as its
 * destination type, whereas the SIGNED case below passes bld->int_vec_type
 * to the same builder call.  Confirm whether bld->int_vec_type was intended.
 */
899 chan
= LLVMBuildFPToSI(builder
, rgba
, bld
->vec_type
, "");
902 chan
= LLVMBuildShl(builder
, chan
,
903 lp_build_const_int_vec(gallivm
, type
, start
), "");
907 *output
= LLVMBuildOr(builder
, *output
, chan
, "");
909 case UTIL_FORMAT_TYPE_SIGNED
:
910 if (chan_desc
.pure_integer
) {
911 chan
= LLVMBuildBitCast(builder
, rgba
, bld
->int_vec_type
, "");
912 chan
= LLVMBuildAnd(builder
, chan
, lp_build_const_int_vec(gallivm
, type
, chan_mask
), "");
913 } else if (type
.floating
) {
914 if (chan_desc
.normalized
) {
/*
 * NOTE(review): `1 << (chan_desc.size - 1)` is evaluated in int; a 32-bit
 * channel would make this `1 << 31`, which overflows a signed int.  Confirm
 * whether 32-bit normalized signed channels can reach this line.
 */
916 double scale
= ((1 << (chan_desc
.size
- 1)) - 1);
917 LLVMValueRef scale_val
= lp_build_const_vec(gallivm
, type
, scale
);
918 rgba
= lp_build_clamp(bld
, rgba
, lp_build_negate(bld
, bld
->one
), bld
->one
);
919 rgba
= LLVMBuildFMul(builder
, rgba
, scale_val
, "");
/* Round via the llvm.rint intrinsic before the float-to-int conversion. */
920 lp_format_intrinsic(intrin
, sizeof intrin
, "llvm.rint", bld
->vec_type
);
921 rgba
= lp_build_intrinsic_unary(builder
, intrin
, bld
->vec_type
, rgba
);
923 chan
= LLVMBuildFPToSI(builder
, rgba
, bld
->int_vec_type
, "");
924 chan
= LLVMBuildAnd(builder
, chan
, lp_build_const_int_vec(gallivm
, type
, chan_mask
), "");
927 chan
= LLVMBuildShl(builder
, chan
,
928 lp_build_const_int_vec(gallivm
, type
, start
), "");
932 *output
= LLVMBuildOr(builder
, *output
, chan
, "");
934 case UTIL_FORMAT_TYPE_FLOAT
:
936 if (chan_desc
.size
== 16) {
937 chan
= lp_build_float_to_half(gallivm
, rgba
);
938 chan
= LLVMBuildZExt(builder
, chan
, bld
->int_vec_type
, "");
940 chan
= LLVMBuildShl(builder
, chan
,
941 lp_build_const_int_vec(gallivm
, type
, start
), "");
945 *output
= LLVMBuildOr(builder
, *output
, chan
, "");
949 assert(type
.width
== 32);
950 *output
= LLVMBuildBitCast(builder
, rgba
, bld
->int_vec_type
, "");
957 *output
= bld
->undef
;
/*
 * lp_build_pack_rgba_soa: pack SoA channel vectors into one vector of packed
 * pixels by calling lp_build_insert_soa_chan() once per format channel.
 *
 * gallivm      code generation state.
 * format_desc  format description; must be a PLAIN layout with 1x1 blocks
 *              whose block.bits fit in type.width (all asserted).
 * rgba_in      the four input channel vectors.
 * packed       output location for the packed vector.
 *
 * The body also uses `type` (asserted to have width 32); its parameter line
 * is not present in this listing.
 *
 * NOTE(review): the remaining arguments of the lp_build_insert_soa_chan()
 * call, the declaration of `chan` and brace-only lines are missing from this
 * listing.
 */
962 lp_build_pack_rgba_soa(struct gallivm_state
*gallivm
,
963 const struct util_format_description
*format_desc
,
965 const LLVMValueRef rgba_in
[4],
966 LLVMValueRef
*packed
)
969 struct lp_build_context bld
;
970 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
);
971 assert(format_desc
->block
.width
== 1);
972 assert(format_desc
->block
.height
== 1);
973 assert(format_desc
->block
.bits
<= type
.width
);
974 /* FIXME: Support more output types */
975 assert(type
.width
== 32);
977 lp_build_context_init(&bld
, gallivm
, type
);
/* One insert per channel described by the format. */
978 for (chan
= 0; chan
< format_desc
->nr_channels
; ++chan
) {
979 struct util_format_channel_description chan_desc
= format_desc
->channel
[chan
];
981 lp_build_insert_soa_chan(&bld
, format_desc
->block
.bits
,
989 lp_build_store_rgba_soa(struct gallivm_state
*gallivm
,
990 const struct util_format_description
*format_desc
,
992 LLVMValueRef exec_mask
,
993 LLVMValueRef base_ptr
,
995 LLVMValueRef out_of_bounds
,
996 const LLVMValueRef rgba_in
[4])
998 enum pipe_format format
= format_desc
->format
;
999 LLVMValueRef packed
[4];
1000 unsigned num_stores
= 0;
1002 memset(packed
, 0, sizeof(LLVMValueRef
) * 4);
1003 if (format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
&&
1004 format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_RGB
&&
1005 format_desc
->block
.width
== 1 &&
1006 format_desc
->block
.height
== 1 &&
1007 format_desc
->block
.bits
<= type
.width
&&
1008 (format_desc
->channel
[0].type
!= UTIL_FORMAT_TYPE_FLOAT
||
1009 format_desc
->channel
[0].size
== 32 ||
1010 format_desc
->channel
[0].size
== 16))
1012 lp_build_pack_rgba_soa(gallivm
, format_desc
, type
, rgba_in
, &packed
[0]);
1015 } else if (format_desc
->layout
== UTIL_FORMAT_LAYOUT_PLAIN
&&
1016 (format_desc
->colorspace
== UTIL_FORMAT_COLORSPACE_RGB
) &&
1017 format_desc
->block
.width
== 1 &&
1018 format_desc
->block
.height
== 1 &&
1019 format_desc
->block
.bits
> type
.width
&&
1020 ((format_desc
->block
.bits
<= type
.width
* type
.length
&&
1021 format_desc
->channel
[0].size
<= type
.width
) ||
1022 (format_desc
->channel
[0].size
== 64 &&
1023 format_desc
->channel
[0].type
== UTIL_FORMAT_TYPE_FLOAT
&&
1027 * Similar to above, but the packed pixel is larger than what fits
1028 * into an element of the destination format. The packed pixels will be
1029 * shuffled into SoA vectors appropriately, and then the extraction will
1030 * be done in parallel as much as possible.
1031 * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
1032 * the gathered vectors can be shuffled easily (even with avx).
1033 * 64xn float -> 32xn float is handled too but it's a bit special as
1034 * it does the conversion pre-shuffle.
1036 struct lp_build_context bld
;
1038 lp_build_context_init(&bld
, gallivm
, type
);
1039 assert(type
.width
== 32);
1040 assert(format_desc
->block
.bits
> type
.width
);
1042 unsigned store_width
= util_next_power_of_two(format_desc
->block
.bits
);
1043 num_stores
= store_width
/ type
.width
;
1044 for (unsigned i
= 0; i
< format_desc
->nr_channels
; i
++) {
1045 struct util_format_channel_description chan_desc
= format_desc
->channel
[i
];
1046 unsigned blockbits
= type
.width
;
1049 vec_nr
= chan_desc
.shift
/ type
.width
;
1050 chan_desc
.shift
%= type
.width
;
1052 lp_build_insert_soa_chan(&bld
, blockbits
,
1058 assert(num_stores
== 4 || num_stores
== 2);
1059 /* we can transpose and store at the same time */
1060 } else if (format
== PIPE_FORMAT_R11G11B10_FLOAT
) {
1061 packed
[0] = lp_build_float_to_r11g11b10(gallivm
, rgba_in
);
1068 LLVMTypeRef int32_ptr_type
= LLVMPointerType(LLVMInt32TypeInContext(gallivm
->context
), 0);
1069 LLVMTypeRef int16_ptr_type
= LLVMPointerType(LLVMInt16TypeInContext(gallivm
->context
), 0);
1070 LLVMTypeRef int8_ptr_type
= LLVMPointerType(LLVMInt8TypeInContext(gallivm
->context
), 0);
1072 LLVMValueRef should_store_mask
= LLVMBuildAnd(gallivm
->builder
, exec_mask
, LLVMBuildNot(gallivm
->builder
, out_of_bounds
, ""), "store_mask");
1073 should_store_mask
= LLVMBuildICmp(gallivm
->builder
, LLVMIntNE
, should_store_mask
, lp_build_const_int_vec(gallivm
, type
, 0), "");
1074 for (unsigned i
= 0; i
< num_stores
; i
++) {
1075 struct lp_build_loop_state loop_state
;
1077 LLVMValueRef store_offset
= LLVMBuildAdd(gallivm
->builder
, offset
, lp_build_const_int_vec(gallivm
, type
, i
* 4), "");
1078 store_offset
= LLVMBuildGEP(gallivm
->builder
, base_ptr
, &store_offset
, 1, "");
1080 lp_build_loop_begin(&loop_state
, gallivm
, lp_build_const_int32(gallivm
, 0));
1082 struct lp_build_if_state ifthen
;
1083 LLVMValueRef cond
= LLVMBuildExtractElement(gallivm
->builder
, should_store_mask
, loop_state
.counter
, "");
1084 lp_build_if(&ifthen
, gallivm
, cond
);
1086 LLVMValueRef data
= LLVMBuildExtractElement(gallivm
->builder
, packed
[i
], loop_state
.counter
, "");
1087 LLVMValueRef this_offset
= LLVMBuildExtractElement(gallivm
->builder
, store_offset
, loop_state
.counter
, "");
1089 if (format_desc
->block
.bits
== 8) {
1090 this_offset
= LLVMBuildBitCast(gallivm
->builder
, this_offset
, int8_ptr_type
, "");
1091 data
= LLVMBuildTrunc(gallivm
->builder
, data
, LLVMInt8TypeInContext(gallivm
->context
), "");
1092 } else if (format_desc
->block
.bits
== 16) {
1093 this_offset
= LLVMBuildBitCast(gallivm
->builder
, this_offset
, int16_ptr_type
, "");
1094 data
= LLVMBuildTrunc(gallivm
->builder
, data
, LLVMInt16TypeInContext(gallivm
->context
), "");
1096 this_offset
= LLVMBuildBitCast(gallivm
->builder
, this_offset
, int32_ptr_type
, "");
1097 LLVMBuildStore(gallivm
->builder
, data
, this_offset
);
1098 lp_build_endif(&ifthen
);
1099 lp_build_loop_end_cond(&loop_state
, lp_build_const_int32(gallivm
, type
.length
),