#include "pipe/p_defines.h"
-#include "util/u_format.h"
+#include "util/format/u_format.h"
#include "util/u_memory.h"
#include "util/u_string.h"
+#include "util/u_math.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
#include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_printf.h"
+#include "lp_bld_intr.h"
+
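+/*
+ * Convert a set of AoS fetch results (one 4-wide rgba vector per pixel,
+ * as returned by lp_build_fetch_rgba_aos) into 4 SoA vectors of soa_type,
+ * one per channel, by concatenating the pixel vectors and transposing.
+ */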
+static void
+convert_to_soa(struct gallivm_state *gallivm,
+ LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+ LLVMValueRef dst_soa[4],
+ const struct lp_type soa_type)
+{
+ unsigned j, k;
+ struct lp_type aos_channel_type = soa_type;
+
+ LLVMValueRef aos_channels[4];
+ unsigned pixels_per_channel = soa_type.length / 4;
+
+ debug_assert((soa_type.length % 4) == 0);
+
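+ /*
+ * Each AoS fetch result is a 4-wide vector; for the usual 8-wide SoA
+ * type, halving the length yields the matching input vector type for
+ * lp_build_concat (for a 4-wide SoA type lp_build_concat is called
+ * with a single vector, so the halved length should be moot there).
+ */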
+ aos_channel_type.length >>= 1;
+
+ for (j = 0; j < 4; ++j) {
+ LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+
+ assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+
+ for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[j + 4 * k];
+ }
+
+ aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
+ }
+
+ lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+}
void
const LLVMValueRef *unswizzled,
LLVMValueRef swizzled_out[4])
{
- assert(UTIL_FORMAT_SWIZZLE_0 == PIPE_SWIZZLE_ZERO);
- assert(UTIL_FORMAT_SWIZZLE_1 == PIPE_SWIZZLE_ONE);
-
if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
- enum util_format_swizzle swizzle;
+ enum pipe_swizzle swizzle;
LLVMValueRef depth_or_stencil;
if (util_format_has_stencil(format_desc) &&
else {
unsigned chan;
for (chan = 0; chan < 4; ++chan) {
- enum util_format_swizzle swizzle = format_desc->swizzle[chan];
+ enum pipe_swizzle swizzle = format_desc->swizzle[chan];
swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
}
}
}
+
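+/*
+ * Extract one channel from a packed vector, according to chan_desc
+ * (type/size/shift/normalized), and convert it to the type of bld.
+ * blockbits is the number of valid bits in the packed value (used to
+ * decide MSB masking); srgb_chan requests srgb->linear decoding.
+ */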
+static LLVMValueRef
+lp_build_extract_soa_chan(struct lp_build_context *bld,
+ unsigned blockbits,
+ boolean srgb_chan,
+ struct util_format_channel_description chan_desc,
+ LLVMValueRef packed)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_type type = bld->type;
+ LLVMValueRef input = packed;
+ const unsigned width = chan_desc.size;
+ const unsigned start = chan_desc.shift;
+ const unsigned stop = start + width;
+
+ /* Decode the input vector component */
+
+ switch(chan_desc.type) {
+ case UTIL_FORMAT_TYPE_VOID:
+ input = bld->undef;
+ break;
+
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ /*
+ * Align the LSB
+ */
+ if (start) {
+ input = LLVMBuildLShr(builder, input,
+ lp_build_const_int_vec(gallivm, type, start), "");
+ }
+
+ /*
+ * Zero the MSBs
+ */
+ if (stop < blockbits) {
+ unsigned mask = ((unsigned long long)1 << width) - 1;
+ input = LLVMBuildAnd(builder, input,
+ lp_build_const_int_vec(gallivm, type, mask), "");
+ }
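+ /*
+ * E.g. for the 6bit green channel of b5g6r5 (shift 5, width 6) this
+ * is a right shift by 5 followed by an and with 0x3f.
+ */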
+
+ /*
+ * Type conversion
+ */
+ if (type.floating) {
+ if (srgb_chan) {
+ struct lp_type conv_type = lp_uint_type(type);
+ input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
+ }
+ else {
+ if(chan_desc.normalized)
+ input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
+ else
+ input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+ }
+ }
+ else if (chan_desc.pure_integer) {
+ /* Nothing to do */
+ } else {
+ /* FIXME */
+ assert(0);
+ }
+ break;
+
+ case UTIL_FORMAT_TYPE_SIGNED:
+ /*
+ * Align the sign bit first.
+ */
+ if (stop < type.width) {
+ unsigned bits = type.width - stop;
+ LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
+ input = LLVMBuildShl(builder, input, bits_val, "");
+ }
+
+ /*
+ * Align the LSB (with an arithmetic shift to preserve the sign)
+ */
+ if (chan_desc.size < type.width) {
+ unsigned bits = type.width - chan_desc.size;
+ LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
+ input = LLVMBuildAShr(builder, input, bits_val, "");
+ }
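+ /*
+ * E.g. for an 8bit channel at shift 16 in a 32bit word: shl by 8
+ * moves its sign bit to bit 31, and ashr by 24 then sign-extends the
+ * value back into the low 8 bits.
+ */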
+
+ /*
+ * Type conversion
+ */
+ if (type.floating) {
+ input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+ if (chan_desc.normalized) {
+ double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
+ LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
+ input = LLVMBuildFMul(builder, input, scale_val, "");
+ /*
+ * The formula above will produce values below -1.0 for the most
+ * negative value, but everything seems happy with that, hence the
+ * clamp is disabled for now.
+ */
+ if (0)
+ input = lp_build_max(bld, input,
+ lp_build_const_vec(gallivm, type, -1.0f));
+ }
+ }
+ else if (chan_desc.pure_integer) {
+ /* Nothing to do */
+ } else {
+ /* FIXME */
+ assert(0);
+ }
+ break;
+
+ case UTIL_FORMAT_TYPE_FLOAT:
+ if (type.floating) {
+ if (chan_desc.size == 16) {
+ struct lp_type f16i_type = type;
+ f16i_type.width /= 2;
+ f16i_type.floating = 0;
+ if (start) {
+ input = LLVMBuildLShr(builder, input,
+ lp_build_const_int_vec(gallivm, type, start), "");
+ }
+ input = LLVMBuildTrunc(builder, input,
+ lp_build_vec_type(gallivm, f16i_type), "");
+ input = lp_build_half_to_float(gallivm, input);
+ } else {
+ assert(start == 0);
+ assert(stop == 32);
+ assert(type.width == 32);
+ }
+ input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
+ }
+ else {
+ /* FIXME */
+ assert(0);
+ input = bld->undef;
+ }
+ break;
+
+ case UTIL_FORMAT_TYPE_FIXED:
+ if (type.floating) {
+ double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
+ LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
+ input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+ input = LLVMBuildFMul(builder, input, scale_val, "");
+ }
+ else {
+ /* FIXME */
+ assert(0);
+ input = bld->undef;
+ }
+ break;
+
+ default:
+ assert(0);
+ input = bld->undef;
+ break;
+ }
+
+ return input;
+}
+
+
/**
* Unpack several pixels in SoA.
*
LLVMValueRef packed,
LLVMValueRef rgba_out[4])
{
- LLVMBuilderRef builder = gallivm->builder;
struct lp_build_context bld;
LLVMValueRef inputs[4];
unsigned chan;
/* Decode the input vector components */
for (chan = 0; chan < format_desc->nr_channels; ++chan) {
- const unsigned width = format_desc->channel[chan].size;
- const unsigned start = format_desc->channel[chan].shift;
- const unsigned stop = start + width;
- LLVMValueRef input;
-
- input = packed;
-
- switch(format_desc->channel[chan].type) {
- case UTIL_FORMAT_TYPE_VOID:
- input = lp_build_undef(gallivm, type);
- break;
-
- case UTIL_FORMAT_TYPE_UNSIGNED:
- /*
- * Align the LSB
- */
-
- if (start) {
- input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), "");
- }
-
- /*
- * Zero the MSBs
- */
-
- if (stop < format_desc->block.bits) {
- unsigned mask = ((unsigned long long)1 << width) - 1;
- input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), "");
- }
-
- /*
- * Type conversion
- */
-
- if (type.floating) {
- if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
- if (format_desc->swizzle[3] == chan) {
- input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
- }
- else {
- struct lp_type conv_type = lp_uint_type(type);
- input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
- }
- }
- else {
- if(format_desc->channel[chan].normalized)
- input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
- else
- input = LLVMBuildSIToFP(builder, input,
- lp_build_vec_type(gallivm, type), "");
- }
- }
- else if (format_desc->channel[chan].pure_integer) {
- /* Nothing to do */
- } else {
- /* FIXME */
- assert(0);
- }
-
- break;
-
- case UTIL_FORMAT_TYPE_SIGNED:
- /*
- * Align the sign bit first.
- */
-
- if (stop < type.width) {
- unsigned bits = type.width - stop;
- LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
- input = LLVMBuildShl(builder, input, bits_val, "");
- }
-
- /*
- * Align the LSB (with an arithmetic shift to preserve the sign)
- */
-
- if (format_desc->channel[chan].size < type.width) {
- unsigned bits = type.width - format_desc->channel[chan].size;
- LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
- input = LLVMBuildAShr(builder, input, bits_val, "");
- }
-
- /*
- * Type conversion
- */
-
- if (type.floating) {
- input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
- if (format_desc->channel[chan].normalized) {
- double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
- LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
- input = LLVMBuildFMul(builder, input, scale_val, "");
- /* the formula above will produce value below -1.0 for most negative
- * value but everything seems happy with that hence disable for now */
- if (0)
- input = lp_build_max(&bld, input,
- lp_build_const_vec(gallivm, type, -1.0f));
- }
- }
- else if (format_desc->channel[chan].pure_integer) {
- /* Nothing to do */
- } else {
- /* FIXME */
- assert(0);
- }
-
- break;
-
- case UTIL_FORMAT_TYPE_FLOAT:
- if (type.floating) {
- assert(start == 0);
- assert(stop == 32);
- assert(type.width == 32);
- input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), "");
- }
- else {
- /* FIXME */
- assert(0);
- input = lp_build_undef(gallivm, type);
- }
- break;
-
- case UTIL_FORMAT_TYPE_FIXED:
- if (type.floating) {
- double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
- LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
- input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), "");
- input = LLVMBuildFMul(builder, input, scale_val, "");
- }
- else {
- /* FIXME */
- assert(0);
- input = lp_build_undef(gallivm, type);
- }
- break;
+ struct util_format_channel_description chan_desc = format_desc->channel[chan];
+ boolean srgb_chan = FALSE;
- default:
- assert(0);
- input = lp_build_undef(gallivm, type);
- break;
+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
+ format_desc->swizzle[3] != chan) {
+ srgb_chan = TRUE;
}
- inputs[chan] = input;
+ inputs[chan] = lp_build_extract_soa_chan(&bld,
+ format_desc->block.bits,
+ srgb_chan,
+ chan_desc,
+ packed);
}
lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
/* Decode the input vector components */
for (chan = 0; chan < 4; ++chan) {
-#ifdef PIPE_ARCH_LITTLE_ENDIAN
+#if UTIL_ARCH_LITTLE_ENDIAN
unsigned start = chan*8;
#else
unsigned start = (3-chan)*8;
*
* \param type the desired return type for 'rgba'. The vector length
* is the number of texels to fetch
+ * \param aligned if the offset is guaranteed to be aligned to element width
*
* \param base_ptr points to the base of the texture mip tree.
* \param offset offset to start of the texture image block. For non-
* \param i, j the sub-block pixel coordinates. For non-compressed formats
* these will always be (0,0). For compressed formats, i will
* be in [0, block_width-1] and j will be in [0, block_height-1].
+ * \param cache optional value pointing to a lp_build_format_cache structure
*/
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
const struct util_format_description *format_desc,
struct lp_type type,
+ boolean aligned,
LLVMValueRef base_ptr,
LLVMValueRef offset,
LLVMValueRef i,
LLVMValueRef j,
+ LLVMValueRef cache,
LLVMValueRef rgba_out[4])
{
LLVMBuilderRef builder = gallivm->builder;
+ enum pipe_format format = format_desc->format;
+ struct lp_type fetch_type;
if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
format_desc->block.height == 1 &&
format_desc->block.bits <= type.width &&
(format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
- format_desc->channel[0].size == 32))
+ format_desc->channel[0].size == 32 ||
+ format_desc->channel[0].size == 16))
{
/*
* The packed pixel fits into an element of the destination format. Put
* Ex: packed = {XYZW, XYZW, XYZW, XYZW}
*/
assert(format_desc->block.bits <= type.width);
+ fetch_type = lp_type_uint(type.width);
packed = lp_build_gather(gallivm,
type.length,
format_desc->block.bits,
- type.width,
+ fetch_type,
+ aligned,
base_ptr, offset, FALSE);
/*
return;
}
- if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
- format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+
+ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+ (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
+ format_desc->block.width == 1 &&
+ format_desc->block.height == 1 &&
+ format_desc->block.bits > type.width &&
+ ((format_desc->block.bits <= type.width * type.length &&
+ format_desc->channel[0].size <= type.width) ||
+ (format_desc->channel[0].size == 64 &&
+ format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+ type.floating)))
+ {
+ /*
+ * Similar to above, but the packed pixel is larger than what fits
+ * into an element of the destination format. The packed pixels will be
+ * shuffled into SoA vectors appropriately, and then the extraction will
+ * be done in parallel as much as possible.
+ * Good for 16xn (n > 2) and 32xn (n > 1) formats; care is taken so
+ * the gathered vectors can be shuffled easily (even with avx).
+ * 64xn float -> 32xn float is handled too but it's a bit special as
+ * it does the conversion pre-shuffle.
+ */
+
+ LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
+ struct lp_type fetch_type, gather_type = type;
+ unsigned num_gather, fetch_width, i, j;
+ struct lp_build_context bld;
+ boolean fp64 = format_desc->channel[0].size == 64;
+
+ lp_build_context_init(&bld, gallivm, type);
+
+ assert(type.width == 32);
+ assert(format_desc->block.bits > type.width);
+
+ /*
+ * First, figure out fetch order.
+ */
+ fetch_width = util_next_power_of_two(format_desc->block.bits);
+ /*
+ * fp64 values are treated like fp32 except we fetch twice-wide values
+ * (as we shuffle after trunc). The shuffles for that work out
+ * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
+ * albeit we miss the potential opportunity for hw gather (as it
+ * only handles native size).
+ */
+ num_gather = fetch_width / type.width;
+ gather_type.width *= num_gather;
+ if (fp64) {
+ num_gather /= 2;
+ }
+ gather_type.length /= num_gather;
+
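+ /*
+ * Example for the 8-wide, num_gather == 2 case (e.g. 16x4 or 32x2
+ * formats): gather 0 fetches the packed words of pixels {0,1,4,5},
+ * gather 1 those of pixels {2,3,6,7}; the channel separation shuffle
+ * after the loop then yields dst[0] = low 32bit words of all 8 pixels
+ * and dst[1] = the high words.
+ */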
+ for (i = 0; i < num_gather; i++) {
+ LLVMValueRef offsetr, shuf_vec;
+ if(num_gather == 4) {
+ for (j = 0; j < gather_type.length; j++) {
+ unsigned idx = i + 4*j;
+ shuffles[j] = lp_build_const_int32(gallivm, idx);
+ }
+ shuf_vec = LLVMConstVector(shuffles, gather_type.length);
+ offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
+
+ }
+ else if (num_gather == 2) {
+ for (j = 0; j < gather_type.length; j++) {
+ unsigned idx = i*2 + (j%2) + (j/2)*4;
+ shuffles[j] = lp_build_const_int32(gallivm, idx);
+ }
+ shuf_vec = LLVMConstVector(shuffles, gather_type.length);
+ offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
+ }
+ else {
+ assert(num_gather == 1);
+ offsetr = offset;
+ }
+ if (gather_type.length == 1) {
+ LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
+ offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
+ }
+
+ /*
+ * Determine whether to use float or int loads. This is mostly
+ * to outsmart the (stupid) llvm int/float shuffle logic; we
+ * don't really care much if the data is floats or ints...
+ * But llvm will refuse to use a single float shuffle with int data
+ * and will use 3 int shuffles instead; the code looks atrocious.
+ * (Note bitcasts often won't help, as llvm is too smart to be
+ * fooled by that.)
+ * Nobody cares about simd float<->int domain transition penalties,
+ * which usually don't even exist for shuffles anyway.
+ * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
+ * going into transpose, which is unpacks, so doesn't really matter
+ * much).
+ * With 2x32bit or 4x16bit fetch, we use float vec, since those
+ * go into the weird channel separation shuffle. With floats,
+ * this is (with 128bit vectors):
+ * - 2 movq, 2 movhpd, 2 shufps
+ * With ints it would be:
+ * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
+ * I've seen texture functions increase in code size by 15% just due
+ * to that (there's lots of such fetches in them...)
+ * (We could choose a different gather order to improve this somewhat
+ * for the int path, but it would basically just drop the blends,
+ * so the float path with this order really is optimal.)
+ * Albeit it is tricky: sometimes llvm doesn't ignore the float->int
+ * casts, so we must avoid them until we're done with the float shuffle...
+ * 3x16bit formats (the same is also true for 3x8) are pretty bad but
+ * there's nothing we can do about them (we could overallocate by
+ * those couple bytes and use unaligned but pot sized load).
+ * Note that this is very much x86 specific. I don't know if this
+ * affects other archs at all.
+ */
+ if (num_gather > 1) {
+ /*
+ * We always want some float type here (with x86)
+ * due to shuffles being float ones afterwards (albeit for
+ * the num_gather == 4 case int should work fine too
+ * (unless there's some problems with avx but not avx2).
+ */
+ if (format_desc->channel[0].size == 64) {
+ fetch_type = lp_type_float_vec(64, gather_type.width);
+ } else {
+ fetch_type = lp_type_float_vec(32, gather_type.width);
+ }
+ }
+ else {
+ /* type doesn't matter much */
+ if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+ (format_desc->channel[0].size == 32 ||
+ format_desc->channel[0].size == 64)) {
+ fetch_type = lp_type_float(gather_type.width);
+ } else {
+ fetch_type = lp_type_uint(gather_type.width);
+ }
+ }
+
+ /* Now finally gather the values */
+ packed[i] = lp_build_gather(gallivm, gather_type.length,
+ format_desc->block.bits,
+ fetch_type, aligned,
+ base_ptr, offsetr, FALSE);
+ if (fp64) {
+ struct lp_type conv_type = type;
+ conv_type.width *= 2;
+ packed[i] = LLVMBuildBitCast(builder, packed[i],
+ lp_build_vec_type(gallivm, conv_type), "");
+ packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
+ }
+ }
+
+ /* shuffle the gathered values to SoA */
+ if (num_gather == 2) {
+ for (i = 0; i < num_gather; i++) {
+ for (j = 0; j < type.length; j++) {
+ unsigned idx = (j%2)*2 + (j/4)*4 + i;
+ if ((j/2)%2)
+ idx += type.length;
+ shuffles[j] = lp_build_const_int32(gallivm, idx);
+ }
+ dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
+ LLVMConstVector(shuffles, type.length), "");
+ }
+ }
+ else if (num_gather == 4) {
+ lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
+ }
+ else {
+ assert(num_gather == 1);
+ dst[0] = packed[0];
+ }
+
+ /*
+ * And finally unpack exactly as above, except that
+ * chan shift is adjusted and the right vector selected.
+ */
+ if (!fp64) {
+ for (i = 0; i < num_gather; i++) {
+ dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
+ }
+ for (i = 0; i < format_desc->nr_channels; i++) {
+ struct util_format_channel_description chan_desc = format_desc->channel[i];
+ unsigned blockbits = type.width;
+ unsigned vec_nr;
+
+#if UTIL_ARCH_BIG_ENDIAN
+ vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
+#else
+ vec_nr = chan_desc.shift / type.width;
+#endif
+ chan_desc.shift %= type.width;
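+ /*
+ * E.g. for r16g16b16a16 (on little endian) the blue channel
+ * (shift 32) lives in dst[1] with an in-vector shift of 0,
+ * alpha (shift 48) in dst[1] at shift 16.
+ */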
+
+ output[i] = lp_build_extract_soa_chan(&bld,
+ blockbits,
+ FALSE,
+ chan_desc,
+ dst[vec_nr]);
+ }
+ }
+ else {
+ for (i = 0; i < format_desc->nr_channels; i++) {
+ output[i] = dst[i];
+ }
+ }
+
+ lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
+ return;
+ }
+
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+ format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
/*
* similar conceptually to above but requiring special
* AoS packed -> SoA float conversion code.
*/
LLVMValueRef packed;
+ struct lp_type fetch_type = lp_type_uint(type.width);
assert(type.floating);
assert(type.width == 32);
packed = lp_build_gather(gallivm, type.length,
format_desc->block.bits,
- type.width, base_ptr, offset,
- FALSE);
- if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ fetch_type, aligned,
+ base_ptr, offset, FALSE);
+ if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
}
else {
* 32bit (or 8bit) from each block.
*/
LLVMValueRef packed;
+ struct lp_type fetch_type = lp_type_uint(type.width);
- if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
+ if (format == PIPE_FORMAT_X32_S8X24_UINT) {
/*
* for stencil simply fix up offsets - could in fact change
* base_ptr instead even outside the shader.
unsigned mask = (1 << 8) - 1;
LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
offset = LLVMBuildAdd(builder, offset, s_offset, "");
- packed = lp_build_gather(gallivm, type.length,
- 32, type.width, base_ptr, offset, FALSE);
+ packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
+ aligned, base_ptr, offset, FALSE);
packed = LLVMBuildAnd(builder, packed,
lp_build_const_int_vec(gallivm, type, mask), "");
}
else {
- assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
- packed = lp_build_gather(gallivm, type.length,
- 32, type.width, base_ptr, offset, TRUE);
+ assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
+ packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
+ aligned, base_ptr, offset, TRUE);
packed = LLVMBuildBitCast(builder, packed,
lp_build_vec_type(gallivm, type), "");
}
/*
* Try calling lp_build_fetch_rgba_aos for all pixels.
+ * By now this should only really be hit for subsampled and compressed
+ * formats (for s3tc including srgb, for rgtc only the unorm variants).
+ * (This is invalid for plain 8unorm formats because we're lazy with
+ * the swizzle, since some results would arrive swizzled and some not.)
*/
- if (util_format_fits_8unorm(format_desc) &&
+ if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
+ (util_format_fits_8unorm(format_desc) ||
+ format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
type.floating && type.width == 32 &&
(type.length == 1 || (type.length % 4 == 0))) {
struct lp_type tmp_type;
- LLVMValueRef tmp;
+ struct lp_build_context bld;
+ LLVMValueRef packed, rgba[4];
+ const struct util_format_description *flinear_desc;
+ const struct util_format_description *frgba8_desc;
+ unsigned chan;
+
+ lp_build_context_init(&bld, gallivm, type);
+ /*
+ * Make sure the conversion in aos really only does convert to rgba8
+ * and not anything more (so use linear format, adjust type).
+ */
+ flinear_desc = util_format_description(util_format_linear(format));
memset(&tmp_type, 0, sizeof tmp_type);
tmp_type.width = 8;
tmp_type.length = type.length * 4;
tmp_type.norm = TRUE;
- tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
- base_ptr, offset, i, j);
+ packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
+ aligned, base_ptr, offset, i, j, cache);
+ packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
- lp_build_rgba8_to_fi32_soa(gallivm,
- type,
- tmp,
- rgba_out);
+ /*
+ * The values are now packed so they match the ordinary (srgb) RGBA8
+ * format, hence we need to use a matching format for the unpack.
+ */
+ frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+ assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
+ frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
+ }
+ lp_build_unpack_rgba_soa(gallivm,
+ frgba8_desc,
+ type,
+ packed, rgba);
+ /*
+ * We converted 4 channels. Make sure llvm can drop unneeded ones
+ * (luckily the rgba order is fixed, only LA needs a special case).
+ */
+ for (chan = 0; chan < 4; chan++) {
+ enum pipe_swizzle swizzle = format_desc->swizzle[chan];
+ if (chan == 3 && util_format_is_luminance_alpha(format)) {
+ swizzle = PIPE_SWIZZLE_W;
+ }
+ rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
+ }
return;
}
+
/*
* Fallback to calling lp_build_fetch_rgba_aos for each pixel.
*
* miss some opportunities to do vectorization, but this is
* convenient for formats or scenarios for which there was no
* opportunity or incentive to optimize.
+ *
+ * We do NOT want to end up here: this is typically quite terrible,
+ * in particular if the formats have fewer than 4 channels.
+ *
+ * Right now, this should only be hit for:
+ * - RGTC snorm formats
+ * (those lack fast fetch functions, hence they are terrible anyway)
*/
{
- unsigned k, chan;
+ unsigned k;
struct lp_type tmp_type;
+ LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
if (gallivm_debug & GALLIVM_DEBUG_PERF) {
- debug_printf("%s: scalar unpacking of %s\n",
+ debug_printf("%s: AoS fetch fallback for %s\n",
__FUNCTION__, format_desc->short_name);
}
tmp_type = type;
tmp_type.length = 4;
- for (chan = 0; chan < 4; ++chan) {
- rgba_out[chan] = lp_build_undef(gallivm, type);
- }
+ /*
+ * Note that vector transpose can be worse than insert/extract
+ * for aos->soa conversion (for formats with 1 or 2 channels). However,
+ * we should try to avoid getting here for just about all formats, so
+ * don't bother.
+ */
/* loop over number of pixels */
for(k = 0; k < type.length; ++k) {
LLVMValueRef index = lp_build_const_int32(gallivm, k);
LLVMValueRef offset_elem;
LLVMValueRef i_elem, j_elem;
- LLVMValueRef tmp;
offset_elem = LLVMBuildExtractElement(builder, offset,
index, "");
j_elem = LLVMBuildExtractElement(builder, j, index, "");
/* Get a single float[4]={R,G,B,A} pixel */
- tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
- base_ptr, offset_elem,
- i_elem, j_elem);
+ aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
+ aligned, base_ptr, offset_elem,
+ i_elem, j_elem, cache);
- /*
- * Insert the AoS tmp value channels into the SoA result vectors at
- * position = 'index'.
- */
- for (chan = 0; chan < 4; ++chan) {
- LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan),
- tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
- rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
- tmp_chan, index, "");
- }
}
+ convert_to_soa(gallivm, aos_fetch, rgba_out, type);
+ }
+}
+
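+/*
+ * Mirror of lp_build_extract_soa_chan: convert one SoA channel vector
+ * according to chan_desc and or it into *output at the channel's bit
+ * position. *output may still be NULL for the first channel of a word.
+ */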
+static void
+lp_build_insert_soa_chan(struct lp_build_context *bld,
+ unsigned blockbits,
+ struct util_format_channel_description chan_desc,
+ LLVMValueRef *output,
+ LLVMValueRef rgba)
+{
+ struct gallivm_state *gallivm = bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_type type = bld->type;
+ const unsigned width = chan_desc.size;
+ const unsigned start = chan_desc.shift;
+ const unsigned stop = start + width;
+ LLVMValueRef chan;
+ switch(chan_desc.type) {
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+
+ if (chan_desc.pure_integer)
+ chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
+ else if (type.floating) {
+ if (chan_desc.normalized)
+ chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);
+ else
+ chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
+ }
+ if (start)
+ chan = LLVMBuildShl(builder, chan,
+ lp_build_const_int_vec(gallivm, type, start), "");
+ if (!*output)
+ *output = chan;
+ else
+ *output = LLVMBuildOr(builder, *output, chan, "");
+ break;
+ case UTIL_FORMAT_TYPE_SIGNED:
+ if (chan_desc.pure_integer)
+ chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
+ else if (type.floating) {
+ uint32_t mask_val = (uint32_t)((1ULL << chan_desc.size) - 1);
+ if (chan_desc.normalized) {
+ char intrin[32];
+ double scale = ((1ULL << (chan_desc.size - 1)) - 1);
+ LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
+ rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);
+ rgba = LLVMBuildFMul(builder, rgba, scale_val, "");
+ lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);
+ rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);
+ }
+ chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
+ chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, mask_val), "");
+ }
+ if (start)
+ chan = LLVMBuildShl(builder, chan,
+ lp_build_const_int_vec(gallivm, type, start), "");
+ if (!*output)
+ *output = chan;
+ else
+ *output = LLVMBuildOr(builder, *output, chan, "");
+ break;
+ case UTIL_FORMAT_TYPE_FLOAT:
+ if (type.floating) {
+ if (chan_desc.size == 16) {
+ chan = lp_build_float_to_half(gallivm, rgba);
+ chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");
+ if (start)
+ chan = LLVMBuildShl(builder, chan,
+ lp_build_const_int_vec(gallivm, type, start), "");
+ if (!*output)
+ *output = chan;
+ else
+ *output = LLVMBuildOr(builder, *output, chan, "");
+ } else {
+ assert(start == 0);
+ assert(stop == 32);
+ assert(type.width == 32);
+ *output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
+ }
+ } else
+ assert(0);
+ break;
+ default:
+ assert(0);
+ *output = bld->undef;
+ }
+}
+
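+/*
+ * Pack SoA rgba channels into a single vector of packed pixels, for
+ * plain formats whose block fits into one element of type.
+ */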
+static void
+lp_build_pack_rgba_soa(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ const LLVMValueRef rgba_in[4],
+ LLVMValueRef *packed)
+{
+ unsigned chan;
+ struct lp_build_context bld;
+ assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
+ assert(format_desc->block.width == 1);
+ assert(format_desc->block.height == 1);
+ assert(format_desc->block.bits <= type.width);
+ /* FIXME: Support more output types */
+ assert(type.width == 32);
+
+ lp_build_context_init(&bld, gallivm, type);
+ for (chan = 0; chan < format_desc->nr_channels; ++chan) {
+ struct util_format_channel_description chan_desc = format_desc->channel[chan];
+
+ lp_build_insert_soa_chan(&bld, format_desc->block.bits,
+ chan_desc,
+ packed,
+ rgba_in[chan]);
+ }
+}
+
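+/*
+ * Store SoA rgba values to memory, honoring a per-lane execution mask
+ * and skipping out-of-bounds lanes. Handles plain formats whose packed
+ * pixel fits into at most 4 words of type.width, plus
+ * PIPE_FORMAT_R11G11B10_FLOAT; anything else asserts.
+ */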
+void
+lp_build_store_rgba_soa(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type type,
+ LLVMValueRef exec_mask,
+ LLVMValueRef base_ptr,
+ LLVMValueRef offset,
+ LLVMValueRef out_of_bounds,
+ const LLVMValueRef rgba_in[4])
+{
+ enum pipe_format format = format_desc->format;
+ LLVMValueRef packed[4];
+ unsigned num_stores;
+
+ memset(packed, 0, sizeof(LLVMValueRef) * 4);
+ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+ format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+ format_desc->block.width == 1 &&
+ format_desc->block.height == 1 &&
+ format_desc->block.bits <= type.width &&
+ (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
+ format_desc->channel[0].size == 32 ||
+ format_desc->channel[0].size == 16))
+ {
+ lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);
+
+ num_stores = 1;
+ } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+ (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
+ format_desc->block.width == 1 &&
+ format_desc->block.height == 1 &&
+ format_desc->block.bits > type.width &&
+ ((format_desc->block.bits <= type.width * type.length &&
+ format_desc->channel[0].size <= type.width) ||
+ (format_desc->channel[0].size == 64 &&
+ format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
+ type.floating)))
+ {
+ /*
+ * Similar to above, but the packed pixel is larger than what fits
+ * into an element of the destination format. The channels are packed
+ * into as many type.width-sized words per pixel as needed, and the
+ * stores below then happen per word and per lane.
+ * Good for 16xn (n > 2) and 32xn (n > 1) formats.
+ */
+ struct lp_build_context bld;
+
+ lp_build_context_init(&bld, gallivm, type);
+ assert(type.width == 32);
+ assert(format_desc->block.bits > type.width);
+
+ unsigned store_width = util_next_power_of_two(format_desc->block.bits);
+ num_stores = store_width / type.width;
+ for (unsigned i = 0; i < format_desc->nr_channels; i++) {
+ struct util_format_channel_description chan_desc = format_desc->channel[i];
+ unsigned blockbits = type.width;
+ unsigned vec_nr;
+
+ vec_nr = chan_desc.shift / type.width;
+ chan_desc.shift %= type.width;
+
+ lp_build_insert_soa_chan(&bld, blockbits,
+ chan_desc,
+ &packed[vec_nr],
+ rgba_in[i]);
+ }
+
+ assert(num_stores == 4 || num_stores == 2);
+ /*
+ * No separate transpose step is needed: the per-lane stores below
+ * transpose and store at the same time.
+ */
+ } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+ packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);
+ num_stores = 1;
+ } else
+ assert(0);
+
+ assert(exec_mask);
+
+ LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
+ LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);
+ LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
+
+ LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
+ should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
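+ /*
+ * Scalarize the store: loop over the lanes and store each lane's
+ * packed word(s) only if its exec bit is set and it is in bounds.
+ */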
+ for (unsigned i = 0; i < num_stores; i++) {
+ struct lp_build_loop_state loop_state;
+
+ LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");
+ store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");
+
+ lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
+
+ struct lp_build_if_state ifthen;
+ LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");
+ lp_build_if(&ifthen, gallivm, cond);
+
+ LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");
+ LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");
+
+ if (format_desc->block.bits == 8) {
+ this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");
+ data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");
+ } else if (format_desc->block.bits == 16) {
+ this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");
+ data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");
+ } else
+ this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");
+ LLVMBuildStore(gallivm->builder, data, this_offset);
+ lp_build_endif(&ifthen);
+ lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
+ NULL, LLVMIntUGE);
}
}