X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Fgallivm%2Flp_bld_format_soa.c;h=c5962a666998d48d62e70337b5f2b99d67212665;hb=7730d583c207002e14ca2e95d30cab181db20082;hp=2b66162eb40fdb0fd29a15bd0b56b0b9b3324eb3;hpb=8f3bdeaad610d7d5a5c6e73e1e9c721219595754;p=mesa.git diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 2b66162eb40..c5962a66699 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -26,64 +26,256 @@ **************************************************************************/ -#include "util/u_format.h" +#include "pipe/p_defines.h" + +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_string.h" +#include "util/u_math.h" #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_conv.h" -#include "lp_bld_sample.h" /* for lp_build_gather */ -#include "lp_bld_init.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_gather.h" +#include "lp_bld_debug.h" #include "lp_bld_format.h" +#include "lp_bld_arit.h" +#include "lp_bld_pack.h" +#include "lp_bld_flow.h" +#include "lp_bld_printf.h" +#include "lp_bld_intr.h" - -static LLVMValueRef -lp_build_format_swizzle_chan_soa(struct lp_type type, - const LLVMValueRef *unswizzled, - enum util_format_swizzle swizzle) +static void +convert_to_soa(struct gallivm_state *gallivm, + LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32], + LLVMValueRef dst_soa[4], + const struct lp_type soa_type) { - switch (swizzle) { - case UTIL_FORMAT_SWIZZLE_X: - case UTIL_FORMAT_SWIZZLE_Y: - case UTIL_FORMAT_SWIZZLE_Z: - case UTIL_FORMAT_SWIZZLE_W: - return unswizzled[swizzle]; - case UTIL_FORMAT_SWIZZLE_0: - return lp_build_zero(type); - case UTIL_FORMAT_SWIZZLE_1: - return lp_build_one(type); - case UTIL_FORMAT_SWIZZLE_NONE: - return lp_build_undef(type); - default: - assert(0); - return lp_build_undef(type); + unsigned j, k; + struct lp_type aos_channel_type = soa_type; + + LLVMValueRef aos_channels[4]; + unsigned pixels_per_channel = soa_type.length / 4; + + debug_assert((soa_type.length % 4) == 0); + + aos_channel_type.length >>= 1; + + for (j = 0; j < 4; ++j) { + LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 }; + + assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH); + + for (k = 0; k < pixels_per_channel; ++k) { + channel[k] = src_aos[j + 4 * k]; + } + + aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel); } + + lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa); } void lp_build_format_swizzle_soa(const struct util_format_description *format_desc, - struct lp_type type, + struct lp_build_context *bld, const LLVMValueRef *unswizzled, - LLVMValueRef *swizzled) + LLVMValueRef swizzled_out[4]) { - if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - enum util_format_swizzle swizzle = format_desc->swizzle[0]; - LLVMValueRef depth = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle); - swizzled[2] = swizzled[1] = swizzled[0] = depth; - swizzled[3] = lp_build_one(type); + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + enum pipe_swizzle swizzle; + LLVMValueRef depth_or_stencil; + + if (util_format_has_stencil(format_desc) && + !util_format_has_depth(format_desc)) { + assert(!bld->type.floating); + swizzle = format_desc->swizzle[1]; + } + else { + assert(bld->type.floating); + swizzle = format_desc->swizzle[0]; + } + /* + * Return zzz1 or sss1 for depth-stencil formats here. 
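+       * E.g. for PIPE_FORMAT_Z24_UNORM_S8_UINT sampled as depth this picks
+       * the Z channel and replicates it into X/Y/Z, with W forced to one.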
+ * Correct swizzling will be handled by apply_sampler_swizzle() later. + */ + depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle); + + swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil; + swizzled_out[3] = bld->one; } else { unsigned chan; for (chan = 0; chan < 4; ++chan) { - enum util_format_swizzle swizzle = format_desc->swizzle[chan]; - swizzled[chan] = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle); + enum pipe_swizzle swizzle = format_desc->swizzle[chan]; + swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle); } } } + +static LLVMValueRef +lp_build_extract_soa_chan(struct lp_build_context *bld, + unsigned blockbits, + boolean srgb_chan, + struct util_format_channel_description chan_desc, + LLVMValueRef packed) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = bld->type; + LLVMValueRef input = packed; + const unsigned width = chan_desc.size; + const unsigned start = chan_desc.shift; + const unsigned stop = start + width; + + /* Decode the input vector component */ + + switch(chan_desc.type) { + case UTIL_FORMAT_TYPE_VOID: + input = bld->undef; + break; + + case UTIL_FORMAT_TYPE_UNSIGNED: + /* + * Align the LSB + */ + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + + /* + * Zero the MSBs + */ + if (stop < blockbits) { + unsigned mask = ((unsigned long long)1 << width) - 1; + input = LLVMBuildAnd(builder, input, + lp_build_const_int_vec(gallivm, type, mask), ""); + } + + /* + * Type conversion + */ + if (type.floating) { + if (srgb_chan) { + struct lp_type conv_type = lp_uint_type(type); + input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); + } + else { + if(chan_desc.normalized) + input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); + else + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + break; + + case UTIL_FORMAT_TYPE_SIGNED: + /* + * Align the sign bit first. + */ + if (stop < type.width) { + unsigned bits = type.width - stop; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildShl(builder, input, bits_val, ""); + } + + /* + * Align the LSB (with an arithmetic shift to preserve the sign) + */ + if (chan_desc.size < type.width) { + unsigned bits = type.width - chan_desc.size; + LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); + input = LLVMBuildAShr(builder, input, bits_val, ""); + } + + /* + * Type conversion + */ + if (type.floating) { + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + if (chan_desc.normalized) { + double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildFMul(builder, input, scale_val, ""); + /* + * The formula above will produce value below -1.0 for most negative + * value but everything seems happy with that hence disable for now. 
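+             * (E.g. an 8-bit snorm channel uses scale = 1/127, so the most
+             * negative value -128 maps to -128/127 ~= -1.008 rather than -1.0.)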
+ */ + if (0) + input = lp_build_max(bld, input, + lp_build_const_vec(gallivm, type, -1.0f)); + } + } + else if (chan_desc.pure_integer) { + /* Nothing to do */ + } else { + /* FIXME */ + assert(0); + } + break; + + case UTIL_FORMAT_TYPE_FLOAT: + if (type.floating) { + if (chan_desc.size == 16) { + struct lp_type f16i_type = type; + f16i_type.width /= 2; + f16i_type.floating = 0; + if (start) { + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, type, start), ""); + } + input = LLVMBuildTrunc(builder, input, + lp_build_vec_type(gallivm, f16i_type), ""); + input = lp_build_half_to_float(gallivm, input); + } else { + assert(start == 0); + assert(stop == 32); + assert(type.width == 32); + } + input = LLVMBuildBitCast(builder, input, bld->vec_type, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + case UTIL_FORMAT_TYPE_FIXED: + if (type.floating) { + double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); + } + else { + /* FIXME */ + assert(0); + input = bld->undef; + } + break; + + default: + assert(0); + input = bld->undef; + break; + } + + return input; +} + + /** * Unpack several pixels in SoA. * @@ -101,16 +293,21 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, * It requires that a packed pixel fits into an element of the output * channels. The common case is when converting pixel with a depth of 32 bit or * less into floats. + * + * \param format_desc the format of the 'packed' incoming pixel vector + * \param type the desired type for rgba_out (type.length = n, above) + * \param packed the incoming vector of packed pixels + * \param rgba_out returns the SoA R,G,B,A vectors */ void -lp_build_unpack_rgba_soa(LLVMBuilderRef builder, +lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, LLVMValueRef packed, - LLVMValueRef *rgba) + LLVMValueRef rgba_out[4]) { + struct lp_build_context bld; LLVMValueRef inputs[4]; - unsigned start; unsigned chan; assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); @@ -118,171 +315,132 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, assert(format_desc->block.height == 1); assert(format_desc->block.bits <= type.width); /* FIXME: Support more output types */ - assert(type.floating); assert(type.width == 32); + lp_build_context_init(&bld, gallivm, type); + /* Decode the input vector components */ - start = 0; for (chan = 0; chan < format_desc->nr_channels; ++chan) { - unsigned width = format_desc->channel[chan].size; - unsigned stop = start + width; - LLVMValueRef input; - - input = packed; - - switch(format_desc->channel[chan].type) { - case UTIL_FORMAT_TYPE_VOID: - input = lp_build_undef(type); - break; - - case UTIL_FORMAT_TYPE_UNSIGNED: - /* - * Align the LSB - */ - - if (start) { - input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(type, start), ""); - } - - /* - * Zero the MSBs - */ - - if (stop < format_desc->block.bits) { - unsigned mask = ((unsigned long long)1 << width) - 1; - input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(type, mask), ""); - } + struct util_format_channel_description chan_desc = format_desc->channel[chan]; + boolean srgb_chan = FALSE; - /* - * Type conversion - */ - - if (type.floating) { - if(format_desc->channel[chan].normalized) - input = 
lp_build_unsigned_norm_to_float(builder, width, type, input); - else - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(type); - } + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && + format_desc->swizzle[3] != chan) { + srgb_chan = TRUE; + } - break; + inputs[chan] = lp_build_extract_soa_chan(&bld, + format_desc->block.bits, + srgb_chan, + chan_desc, + packed); + } - case UTIL_FORMAT_TYPE_SIGNED: - /* - * Align the sign bit first. - */ + lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out); +} - if (stop < type.width) { - unsigned bits = type.width - stop; - LLVMValueRef bits_val = lp_build_const_int_vec(type, bits); - input = LLVMBuildShl(builder, input, bits_val, ""); - } - /* - * Align the LSB (with an arithmetic shift to preserve the sign) - */ - - if (format_desc->channel[chan].size < type.width) { - unsigned bits = type.width - format_desc->channel[chan].size; - LLVMValueRef bits_val = lp_build_const_int_vec(type, bits); - input = LLVMBuildAShr(builder, input, bits_val, ""); - } +/** + * Convert a vector of rgba8 values into 32bit wide SoA vectors. + * + * \param dst_type The desired return type. For pure integer formats + * this should be a 32bit wide int or uint vector type, + * otherwise a float vector type. + * + * \param packed The rgba8 values to pack. + * + * \param rgba The 4 SoA return vectors. + */ +void +lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm, + struct lp_type dst_type, + LLVMValueRef packed, + LLVMValueRef *rgba) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff); + unsigned chan; - /* - * Type conversion - */ + /* XXX technically shouldn't use that for uint dst_type */ + packed = LLVMBuildBitCast(builder, packed, + lp_build_int_vec_type(gallivm, dst_type), ""); - if (type.floating) { - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), ""); - if (format_desc->channel[chan].normalized) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(type, scale); - input = LLVMBuildMul(builder, input, scale_val, ""); - } - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(type); - } + /* Decode the input vector components */ + for (chan = 0; chan < 4; ++chan) { +#if UTIL_ARCH_LITTLE_ENDIAN + unsigned start = chan*8; +#else + unsigned start = (3-chan)*8; +#endif + unsigned stop = start + 8; + LLVMValueRef input; - break; + input = packed; - case UTIL_FORMAT_TYPE_FLOAT: - if (type.floating) { - assert(start == 0); - assert(stop == 32); - assert(type.width == 32); - input = LLVMBuildBitCast(builder, input, lp_build_vec_type(type), ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(type); - } - break; - - case UTIL_FORMAT_TYPE_FIXED: - if (type.floating) { - double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1); - LLVMValueRef scale_val = lp_build_const_vec(type, scale); - input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), ""); - input = LLVMBuildMul(builder, input, scale_val, ""); - } - else { - /* FIXME */ - assert(0); - input = lp_build_undef(type); - } - break; + if (start) + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(gallivm, dst_type, start), ""); - default: - assert(0); - input = lp_build_undef(type); - break; - } + if (stop < 32) + input = LLVMBuildAnd(builder, input, mask, ""); - inputs[chan] = input; + if 
(dst_type.floating) + input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input); - start = stop; + rgba[chan] = input; } - - lp_build_format_swizzle_soa(format_desc, type, inputs, rgba); } + /** - * Fetch a pixel into a SoA. + * Fetch a texels from a texture, returning them in SoA layout. + * + * \param type the desired return type for 'rgba'. The vector length + * is the number of texels to fetch + * \param aligned if the offset is guaranteed to be aligned to element width * - * i and j are the sub-block pixel coordinates. + * \param base_ptr points to the base of the texture mip tree. + * \param offset offset to start of the texture image block. For non- + * compressed formats, this simply is an offset to the texel. + * For compressed formats, it is an offset to the start of the + * compressed data block. + * + * \param i, j the sub-block pixel coordinates. For non-compressed formats + * these will always be (0,0). For compressed formats, i will + * be in [0, block_width-1] and j will be in [0, block_height-1]. + * \param cache optional value pointing to a lp_build_format_cache structure */ void -lp_build_fetch_rgba_soa(LLVMBuilderRef builder, +lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, + boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j, - LLVMValueRef *rgba) + LLVMValueRef cache, + LLVMValueRef rgba_out[4]) { + LLVMBuilderRef builder = gallivm->builder; + enum pipe_format format = format_desc->format; + struct lp_type fetch_type; if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && format_desc->block.bits <= type.width && (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || - format_desc->channel[0].size == 32)) + format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 16)) { /* * The packed pixel fits into an element of the destination format. Put - * the packed pixels into a vector and estract each component for all + * the packed pixels into a vector and extract each component for all * vector elements in parallel. 
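+       * E.g. for PIPE_FORMAT_B8G8R8A8_UNORM with 4-wide 32-bit lanes, four
+       * whole texels are gathered into a single <4 x i32> vector and each
+       * 8-bit channel is then shifted/masked out of all lanes at once.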
*/ @@ -290,107 +448,643 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, /* * gather the texels from the texture + * Ex: packed = {XYZW, XYZW, XYZW, XYZW} */ - packed = lp_build_gather(builder, + assert(format_desc->block.bits <= type.width); + fetch_type = lp_type_uint(type.width); + packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, - type.width, - base_ptr, offset); + fetch_type, + aligned, + base_ptr, offset, FALSE); /* * convert texels to float rgba */ - lp_build_unpack_rgba_soa(builder, + lp_build_unpack_rgba_soa(gallivm, format_desc, type, - packed, rgba); + packed, rgba_out); + return; } - else { + + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits > type.width && + ((format_desc->block.bits <= type.width * type.length && + format_desc->channel[0].size <= type.width) || + (format_desc->channel[0].size == 64 && + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + type.floating))) + { + /* + * Similar to above, but the packed pixel is larger than what fits + * into an element of the destination format. The packed pixels will be + * shuffled into SoA vectors appropriately, and then the extraction will + * be done in parallel as much as possible. + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so + * the gathered vectors can be shuffled easily (even with avx). + * 64xn float -> 32xn float is handled too but it's a bit special as + * it does the conversion pre-shuffle. + */ + + LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32]; + struct lp_type fetch_type, gather_type = type; + unsigned num_gather, fetch_width, i, j; + struct lp_build_context bld; + boolean fp64 = format_desc->channel[0].size == 64; + + lp_build_context_init(&bld, gallivm, type); + + assert(type.width == 32); + assert(format_desc->block.bits > type.width); + /* - * Fallback to calling util_format_description::fetch_rgba_float for each - * pixel. - * - * This is definitely not the most efficient way of fetching pixels, as - * we miss the opportunity to do vectorization, but this it is a - * convenient for formats or scenarios for which there was no opportunity - * or incentive to optimize. + * First, figure out fetch order. */ + fetch_width = util_next_power_of_two(format_desc->block.bits); + /* + * fp64 are treated like fp32 except we fetch twice wide values + * (as we shuffle after trunc). The shuffles for that work out + * mostly fine (slightly suboptimal for 4-wide, perfect for AVX) + * albeit we miss the potential opportunity for hw gather (as it + * only handles native size). 
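+       * E.g. PIPE_FORMAT_R16G16B16A16_UNORM has block.bits == 64, so with
+       * 32-bit lanes fetch_width is 64 and num_gather is 2: each gather
+       * loads whole 64-bit texels for half of the lanes, and the shuffles
+       * below redistribute them into one SoA vector per 32-bit dword.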
+ */ + num_gather = fetch_width / type.width; + gather_type.width *= num_gather; + if (fp64) { + num_gather /= 2; + } + gather_type.length /= num_gather; + + for (i = 0; i < num_gather; i++) { + LLVMValueRef offsetr, shuf_vec; + if(num_gather == 4) { + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i + 4*j; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); - LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); - char name[256]; - LLVMValueRef function; - LLVMValueRef tmp; - unsigned k, chan; + } + else if (num_gather == 2) { + assert(num_gather == 2); + for (j = 0; j < gather_type.length; j++) { + unsigned idx = i*2 + (j%2) + (j/2)*4; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + shuf_vec = LLVMConstVector(shuffles, gather_type.length); + offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); + } + else { + assert(num_gather == 1); + offsetr = offset; + } + if (gather_type.length == 1) { + LLVMValueRef zero = lp_build_const_int32(gallivm, 0); + offsetr = LLVMBuildExtractElement(builder, offsetr, zero, ""); + } + + /* + * Determine whether to use float or int loads. This is mostly + * to outsmart the (stupid) llvm int/float shuffle logic, we + * don't really care much if the data is floats or ints... + * But llvm will refuse to use single float shuffle with int data + * and instead use 3 int shuffles instead, the code looks atrocious. + * (Note bitcasts often won't help, as llvm is too smart to be + * fooled by that.) + * Nobody cares about simd float<->int domain transition penalties, + * which usually don't even exist for shuffles anyway. + * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is + * going into transpose, which is unpacks, so doesn't really matter + * much). + * With 2x32bit or 4x16bit fetch, we use float vec, since those + * go into the weird channel separation shuffle. With floats, + * this is (with 128bit vectors): + * - 2 movq, 2 movhpd, 2 shufps + * With ints it would be: + * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw + * I've seen texture functions increase in code size by 15% just due + * to that (there's lots of such fetches in them...) + * (We could chose a different gather order to improve this somewhat + * for the int path, but it would basically just drop the blends, + * so the float path with this order really is optimal.) + * Albeit it is tricky sometimes llvm doesn't ignore the float->int + * casts so must avoid them until we're done with the float shuffle... + * 3x16bit formats (the same is also true for 3x8) are pretty bad but + * there's nothing we can do about them (we could overallocate by + * those couple bytes and use unaligned but pot sized load). + * Note that this is very much x86 specific. I don't know if this + * affect other archs at all. + */ + if (num_gather > 1) { + /* + * We always want some float type here (with x86) + * due to shuffles being float ones afterwards (albeit for + * the num_gather == 4 case int should work fine too + * (unless there's some problems with avx but not avx2). 
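+          * (Concretely: for the fp64 case the loads really are doubles and
+          * are fptrunc'd right after gathering; for everything else the
+          * 32-bit int elements are only reshuffled, never converted.)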
+ */ + if (format_desc->channel[0].size == 64) { + fetch_type = lp_type_float_vec(64, gather_type.width); + } else { + fetch_type = lp_type_int_vec(32, gather_type.width); + } + } + else { + /* type doesn't matter much */ + if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + (format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 64)) { + fetch_type = lp_type_float(gather_type.width); + } else { + fetch_type = lp_type_uint(gather_type.width); + } + } + + /* Now finally gather the values */ + packed[i] = lp_build_gather(gallivm, gather_type.length, + format_desc->block.bits, + fetch_type, aligned, + base_ptr, offsetr, FALSE); + if (fp64) { + struct lp_type conv_type = type; + conv_type.width *= 2; + packed[i] = LLVMBuildBitCast(builder, packed[i], + lp_build_vec_type(gallivm, conv_type), ""); + packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, ""); + } + } + + /* shuffle the gathered values to SoA */ + if (num_gather == 2) { + for (i = 0; i < num_gather; i++) { + for (j = 0; j < type.length; j++) { + unsigned idx = (j%2)*2 + (j/4)*4 + i; + if ((j/2)%2) + idx += type.length; + shuffles[j] = lp_build_const_int32(gallivm, idx); + } + dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1], + LLVMConstVector(shuffles, type.length), ""); + } + } + else if (num_gather == 4) { + lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst); + } + else { + assert(num_gather == 1); + dst[0] = packed[0]; + } + + /* + * And finally unpack exactly as above, except that + * chan shift is adjusted and the right vector selected. + */ + if (!fp64) { + for (i = 0; i < num_gather; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, ""); + } + for (i = 0; i < format_desc->nr_channels; i++) { + struct util_format_channel_description chan_desc = format_desc->channel[i]; + unsigned blockbits = type.width; + unsigned vec_nr; + +#if UTIL_ARCH_BIG_ENDIAN + vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width; +#else + vec_nr = chan_desc.shift / type.width; +#endif + chan_desc.shift %= type.width; + + output[i] = lp_build_extract_soa_chan(&bld, + blockbits, + FALSE, + chan_desc, + dst[vec_nr]); + } + } + else { + for (i = 0; i < format_desc->nr_channels; i++) { + output[i] = dst[i]; + } + } + + lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out); + return; + } + + if (format == PIPE_FORMAT_R11G11B10_FLOAT || + format == PIPE_FORMAT_R9G9B9E5_FLOAT) { + /* + * similar conceptually to above but requiring special + * AoS packed -> SoA float conversion code. + */ + LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); assert(type.floating); + assert(type.width == 32); - util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float", format_desc->short_name); + packed = lp_build_gather(gallivm, type.length, + format_desc->block.bits, + fetch_type, aligned, + base_ptr, offset, FALSE); + if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); + } + else { + lp_build_rgb9e5_to_float(gallivm, packed, rgba_out); + } + return; + } + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS && + format_desc->block.bits == 64) { /* - * Declare and bind format_desc->fetch_rgba_float(). + * special case the format is 64 bits but we only require + * 32bit (or 8bit) from each block. 
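+       * I.e. PIPE_FORMAT_Z32_FLOAT_S8X24_UINT and its X32_S8X24_UINT
+       * stencil-only view: depth reads the dword at the start of each
+       * 8-byte block, stencil reads byte 4 (hence the offset + 4 and
+       * 0xff mask below).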
*/ + LLVMValueRef packed; + struct lp_type fetch_type = lp_type_uint(type.width); + + if (format == PIPE_FORMAT_X32_S8X24_UINT) { + /* + * for stencil simply fix up offsets - could in fact change + * base_ptr instead even outside the shader. + */ + unsigned mask = (1 << 8) - 1; + LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4); + offset = LLVMBuildAdd(builder, offset, s_offset, ""); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, + aligned, base_ptr, offset, FALSE); + packed = LLVMBuildAnd(builder, packed, + lp_build_const_int_vec(gallivm, type, mask), ""); + } + else { + assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + packed = lp_build_gather(gallivm, type.length, 32, fetch_type, + aligned, base_ptr, offset, TRUE); + packed = LLVMBuildBitCast(builder, packed, + lp_build_vec_type(gallivm, type), ""); + } + /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */ + rgba_out[0] = rgba_out[1] = rgba_out[2] = packed; + rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f); + return; + } + + /* + * Try calling lp_build_fetch_rgba_aos for all pixels. + * Should only really hit subsampled, compressed + * (for s3tc srgb too, for rgtc the unorm ones only) by now. + * (This is invalid for plain 8unorm formats because we're lazy with + * the swizzle since some results would arrive swizzled, some not.) + */ + + if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) && + (util_format_fits_8unorm(format_desc) || + format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) && + type.floating && type.width == 32 && + (type.length == 1 || (type.length % 4 == 0))) { + struct lp_type tmp_type; + struct lp_build_context bld; + LLVMValueRef packed, rgba[4]; + const struct util_format_description *flinear_desc; + const struct util_format_description *frgba8_desc; + unsigned chan; - function = LLVMGetNamedFunction(module, name); - if (!function) { - LLVMTypeRef ret_type; - LLVMTypeRef arg_types[4]; - LLVMTypeRef function_type; + lp_build_context_init(&bld, gallivm, type); - ret_type = LLVMVoidType(); - arg_types[0] = LLVMPointerType(LLVMFloatType(), 0); - arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0); - arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8); - function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); - function = LLVMAddFunction(module, name, function_type); + /* + * Make sure the conversion in aos really only does convert to rgba8 + * and not anything more (so use linear format, adjust type). + */ + flinear_desc = util_format_description(util_format_linear(format)); + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = type.length * 4; + tmp_type.norm = TRUE; - LLVMSetFunctionCallConv(function, LLVMCCallConv); - LLVMSetLinkage(function, LLVMExternalLinkage); + packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type, + aligned, base_ptr, offset, i, j, cache); + packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, ""); - assert(LLVMIsDeclaration(function)); + /* + * The values are now packed so they match ordinary (srgb) RGBA8 format, + * hence need to use matching format for unpack. 
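+       * E.g. an srgb DXT1 block was decompressed by the AoS path above to
+       * plain 8-bit rgba, so unpacking it as R8G8B8A8_SRGB here is what
+       * performs the srgb-to-linear conversion on the still-encoded bytes.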
+ */ + frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM); + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC); + frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); + } + lp_build_unpack_rgba_soa(gallivm, + frgba8_desc, + type, + packed, rgba); - LLVMAddGlobalMapping(lp_build_engine, function, format_desc->fetch_rgba_float); + /* + * We converted 4 channels. Make sure llvm can drop unneeded ones + * (luckily the rgba order is fixed, only LA needs special case). + */ + for (chan = 0; chan < 4; chan++) { + enum pipe_swizzle swizzle = format_desc->swizzle[chan]; + if (chan == 3 && util_format_is_luminance_alpha(format)) { + swizzle = PIPE_SWIZZLE_W; + } + rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle); } + return; + } - for (chan = 0; chan < 4; ++chan) { - rgba[chan] = lp_build_undef(type); + + /* + * Fallback to calling lp_build_fetch_rgba_aos for each pixel. + * + * This is not the most efficient way of fetching pixels, as we + * miss some opportunities to do vectorization, but this is + * convenient for formats or scenarios for which there was no + * opportunity or incentive to optimize. + * + * We do NOT want to end up here, this typically is quite terrible, + * in particular if the formats have less than 4 channels. + * + * Right now, this should only be hit for: + * - RGTC snorm formats + * (those miss fast fetch functions hence they are terrible anyway) + */ + + { + unsigned k; + struct lp_type tmp_type; + LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32]; + + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("%s: AoS fetch fallback for %s\n", + __FUNCTION__, format_desc->short_name); } - tmp = LLVMBuildArrayAlloca(builder, - LLVMFloatType(), - LLVMConstInt(LLVMInt32Type(), 4, 0), - ""); + tmp_type = type; + tmp_type.length = 4; /* - * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result - * in the SoA vectors. + * Note that vector transpose can be worse compared to insert/extract + * for aos->soa conversion (for formats with 1 or 2 channels). However, + * we should try to avoid getting here for just about all formats, so + * don't bother. 
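+       * (A type.length of 8 thus costs 8 scalar AoS fetches of one
+       * 4-channel texel each, which convert_to_soa() then transposes
+       * into the four channel vectors.)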
*/ + /* loop over number of pixels */ for(k = 0; k < type.length; ++k) { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); + LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef offset_elem; - LLVMValueRef ptr; LLVMValueRef i_elem, j_elem; - LLVMValueRef args[4]; - offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); - ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, ""); + offset_elem = LLVMBuildExtractElement(builder, offset, + index, ""); i_elem = LLVMBuildExtractElement(builder, i, index, ""); j_elem = LLVMBuildExtractElement(builder, j, index, ""); - args[0] = tmp; - args[1] = ptr; - args[2] = i_elem; - args[3] = j_elem; + /* Get a single float[4]={R,G,B,A} pixel */ + aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, + aligned, base_ptr, offset_elem, + i_elem, j_elem, cache); + + } + convert_to_soa(gallivm, aos_fetch, rgba_out, type); + } +} + +static void +lp_build_insert_soa_chan(struct lp_build_context *bld, + unsigned blockbits, + struct util_format_channel_description chan_desc, + LLVMValueRef *output, + LLVMValueRef rgba) +{ + struct gallivm_state *gallivm = bld->gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = bld->type; + const unsigned width = chan_desc.size; + const unsigned start = chan_desc.shift; + const unsigned stop = start + width; + LLVMValueRef chan; + switch(chan_desc.type) { + case UTIL_FORMAT_TYPE_UNSIGNED: - LLVMBuildCall(builder, function, args, 4, ""); + if (chan_desc.pure_integer) + chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, ""); + else if (type.floating) { + if (chan_desc.normalized) + chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba); + else + chan = LLVMBuildFPToSI(builder, rgba, bld->vec_type, ""); + } + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + break; + case UTIL_FORMAT_TYPE_SIGNED: + if (chan_desc.pure_integer) + chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, ""); + else if (type.floating) { + uint32_t mask_val = (1UL << chan_desc.size) - 1; + if (chan_desc.normalized) { + char intrin[32]; + double scale = ((1 << (chan_desc.size - 1)) - 1); + LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); + rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one); + rgba = LLVMBuildFMul(builder, rgba, scale_val, ""); + lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type); + rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba); + } + chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, ""); + chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, mask_val), ""); + } + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + break; + case UTIL_FORMAT_TYPE_FLOAT: + if (type.floating) { + if (chan_desc.size == 16) { + chan = lp_build_float_to_half(gallivm, rgba); + chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, ""); + if (start) + chan = LLVMBuildShl(builder, chan, + lp_build_const_int_vec(gallivm, type, start), ""); + if (!*output) + *output = chan; + else + *output = LLVMBuildOr(builder, *output, chan, ""); + } else { + assert(start == 0); + assert(stop == 32); + assert(type.width == 32); + *output = LLVMBuildBitCast(builder, rgba, 
bld->int_vec_type, ""); + } + } else + assert(0); + break; + default: + assert(0); + *output = bld->undef; + } +} - for (chan = 0; chan < 4; ++chan) { - LLVMValueRef chan_val = LLVMConstInt(LLVMInt32Type(), chan, 0), - tmp_chan = LLVMBuildGEP(builder, tmp, &chan_val, 1, ""); - tmp_chan = LLVMBuildLoad(builder, tmp_chan, ""); - rgba[chan] = LLVMBuildInsertElement(builder, rgba[chan], tmp_chan, index, ""); - } +static void +lp_build_pack_rgba_soa(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + const LLVMValueRef rgba_in[4], + LLVMValueRef *packed) +{ + unsigned chan; + struct lp_build_context bld; + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); + assert(format_desc->block.width == 1); + assert(format_desc->block.height == 1); + assert(format_desc->block.bits <= type.width); + /* FIXME: Support more output types */ + assert(type.width == 32); + + lp_build_context_init(&bld, gallivm, type); + for (chan = 0; chan < format_desc->nr_channels; ++chan) { + struct util_format_channel_description chan_desc = format_desc->channel[chan]; + + lp_build_insert_soa_chan(&bld, format_desc->block.bits, + chan_desc, + packed, + rgba_in[chan]); + } +} + +void +lp_build_store_rgba_soa(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + struct lp_type type, + LLVMValueRef exec_mask, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef out_of_bounds, + const LLVMValueRef rgba_in[4]) +{ + enum pipe_format format = format_desc->format; + LLVMValueRef packed[4]; + unsigned num_stores; + + memset(packed, 0, sizeof(LLVMValueRef) * 4); + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits <= type.width && + (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || + format_desc->channel[0].size == 32 || + format_desc->channel[0].size == 16)) + { + lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]); + + num_stores = 1; + } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && + format_desc->block.width == 1 && + format_desc->block.height == 1 && + format_desc->block.bits > type.width && + ((format_desc->block.bits <= type.width * type.length && + format_desc->channel[0].size <= type.width) || + (format_desc->channel[0].size == 64 && + format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && + type.floating))) + { + /* + * Similar to above, but the packed pixel is larger than what fits + * into an element of the destination format. The packed pixels will be + * shuffled into SoA vectors appropriately, and then the extraction will + * be done in parallel as much as possible. + * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so + * the gathered vectors can be shuffled easily (even with avx). + * 64xn float -> 32xn float is handled too but it's a bit special as + * it does the conversion pre-shuffle. 
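+       * (On the store side there is no gather/shuffle step, though: each
+       * channel is OR'd into packed[chan_desc.shift / 32] below and the
+       * resulting dwords are written out with num_stores stores per pixel.)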
+ */ + struct lp_build_context bld; + + lp_build_context_init(&bld, gallivm, type); + assert(type.width == 32); + assert(format_desc->block.bits > type.width); + + unsigned store_width = util_next_power_of_two(format_desc->block.bits); + num_stores = store_width / type.width; + for (unsigned i = 0; i < format_desc->nr_channels; i++) { + struct util_format_channel_description chan_desc = format_desc->channel[i]; + unsigned blockbits = type.width; + unsigned vec_nr; + + vec_nr = chan_desc.shift / type.width; + chan_desc.shift %= type.width; + + lp_build_insert_soa_chan(&bld, blockbits, + chan_desc, + &packed[vec_nr], + rgba_in[i]); } + + assert(num_stores == 4 || num_stores == 2); + /* we can transpose and store at the same time */ + } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) { + packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in); + num_stores = 1; + } else + assert(0); + + assert(exec_mask); + + LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0); + LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0); + LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + + LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask"); + should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), ""); + for (unsigned i = 0; i < num_stores; i++) { + struct lp_build_loop_state loop_state; + + LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), ""); + store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, ""); + + lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0)); + + struct lp_build_if_state ifthen; + LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, ""); + lp_build_if(&ifthen, gallivm, cond); + + LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, ""); + LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, ""); + + if (format_desc->block.bits == 8) { + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, ""); + data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), ""); + } else if (format_desc->block.bits == 16) { + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, ""); + data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), ""); + } else + this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, ""); + LLVMBuildStore(gallivm->builder, data, this_offset); + lp_build_endif(&ifthen); + lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length), + NULL, LLVMIntUGE); } }