diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index a07f7418f2c..aa99c2a7e2e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -33,250 +33,322 @@
  */
+#include "util/format/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_pointer.h"
+#include "util/u_string.h"
 #include "util/u_cpu_detect.h"
-#include "util/u_format.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_init.h"
 #include "lp_bld_type.h"
+#include "lp_bld_flow.h"
 #include "lp_bld_const.h"
+#include "lp_bld_conv.h"
 #include "lp_bld_swizzle.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_debug.h"
 #include "lp_bld_format.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_bitarit.h"
+#include "lp_bld_misc.h"
+
+
+/**
+ * Basic swizzling.  Rearrange the order of the unswizzled array elements
+ * according to the format description.  PIPE_SWIZZLE_0/ONE are supported
+ * too.
+ * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}.
+ */
+LLVMValueRef
+lp_build_format_swizzle_aos(const struct util_format_description *desc,
+                            struct lp_build_context *bld,
+                            LLVMValueRef unswizzled)
+{
+   unsigned char swizzles[4];
+   unsigned chan;
+
+   assert(bld->type.length % 4 == 0);
+
+   for (chan = 0; chan < 4; ++chan) {
+      enum pipe_swizzle swizzle;
+
+      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+         /*
+          * For ZS formats do RGBA = ZZZ1
+          */
+         if (chan == 3) {
+            swizzle = PIPE_SWIZZLE_1;
+         } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) {
+            swizzle = PIPE_SWIZZLE_0;
+         } else {
+            swizzle = desc->swizzle[0];
+         }
+      } else {
+         swizzle = desc->swizzle[chan];
+      }
+      swizzles[chan] = swizzle;
+   }
+
+   return lp_build_swizzle_aos(bld, unswizzled, swizzles);
+}
 
 /**
- * Unpack a single pixel into its RGBA components.
+ * Whether the format matches the vector type, apart from swizzles.
+ */
+static inline boolean
+format_matches_type(const struct util_format_description *desc,
+                    struct lp_type type)
+{
+   enum util_format_type chan_type;
+   unsigned chan;
+
+   assert(type.length % 4 == 0);
+
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
+       desc->block.width != 1 ||
+       desc->block.height != 1) {
+      return FALSE;
+   }
+
+   if (type.floating) {
+      chan_type = UTIL_FORMAT_TYPE_FLOAT;
+   } else if (type.fixed) {
+      chan_type = UTIL_FORMAT_TYPE_FIXED;
+   } else if (type.sign) {
+      chan_type = UTIL_FORMAT_TYPE_SIGNED;
+   } else {
+      chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
+   }
+
+   for (chan = 0; chan < desc->nr_channels; ++chan) {
+      if (desc->channel[chan].size != type.width) {
+         return FALSE;
+      }
+
+      if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
+         if (desc->channel[chan].type != chan_type ||
+             desc->channel[chan].normalized != type.norm) {
+            return FALSE;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+/*
+ * Do rounding when converting small unorm values to larger ones.
+ * Not quite 100% accurate, as it's done by appending MSBs, but
+ * should be good enough.
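+ * Ex: scaling 4 bits up to 8 replicates the nibble, so 0x3 -> 0x33 and
+ * 0xf -> 0xff; both endpoints of the unorm range are preserved exactly.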
+ */
+
+static inline LLVMValueRef
+scale_bits_up(struct gallivm_state *gallivm,
+              int src_bits,
+              int dst_bits,
+              LLVMValueRef src,
+              struct lp_type src_type)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef result = src;
+
+   if (src_bits == 1 && dst_bits > 1) {
+      /*
+       * Useful for a1 - we'd need quite some repeated copies otherwise.
+       */
+      struct lp_build_context bld;
+      LLVMValueRef dst_mask;
+      lp_build_context_init(&bld, gallivm, src_type);
+      dst_mask = lp_build_const_int_vec(gallivm, src_type,
+                                        (1 << dst_bits) - 1);
+      result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
+                            lp_build_const_int_vec(gallivm, src_type, 0));
+      result = lp_build_andnot(&bld, dst_mask, result);
+   }
+   else if (dst_bits > src_bits) {
+      /* Scale up bits */
+      int db = dst_bits - src_bits;
+
+      /* Shift left by difference in bits */
+      result = LLVMBuildShl(builder,
+                            src,
+                            lp_build_const_int_vec(gallivm, src_type, db),
+                            "");
+
+      if (db <= src_bits) {
+         /* Enough bits in src to fill the remainder */
+         LLVMValueRef lower = LLVMBuildLShr(builder,
+                                            src,
+                                            lp_build_const_int_vec(gallivm, src_type,
+                                                                   src_bits - db),
+                                            "");
+
+         result = LLVMBuildOr(builder, result, lower, "");
+      } else if (db > src_bits) {
+         /* Need to repeatedly copy src bits to fill remainder in dst */
+         unsigned n;
+
+         for (n = src_bits; n < dst_bits; n *= 2) {
+            LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
+
+            result = LLVMBuildOr(builder,
+                                 result,
+                                 LLVMBuildLShr(builder, result, shuv, ""),
+                                 "");
+         }
+      }
+   } else {
+      assert(dst_bits == src_bits);
+   }
+
+   return result;
+}
+
+
+/**
+ * Unpack a single pixel into its XYZW components.
  *
- * @param packed integer.
+ * @param desc  the pixel format for the packed pixel value
+ * @param packed  integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
  *
- * @return RGBA in a 4 floats vector.
+ * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector.
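+ *
+ * Ex: for PIPE_FORMAT_B8G8R8A8_UNORM the result keeps the format's own
+ * channel order, {B, G, R, A}; swizzling to RGBA is done separately by
+ * lp_build_format_swizzle_aos().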
  */
-LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed)
+static inline LLVMValueRef
+lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm,
+                               const struct util_format_description *desc,
+                               LLVMValueRef packed)
 {
-   LLVMTypeRef type;
+   LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef shifted, casted, scaled, masked;
    LLVMValueRef shifts[4];
    LLVMValueRef masks[4];
    LLVMValueRef scales[4];
-   LLVMValueRef swizzles[4];
-   LLVMValueRef aux[4];
-   bool normalized;
-   int empty_channel;
-   unsigned shift;
+   LLVMTypeRef vec32_type;
+
+   boolean normalized;
+   boolean needs_uitofp;
    unsigned i;
 
-   /* FIXME: Support more formats */
+   /* TODO: Support more formats */
    assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
    assert(desc->block.width == 1);
    assert(desc->block.height == 1);
    assert(desc->block.bits <= 32);
 
-   type = LLVMIntType(desc->block.bits);
-
    /* Do the intermediate integer computations with 32bit integers since it
    * matches floating point size */
-   if (desc->block.bits < 32)
-      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
+   assert(LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
+
+   vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
 
-   /* Broadcast the packed value to all four channels */
-   packed = LLVMBuildInsertElement(builder,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
-                                   packed,
-                                   LLVMConstNull(LLVMInt32Type()),
+   /* Broadcast the packed value to all four channels
+    * before: packed = BGRA
+    * after: packed = {BGRA, BGRA, BGRA, BGRA}
+    */
+   packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
+                                   LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
                                    "");
-   packed = LLVMBuildShuffleVector(builder,
-                                   packed,
-                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
-                                   LLVMConstNull(LLVMVectorType(LLVMInt32Type(), 4)),
+   packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
+                                   LLVMConstNull(vec32_type),
                                    "");
 
    /* Initialize vector constants */
    normalized = FALSE;
-   empty_channel = -1;
-   shift = 0;
+   needs_uitofp = FALSE;
+
+   /* Loop over 4 color components */
    for (i = 0; i < 4; ++i) {
       unsigned bits = desc->channel[i].size;
+      unsigned shift = desc->channel[i].shift;
 
       if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
-         shifts[i] = LLVMGetUndef(LLVMInt32Type());
-         masks[i] = LLVMConstNull(LLVMInt32Type());
-         scales[i] = LLVMConstNull(LLVMFloatType());
-         empty_channel = i;
+         shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+         masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
+         scales[i] = LLVMConstNull(LLVMFloatTypeInContext(gallivm->context));
       }
       else {
-         unsigned mask = (1 << bits) - 1;
+         unsigned long long mask = (1ULL << bits) - 1;
 
          assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
-         assert(bits < 32);
 
-         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
-         masks[i] = LLVMConstInt(LLVMInt32Type(), mask, 0);
+         if (bits == 32) {
+            needs_uitofp = TRUE;
+         }
+
+         shifts[i] = lp_build_const_int32(gallivm, shift);
+         masks[i] = lp_build_const_int32(gallivm, mask);
 
          if (desc->channel[i].normalized) {
-            scales[i] = LLVMConstReal(LLVMFloatType(), 1.0/mask);
+            scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
             normalized = TRUE;
          }
         else
-            scales[i] = LLVMConstReal(LLVMFloatType(), 1.0);
+            scales[i] = lp_build_const_float(gallivm, 1.0);
      }
-
-      shift += bits;
   }
 
-   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
-   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
-
-   /* UIToFP can't be expressed in SSE2 */
-   casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
-
-   if (normalized)
-      scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), "");
-   else
-      scaled = casted;
-
-   for (i = 0; i < 4; ++i)
-      aux[i] = LLVMGetUndef(LLVMFloatType());
-
-   for (i = 0; i < 4; ++i) {
-      enum util_format_swizzle swizzle = desc->swizzle[i];
-
-      switch (swizzle) {
-      case UTIL_FORMAT_SWIZZLE_X:
-      case UTIL_FORMAT_SWIZZLE_Y:
-      case UTIL_FORMAT_SWIZZLE_Z:
-      case UTIL_FORMAT_SWIZZLE_W:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_0:
-         assert(empty_channel >= 0);
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_1:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0);
-         aux[0] = LLVMConstReal(LLVMFloatType(), 1.0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_NONE:
-         swizzles[i] = LLVMGetUndef(LLVMFloatType());
-         assert(0);
-         break;
-      }
-   }
-
-   return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4), LLVMConstVector(swizzles, 4), "");
-}
-
-
-/**
- * Take a vector with packed pixels and unpack into a rgba8 vector.
- *
- * Formats with bit depth smaller than 32bits are accepted, but they must be
- * padded to 32bits.
- */
-LLVMValueRef
-lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
-                          const struct util_format_description *desc,
-                          struct lp_type type,
-                          LLVMValueRef packed)
-{
-   struct lp_build_context bld;
-   bool rgba8;
-   LLVMValueRef res;
-   unsigned i;
-
-   lp_build_context_init(&bld, builder, type);
-
-   /* FIXME: Support more formats */
-   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
-   assert(desc->block.width == 1);
-   assert(desc->block.height == 1);
-   assert(desc->block.bits <= 32);
-
-   assert(!type.floating);
-   assert(!type.fixed);
-   assert(type.norm);
-   assert(type.width == 8);
-   assert(type.length % 4 == 0);
-
-   rgba8 = TRUE;
-   for(i = 0; i < 4; ++i) {
-      assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
-             desc->channel[i].type == UTIL_FORMAT_TYPE_VOID);
-      if(desc->channel[0].size != 8)
-         rgba8 = FALSE;
-   }
-
-   if(rgba8) {
+   /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
+    * into masked = {X, Y, Z, W}
+    */
+   if (desc->block.bits < 32 && normalized) {
       /*
-       * The pixel is already in a rgba8 format variant. All it is necessary
-       * is to swizzle the channels.
+       * Note: we cannot do the shift below on x86 natively until AVX2.
+       *
+       * Old llvm versions will resort to scalar extract/shift insert,
+       * which is definitely terrible, new versions will just do
+       * several vector shifts and shuffle/blend results together.
+       * We could turn this into a variable left shift plus a constant
+       * right shift, and llvm would then turn the variable left shift
+       * into a mul for us (albeit without sse41 the mul needs emulation
+       * too...).  However, since we're going to do a float mul
+       * anyway, we just adjust that mul instead (plus the mask), skipping
+       * the shift completely.
+       * We could also use an extra mul when the format isn't normalized and
+       * we don't have AVX2 support, but don't bother for now.  Unfortunately,
+       * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
+       * rgba8 if it ends up here), as that would require UIToFP, albeit that
+       * would be fixable with an easy 16bit shuffle (unless there are channels
+       * crossing 16bit boundaries).
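+       * Ex: for the green channel of b5g6r5, instead of ((p >> 5) & 0x3f)
+       * and a 1/63.0 scale we use (p & 0x7e0) and a 1/2016.0 scale
+       * (0x7e0 = 2016), folding the shift into the mask and the multiplier.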
        */
-
-      unsigned char swizzles[4];
-      boolean zeros[4]; /* bitwise AND mask */
-      boolean ones[4]; /* bitwise OR mask */
-      boolean swizzles_needed = FALSE;
-      boolean zeros_needed = FALSE;
-      boolean ones_needed = FALSE;
-
-      for(i = 0; i < 4; ++i) {
-         enum util_format_swizzle swizzle = desc->swizzle[i];
-
-         /* Initialize with the no-op case */
-         swizzles[i] = util_cpu_caps.little_endian ? 3 - i : i;
-         zeros[i] = TRUE;
-         ones[i] = FALSE;
-
-         switch (swizzle) {
-         case UTIL_FORMAT_SWIZZLE_X:
-         case UTIL_FORMAT_SWIZZLE_Y:
-         case UTIL_FORMAT_SWIZZLE_Z:
-         case UTIL_FORMAT_SWIZZLE_W:
-            if(swizzle != swizzles[i]) {
-               swizzles[i] = swizzle;
-               swizzles_needed = TRUE;
-            }
-            break;
-         case UTIL_FORMAT_SWIZZLE_0:
-            zeros[i] = FALSE;
-            zeros_needed = TRUE;
-            break;
-         case UTIL_FORMAT_SWIZZLE_1:
-            ones[i] = TRUE;
-            ones_needed = TRUE;
-            break;
-         case UTIL_FORMAT_SWIZZLE_NONE:
-            assert(0);
-            break;
+      for (i = 0; i < 4; ++i) {
+         if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+            unsigned bits = desc->channel[i].size;
+            unsigned shift = desc->channel[i].shift;
+            unsigned long long mask = ((1ULL << bits) - 1) << shift;
+            scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
+            masks[i] = lp_build_const_int32(gallivm, mask);
          }
       }
+      masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
+   } else {
+      shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
+      masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+   }
 
-      res = packed;
-
-      if(swizzles_needed)
-         res = lp_build_swizzle1_aos(&bld, res, swizzles);
+   if (!needs_uitofp) {
+      /* UIToFP can't be expressed in SSE2 */
+      casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
+   } else {
+      casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
+   }
 
-      if(zeros_needed) {
-         /* Mask out zero channels */
-         LLVMValueRef mask = lp_build_const_mask_aos(type, zeros);
-         res = LLVMBuildAnd(builder, res, mask, "");
-      }
+   /*
+    * At this point 'casted' may be a vector of floats such as
+    * {255.0, 255.0, 255.0, 255.0}.  (Normalized values may be multiplied
+    * by powers of two).  Next, if the pixel values are normalized
+    * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
+    */
 
-      if(ones_needed) {
-         /* Or one channels */
-         LLVMValueRef mask = lp_build_const_mask_aos(type, ones);
-         res = LLVMBuildOr(builder, res, mask, "");
-      }
-   }
-   else {
-      /* FIXME */
-      assert(0);
-      res = lp_build_undef(type);
-   }
+   if (normalized)
+      scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
+   else
+      scaled = casted;
 
-   return res;
+   return scaled;
 }
 
 
@@ -289,25 +361,25 @@ lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
  * a time is rarely if ever needed.
  */
 LLVMValueRef
-lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+lp_build_pack_rgba_aos(struct gallivm_state *gallivm,
                        const struct util_format_description *desc,
                        LLVMValueRef rgba)
 {
+   LLVMBuilderRef builder = gallivm->builder;
    LLVMTypeRef type;
    LLVMValueRef packed = NULL;
    LLVMValueRef swizzles[4];
    LLVMValueRef shifted, casted, scaled, unswizzled;
    LLVMValueRef shifts[4];
    LLVMValueRef scales[4];
-   bool normalized;
-   unsigned shift;
+   boolean normalized;
    unsigned i, j;
 
   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(desc->block.width == 1);
   assert(desc->block.height == 1);
 
-   type = LLVMIntType(desc->block.bits);
+   type = LLVMIntTypeInContext(gallivm->context, desc->block.bits);
 
   /* Unswizzle the color components into the source vector.
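+    * Ex: packing PIPE_FORMAT_B8G8R8A8_UNORM, the incoming RGBA vector is
+    * reordered to {B, G, R, A} so each component lands on its channel's
+    * shift position.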
     */
   for (i = 0; i < 4; ++i) {
@@ -316,23 +388,23 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
            break;
      }
      if (j < 4)
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), j, 0);
+         swizzles[i] = lp_build_const_int32(gallivm, j);
      else
-         swizzles[i] = LLVMGetUndef(LLVMInt32Type());
+         swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   }
 
   unswizzled = LLVMBuildShuffleVector(builder, rgba,
-                                       LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)),
+                                       LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)),
                                        LLVMConstVector(swizzles, 4), "");
 
   normalized = FALSE;
-   shift = 0;
   for (i = 0; i < 4; ++i) {
      unsigned bits = desc->channel[i].size;
+      unsigned shift = desc->channel[i].shift;
 
      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
-         shifts[i] = LLVMGetUndef(LLVMInt32Type());
-         scales[i] = LLVMGetUndef(LLVMFloatType());
+         shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+         scales[i] = LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context));
      }
      else {
         unsigned mask = (1 << bits) - 1;
@@ -340,32 +412,31 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
         assert(bits < 32);
 
-         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
+         shifts[i] = lp_build_const_int32(gallivm, shift);
 
         if (desc->channel[i].normalized) {
-            scales[i] = LLVMConstReal(LLVMFloatType(), mask);
+            scales[i] = lp_build_const_float(gallivm, mask);
            normalized = TRUE;
         }
         else
-            scales[i] = LLVMConstReal(LLVMFloatType(), 1.0);
+            scales[i] = lp_build_const_float(gallivm, 1.0);
      }
-
-      shift += bits;
   }
 
   if (normalized)
-      scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
+      scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
   else
      scaled = unswizzled;
 
-   casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32Type(), 4), "");
+   casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), "");
 
   shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
 
   /* Bitwise or all components */
   for (i = 0; i < 4; ++i) {
      if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
-         LLVMValueRef component = LLVMBuildExtractElement(builder, shifted, LLVMConstInt(LLVMInt32Type(), i, 0), "");
+         LLVMValueRef component = LLVMBuildExtractElement(builder, shifted,
+                                                          lp_build_const_int32(gallivm, i), "");
         if (packed)
            packed = LLVMBuildOr(builder, packed, component, "");
        else
@@ -374,10 +445,550 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
   }
 
   if (!packed)
-      packed = LLVMGetUndef(LLVMInt32Type());
+      packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
 
   if (desc->block.bits < 32)
      packed = LLVMBuildTrunc(builder, packed, type, "");
 
   return packed;
 }
+
+
+
+
+/**
+ * Fetch a pixel into a 4 float AoS.
+ *
+ * \param format_desc  describes format of the image we're fetching from
+ * \param aligned  whether the data is guaranteed to be aligned
+ * \param base_ptr  address of the pixel block (or the texel if uncompressed)
+ * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
+ *              these will always be (0, 0).
+ * \param cache  optional value pointing to a lp_build_format_cache structure
+ * \return  a 4 element vector with the pixel's RGBA values.
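+ *
+ * Ex: fetching one PIPE_FORMAT_B4G4R4A4_UNORM texel with type = 4 x unorm8
+ * takes the bit-arithmetic 8unorm path below and returns the RGBA bytes
+ * in a <4 x i8> vector.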
+ */
+LLVMValueRef
+lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
+                        const struct util_format_description *format_desc,
+                        struct lp_type type,
+                        boolean aligned,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
+                        LLVMValueRef i,
+                        LLVMValueRef j,
+                        LLVMValueRef cache)
+{
+   const struct util_format_unpack_description *unpack =
+      util_format_unpack_description(format_desc->format);
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned num_pixels = type.length / 4;
+   struct lp_build_context bld;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(type.length % 4 == 0);
+
+   lp_build_context_init(&bld, gallivm, type);
+
+   /*
+    * Trivial case
+    *
+    * The format matches the type (apart from a swizzle) so no need for
+    * scaling or converting.
+    */
+
+   if (format_matches_type(format_desc, type) &&
+       format_desc->block.bits <= type.width * 4 &&
+       /* XXX this shouldn't be needed */
+       util_is_power_of_two_or_zero(format_desc->block.bits)) {
+      LLVMValueRef packed;
+      LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
+      struct lp_type fetch_type;
+      unsigned vec_len = type.width * type.length;
+
+      fetch_type = lp_type_uint(type.width*4);
+      packed = lp_build_gather(gallivm, type.length/4,
+                               format_desc->block.bits, fetch_type,
+                               aligned, base_ptr, offset, TRUE);
+
+      assert(format_desc->block.bits <= vec_len);
+      (void) vec_len; /* silence unused var warning for non-debug build */
+
+      packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
+      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
+   }
+
+   /*
+    * Bit arithmetic for converting small_unorm to unorm8.
+    *
+    * This misses some opportunities for optimizations (like skipping the
+    * mask for the highest channel, or doing bit scaling in parallel for
+    * channels with the same bit width) but it should be passable for
+    * all arithmetic formats.
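+    * Ex: b4g4r4a4 -> rgba8: each 4-bit channel is masked out, widened by
+    * scale_bits_up() (nibble replication), then re-packed into 8-bit lanes
+    * with shifts and ors.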
+    */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+       util_format_fits_8unorm(format_desc) &&
+       type.width == 8 && type.norm == 1 && type.sign == 0 &&
+       type.fixed == 0 && type.floating == 0) {
+      LLVMValueRef packed, res = NULL, chans[4], rgba[4];
+      LLVMTypeRef dst_vec_type, conv_vec_type;
+      struct lp_type fetch_type, conv_type;
+      struct lp_build_context bld_conv;
+      unsigned j;
+
+      fetch_type = lp_type_uint(type.width*4);
+      conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
+      dst_vec_type = lp_build_vec_type(gallivm, type);
+      conv_vec_type = lp_build_vec_type(gallivm, conv_type);
+      lp_build_context_init(&bld_conv, gallivm, conv_type);
+
+      packed = lp_build_gather(gallivm, type.length/4,
+                               format_desc->block.bits, fetch_type,
+                               aligned, base_ptr, offset, TRUE);
+
+      assert(format_desc->block.bits * type.length / 4 <=
+             type.width * type.length);
+
+      packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, "");
+
+      for (j = 0; j < format_desc->nr_channels; ++j) {
+         unsigned mask = 0;
+         unsigned sa = format_desc->channel[j].shift;
+
+         mask = (1 << format_desc->channel[j].size) - 1;
+
+         /* Extract bits from source */
+         chans[j] = LLVMBuildLShr(builder, packed,
+                                  lp_build_const_int_vec(gallivm, conv_type, sa),
+                                  "");
+
+         chans[j] = LLVMBuildAnd(builder, chans[j],
+                                 lp_build_const_int_vec(gallivm, conv_type, mask),
+                                 "");
+
+         /* Scale bits */
+         if (type.norm) {
+            chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size,
+                                     type.width, chans[j], conv_type);
+         }
+      }
+      /*
+       * This is a hacked lp_build_format_swizzle_soa() since we need a
+       * normalized 1 but only 8 bits in a 32bit vector...
+       */
+      for (j = 0; j < 4; ++j) {
+         enum pipe_swizzle swizzle = format_desc->swizzle[j];
+         if (swizzle == PIPE_SWIZZLE_1) {
+            rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
+         } else {
+            rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
+         }
+         if (j == 0) {
+            res = rgba[j];
+         } else {
+            rgba[j] = LLVMBuildShl(builder, rgba[j],
+                                   lp_build_const_int_vec(gallivm, conv_type,
+                                                          j * type.width), "");
+            res = LLVMBuildOr(builder, res, rgba[j], "");
+         }
+      }
+      res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, "");
+
+      return res;
+   }
+
+   /*
+    * Bit arithmetic
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
+       format_desc->block.width == 1 &&
+       format_desc->block.height == 1 &&
+       /* XXX this shouldn't be needed */
+       util_is_power_of_two_or_zero(format_desc->block.bits) &&
+       format_desc->block.bits <= 32 &&
+       format_desc->is_bitmask &&
+       !format_desc->is_mixed &&
+       (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
+        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
+       !format_desc->channel[0].pure_integer) {
+
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
+      struct lp_type conv_type;
+      unsigned k, num_conv_src, num_conv_dst;
+
+      /*
+       * Note this path is generally terrible for fetching multiple pixels.
+       * We should make sure we cannot hit this code path for anything but
+       * single pixels.
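+       * Ex: a 4-pixel fetch through here emits four scalar gathers and four
+       * full unpack sequences before lp_build_conv() runs.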
+       */
+
+      /*
+       * Unpack a pixel at a time into a <4 x float> RGBA vector
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef packed;
+
+         packed = lp_build_gather_elem(gallivm, num_pixels,
+                                       format_desc->block.bits, 32, aligned,
+                                       base_ptr, offset, k, FALSE);
+
+         tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
+                                                  format_desc,
+                                                  packed);
+      }
+
+      /*
+       * Type conversion.
+       *
+       * TODO: We could avoid floating conversion for integer to
+       * integer conversions.
+       */
+
+      if ((gallivm_debug & GALLIVM_DEBUG_PERF) && !type.floating) {
+         debug_printf("%s: unpacking %s with floating point\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
+
+      conv_type = lp_float32_vec4_type();
+      num_conv_src = num_pixels;
+      num_conv_dst = 1;
+
+      if (num_pixels % 8 == 0) {
+         lp_build_concat_n(gallivm, lp_float32_vec4_type(),
+                           tmps, num_pixels, tmps, num_pixels / 2);
+         conv_type.length *= num_pixels / 4;
+         num_conv_src = 4 * num_pixels / 8;
+         if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
+            /*
+             * FIXME: The fast float->unorm path (which is basically
+             * skipping the MIN/MAX which are extremely pointless in any
+             * case) requires that there's 2 destinations...
+             * In any case, we really should make sure we don't hit this
+             * code with multiple pixels for unorm8 dst types, it's
+             * completely hopeless even if we do hit the right conversion.
+             */
+            type.length /= num_pixels / 4;
+            num_conv_dst = num_pixels / 4;
+         }
+      }
+
+      lp_build_conv(gallivm, conv_type, type,
+                    tmps, num_conv_src, res, num_conv_dst);
+
+      if (num_pixels % 8 == 0 &&
+          (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
+         lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
+      }
+
+      return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
+   }
+
+   /* If all channels are of the same type and we are not using half-floats */
+   if (format_desc->is_array &&
+       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
+      assert(!format_desc->is_mixed);
+      return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
+   }
+
+   /*
+    * YUV / subsampled formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
+                                               format_desc,
+                                               num_pixels,
+                                               base_ptr,
+                                               offset,
+                                               i, j);
+
+      lp_build_conv(gallivm,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+      return tmp;
+   }
+
+   /*
+    * s3tc rgb formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_s3tc_rgba_aos(gallivm,
+                                         format_desc,
+                                         num_pixels,
+                                         base_ptr,
+                                         offset,
+                                         i, j,
+                                         cache);
+
+      lp_build_conv(gallivm,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+      return tmp;
+   }
+
+   /*
+    * rgtc rgb formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+      tmp_type.sign = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
+                       format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
+                       format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
+                       format_desc->format == PIPE_FORMAT_LATC2_SNORM);
+
+      tmp = lp_build_fetch_rgtc_rgba_aos(gallivm,
+                                         format_desc,
+                                         num_pixels,
+                                         base_ptr,
+                                         offset,
+                                         i, j,
+                                         cache);
+
+      lp_build_conv(gallivm,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+      return tmp;
+   }
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_8unorm().
+    */
+
+   if (unpack->fetch_rgba_8unorm &&
+       !type.floating && type.width == 8 && !type.sign && type.norm) {
+      /*
+       * Fallback to calling util_format_description::fetch_rgba_8unorm.
+       *
+       * This is definitely not the most efficient way of fetching pixels, as
+       * we miss the opportunity to do vectorization, but it is convenient
+       * for formats or scenarios for which there was no opportunity or
+       * incentive to optimize.
+       */
+
+      LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+      LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+      LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
+      LLVMValueRef tmp;
+      LLVMValueRef res;
+      unsigned k;
+
+      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+         debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
+
+      /*
+       * Declare and bind unpack->fetch_rgba_8unorm().
+       */
+
+      {
+         /*
+          * Function to call looks like:
+          *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+          */
+         LLVMTypeRef ret_type;
+         LLVMTypeRef arg_types[4];
+         LLVMTypeRef function_type;
+
+         ret_type = LLVMVoidTypeInContext(gallivm->context);
+         arg_types[0] = pi8t;
+         arg_types[1] = pi8t;
+         arg_types[2] = i32t;
+         arg_types[3] = i32t;
+         function_type = LLVMFunctionType(ret_type, arg_types,
+                                          ARRAY_SIZE(arg_types), 0);
+
+         if (gallivm->cache)
+            gallivm->cache->dont_cache = true;
+         /* make const pointer for the C fetch_rgba_8unorm function */
+         function = lp_build_const_int_pointer(gallivm,
+            func_to_pointer((func_pointer) unpack->fetch_rgba_8unorm));
+
+         /* cast the callee pointer to the function's type */
+         function = LLVMBuildBitCast(builder, function,
+                                     LLVMPointerType(function_type, 0),
+                                     "cast callee");
+      }
+
+      tmp_ptr = lp_build_alloca(gallivm, i32t, "");
+
+      res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
+
+      /*
+       * Invoke unpack->fetch_rgba_8unorm() for each pixel and insert the
+       * result into the AoS vector.
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef index = lp_build_const_int32(gallivm, k);
+         LLVMValueRef args[4];
+
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
+         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
+                                            base_ptr, offset, k);
+
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
+
+         tmp = LLVMBuildLoad(builder, tmp_ptr, "");
+
+         if (num_pixels == 1) {
+            res = tmp;
+         }
+         else {
+            res = LLVMBuildInsertElement(builder, res, tmp, index, "");
+         }
+      }
+
+      /* Bitcast from <n x i32> to <4n x i8> */
+      res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
+
+      return res;
+   }
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_float().
+    */
+
+   if (unpack->fetch_rgba) {
+      /*
+       * Fallback to calling util_format_description::fetch_rgba_float.
+       *
+       * This is definitely not the most efficient way of fetching pixels, as
+       * we miss the opportunity to do vectorization, but it is convenient
+       * for formats or scenarios for which there was no opportunity or
+       * incentive to optimize.
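+       * Each pixel costs an indirect call and a float[4] store/load round
+       * trip through tmp_ptr before the final lp_build_conv().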
+       */
+
+      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
+      LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
+      LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
+      LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
+      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+      LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res;
+      unsigned k;
+
+      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+         debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
+
+      /*
+       * Declare and bind unpack->fetch_rgba().
+       */
+
+      {
+         /*
+          * Function to call looks like:
+          *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
+          */
+         LLVMTypeRef ret_type;
+         LLVMTypeRef arg_types[4];
+
+         ret_type = LLVMVoidTypeInContext(gallivm->context);
+         arg_types[0] = pf32t;
+         arg_types[1] = pi8t;
+         arg_types[2] = i32t;
+         arg_types[3] = i32t;
+
+         if (gallivm->cache)
+            gallivm->cache->dont_cache = true;
+         function = lp_build_const_func_pointer(gallivm,
+            func_to_pointer((func_pointer) unpack->fetch_rgba),
+            ret_type,
+            arg_types, ARRAY_SIZE(arg_types),
+            format_desc->short_name);
+      }
+
+      tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
+
+      /*
+       * Invoke unpack->fetch_rgba() for each pixel and insert the result
+       * into the AoS vectors.
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef args[4];
+
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
+         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
+                                            base_ptr, offset, k);
+
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            LLVMValueRef index = lp_build_const_int32(gallivm, k);
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
+
+         tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
+      }
+
+      lp_build_conv(gallivm,
+                    lp_float32_vec4_type(),
+                    type,
+                    tmps, num_pixels, &res, 1);
+
+      return res;
+   }
+
+   assert(!util_format_is_pure_integer(format_desc->format));
+
+   assert(0);
+   return lp_build_undef(gallivm, type);
+}