X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Fgallivm%2Flp_bld_format_aos.c;h=2f723857f4b7d2916b9a52626d41a776342de5a5;hb=48f4baf63f5a50503907e8b0ea475fd1465c0b1d;hp=92123e09d3289c5d00a706749928c00d7fc24994;hpb=89b2897220acfacdc431f138377fbcec9f0ea812;p=mesa.git diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 92123e09d32..2f723857f4b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -36,7 +36,9 @@ #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_pointer.h" #include "util/u_string.h" +#include "util/u_cpu_detect.h" #include "lp_bld_arit.h" #include "lp_bld_init.h" @@ -46,12 +48,17 @@ #include "lp_bld_conv.h" #include "lp_bld_swizzle.h" #include "lp_bld_gather.h" +#include "lp_bld_debug.h" #include "lp_bld_format.h" +#include "lp_bld_pack.h" +#include "lp_bld_intr.h" +#include "lp_bld_logic.h" +#include "lp_bld_bitarit.h" /** * Basic swizzling. Rearrange the order of the unswizzled array elements - * according to the format description. PIPE_SWIZZLE_ZERO/ONE are supported + * according to the format description. PIPE_SWIZZLE_0/ONE are supported * too. * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}. */ @@ -66,16 +73,16 @@ lp_build_format_swizzle_aos(const struct util_format_description *desc, assert(bld->type.length % 4 == 0); for (chan = 0; chan < 4; ++chan) { - enum util_format_swizzle swizzle; + enum pipe_swizzle swizzle; if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { /* * For ZS formats do RGBA = ZZZ1 */ if (chan == 3) { - swizzle = UTIL_FORMAT_SWIZZLE_1; - } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) { - swizzle = UTIL_FORMAT_SWIZZLE_0; + swizzle = PIPE_SWIZZLE_1; + } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) { + swizzle = PIPE_SWIZZLE_0; } else { swizzle = desc->swizzle[0]; } @@ -92,7 +99,7 @@ lp_build_format_swizzle_aos(const struct util_format_description *desc, /** * Whether the format matches the vector type, apart of swizzles. */ -static INLINE boolean +static inline boolean format_matches_type(const struct util_format_description *desc, struct lp_type type) { @@ -134,28 +141,96 @@ format_matches_type(const struct util_format_description *desc, return TRUE; } +/* + * Do rounding when converting small unorm values to larger ones. + * Not quite 100% accurate, as it's done by appending MSBs, but + * should be good enough. + */ + +static inline LLVMValueRef +scale_bits_up(struct gallivm_state *gallivm, + int src_bits, + int dst_bits, + LLVMValueRef src, + struct lp_type src_type) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef result = src; + + if (src_bits == 1 && dst_bits > 1) { + /* + * Useful for a1 - we'd need quite some repeated copies otherwise. + */ + struct lp_build_context bld; + LLVMValueRef dst_mask; + lp_build_context_init(&bld, gallivm, src_type); + dst_mask = lp_build_const_int_vec(gallivm, src_type, + (1 << dst_bits) - 1), + result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src, + lp_build_const_int_vec(gallivm, src_type, 0)); + result = lp_build_andnot(&bld, dst_mask, result); + } + else if (dst_bits > src_bits) { + /* Scale up bits */ + int db = dst_bits - src_bits; + + /* Shift left by difference in bits */ + result = LLVMBuildShl(builder, + src, + lp_build_const_int_vec(gallivm, src_type, db), + ""); + + if (db <= src_bits) { + /* Enough bits in src to fill the remainder */ + LLVMValueRef lower = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, + src_bits - db), + ""); + + result = LLVMBuildOr(builder, result, lower, ""); + } else if (db > src_bits) { + /* Need to repeatedly copy src bits to fill remainder in dst */ + unsigned n; + + for (n = src_bits; n < dst_bits; n *= 2) { + LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n); + + result = LLVMBuildOr(builder, + result, + LLVMBuildLShr(builder, result, shuv, ""), + ""); + } + } + } else { + assert (dst_bits == src_bits); + } + + return result; +} /** - * Unpack a single pixel into its RGBA components. + * Unpack a single pixel into its XYZW components. * * @param desc the pixel format for the packed pixel value * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM * - * @return RGBA in a float[4] or ubyte[4] or ushort[4] vector. + * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector. */ -static INLINE LLVMValueRef -lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder, +static inline LLVMValueRef +lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *desc, LLVMValueRef packed) { + LLVMBuilderRef builder = gallivm->builder; LLVMValueRef shifted, casted, scaled, masked; LLVMValueRef shifts[4]; LLVMValueRef masks[4]; LLVMValueRef scales[4]; + LLVMTypeRef vec32_type; boolean normalized; boolean needs_uitofp; - unsigned shift; unsigned i; /* TODO: Support more formats */ @@ -166,36 +241,34 @@ lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder, /* Do the intermediate integer computations with 32bit integers since it * matches floating point size */ - assert (LLVMTypeOf(packed) == LLVMInt32Type()); + assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context)); + + vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); /* Broadcast the packed value to all four channels * before: packed = BGRA * after: packed = {BGRA, BGRA, BGRA, BGRA} */ - packed = LLVMBuildInsertElement(builder, - LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)), - packed, - LLVMConstNull(LLVMInt32Type()), + packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed, + LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)), ""); - packed = LLVMBuildShuffleVector(builder, - packed, - LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)), - LLVMConstNull(LLVMVectorType(LLVMInt32Type(), 4)), + packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type), + LLVMConstNull(vec32_type), ""); /* Initialize vector constants */ normalized = FALSE; needs_uitofp = FALSE; - shift = 0; /* Loop over 4 color components */ for (i = 0; i < 4; ++i) { unsigned bits = desc->channel[i].size; + unsigned shift = desc->channel[i].shift; if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) { - shifts[i] = LLVMGetUndef(LLVMInt32Type()); - masks[i] = LLVMConstNull(LLVMInt32Type()); - scales[i] = LLVMConstNull(LLVMFloatType()); + shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + masks[i] = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)); + scales[i] = LLVMConstNull(LLVMFloatTypeInContext(gallivm->context)); } else { unsigned long long mask = (1ULL << bits) - 1; @@ -206,36 +279,67 @@ lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder, needs_uitofp = TRUE; } - shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0); - masks[i] = LLVMConstInt(LLVMInt32Type(), mask, 0); + shifts[i] = lp_build_const_int32(gallivm, shift); + masks[i] = lp_build_const_int32(gallivm, mask); if (desc->channel[i].normalized) { - scales[i] = LLVMConstReal(LLVMFloatType(), 1.0/mask); + scales[i] = lp_build_const_float(gallivm, 1.0 / mask); normalized = TRUE; } else - scales[i] = LLVMConstReal(LLVMFloatType(), 1.0); + scales[i] = lp_build_const_float(gallivm, 1.0); } - - shift += bits; } - /* Ex: convert packed = {BGRA, BGRA, BGRA, BGRA} - * into masked = {B, G, R, A} + /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW} + * into masked = {X, Y, Z, W} */ - shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); - masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); - + if (desc->block.bits < 32 && normalized) { + /* + * Note: we cannot do the shift below on x86 natively until AVX2. + * + * Old llvm versions will resort to scalar extract/shift insert, + * which is definitely terrible, new versions will just do + * several vector shifts and shuffle/blend results together. + * We could turn this into a variable left shift plus a constant + * right shift, and llvm would then turn the variable left shift + * into a mul for us (albeit without sse41 the mul needs emulation + * too...). However, since we're going to do a float mul + * anyway, we just adjust that mul instead (plus the mask), skipping + * the shift completely. + * We could also use a extra mul when the format isn't normalized and + * we don't have AVX2 support, but don't bother for now. Unfortunately, + * this strategy doesn't work for 32bit formats (such as rgb10a2 or even + * rgba8 if it ends up here), as that would require UIToFP, albeit that + * would be fixable with easy 16bit shuffle (unless there's channels + * crossing 16bit boundaries). + */ + for (i = 0; i < 4; ++i) { + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { + unsigned bits = desc->channel[i].size; + unsigned shift = desc->channel[i].shift; + unsigned long long mask = ((1ULL << bits) - 1) << shift; + scales[i] = lp_build_const_float(gallivm, 1.0 / mask); + masks[i] = lp_build_const_int32(gallivm, mask); + } + } + masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), ""); + } else { + shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); + masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); + } if (!needs_uitofp) { /* UIToFP can't be expressed in SSE2 */ - casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), ""); + casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), ""); } else { - casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), ""); + casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), ""); } - /* At this point 'casted' may be a vector of floats such as - * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized + /* + * At this point 'casted' may be a vector of floats such as + * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied + * by powers of two). Next, if the pixel values are normalized * we'll scale this to {1.0, 1.0, 1.0, 1.0}. */ @@ -257,10 +361,11 @@ lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder, * a time is rarely if ever needed. */ LLVMValueRef -lp_build_pack_rgba_aos(LLVMBuilderRef builder, +lp_build_pack_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *desc, LLVMValueRef rgba) { + LLVMBuilderRef builder = gallivm->builder; LLVMTypeRef type; LLVMValueRef packed = NULL; LLVMValueRef swizzles[4]; @@ -268,14 +373,13 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, LLVMValueRef shifts[4]; LLVMValueRef scales[4]; boolean normalized; - unsigned shift; unsigned i, j; assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); assert(desc->block.width == 1); assert(desc->block.height == 1); - type = LLVMIntType(desc->block.bits); + type = LLVMIntTypeInContext(gallivm->context, desc->block.bits); /* Unswizzle the color components into the source vector. */ for (i = 0; i < 4; ++i) { @@ -284,23 +388,23 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, break; } if (j < 4) - swizzles[i] = LLVMConstInt(LLVMInt32Type(), j, 0); + swizzles[i] = lp_build_const_int32(gallivm, j); else - swizzles[i] = LLVMGetUndef(LLVMInt32Type()); + swizzles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); } unswizzled = LLVMBuildShuffleVector(builder, rgba, - LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)), + LLVMGetUndef(LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4)), LLVMConstVector(swizzles, 4), ""); normalized = FALSE; - shift = 0; for (i = 0; i < 4; ++i) { unsigned bits = desc->channel[i].size; + unsigned shift = desc->channel[i].shift; if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) { - shifts[i] = LLVMGetUndef(LLVMInt32Type()); - scales[i] = LLVMGetUndef(LLVMFloatType()); + shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + scales[i] = LLVMGetUndef(LLVMFloatTypeInContext(gallivm->context)); } else { unsigned mask = (1 << bits) - 1; @@ -308,17 +412,15 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED); assert(bits < 32); - shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0); + shifts[i] = lp_build_const_int32(gallivm, shift); if (desc->channel[i].normalized) { - scales[i] = LLVMConstReal(LLVMFloatType(), mask); + scales[i] = lp_build_const_float(gallivm, mask); normalized = TRUE; } else - scales[i] = LLVMConstReal(LLVMFloatType(), 1.0); + scales[i] = lp_build_const_float(gallivm, 1.0); } - - shift += bits; } if (normalized) @@ -326,14 +428,15 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, else scaled = unswizzled; - casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32Type(), 4), ""); + casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), ""); shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), ""); /* Bitwise or all components */ for (i = 0; i < 4; ++i) { if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { - LLVMValueRef component = LLVMBuildExtractElement(builder, shifted, LLVMConstInt(LLVMInt32Type(), i, 0), ""); + LLVMValueRef component = LLVMBuildExtractElement(builder, shifted, + lp_build_const_int32(gallivm, i), ""); if (packed) packed = LLVMBuildOr(builder, packed, component, ""); else @@ -342,7 +445,7 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, } if (!packed) - packed = LLVMGetUndef(LLVMInt32Type()); + packed = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); if (desc->block.bits < 32) packed = LLVMBuildTrunc(builder, packed, type, ""); @@ -357,27 +460,31 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, * Fetch a pixel into a 4 float AoS. * * \param format_desc describes format of the image we're fetching from + * \param aligned whether the data is guaranteed to be aligned * \param ptr address of the pixel block (or the texel if uncompressed) * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0, 0). * \return a 4 element vector with the pixel's RGBA values. */ LLVMValueRef -lp_build_fetch_rgba_aos(LLVMBuilderRef builder, +lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, + boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, - LLVMValueRef j) + LLVMValueRef j, + LLVMValueRef cache) { + LLVMBuilderRef builder = gallivm->builder; unsigned num_pixels = type.length / 4; struct lp_build_context bld; assert(type.length <= LP_MAX_VECTOR_LENGTH); assert(type.length % 4 == 0); - lp_build_context_init(&bld, builder, type); + lp_build_context_init(&bld, gallivm, type); /* * Trivial case @@ -388,25 +495,110 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, if (format_matches_type(format_desc, type) && format_desc->block.bits <= type.width * 4 && + /* XXX this shouldn't be needed */ util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; + LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type); + struct lp_type fetch_type; + unsigned vec_len = type.width * type.length; /* * The format matches the type (apart of a swizzle) so no need for * scaling or converting. */ - packed = lp_build_gather(builder, type.length/4, - format_desc->block.bits, type.width*4, - base_ptr, offset); - - assert(format_desc->block.bits <= type.width * type.length); + fetch_type = lp_type_uint(type.width*4); + packed = lp_build_gather(gallivm, type.length/4, + format_desc->block.bits, fetch_type, + aligned, base_ptr, offset, TRUE); - packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(type), ""); + assert(format_desc->block.bits <= vec_len); + (void) vec_len; /* silence unused var warning for non-debug build */ + packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, ""); return lp_build_format_swizzle_aos(format_desc, &bld, packed); } + /* + * Bit arithmetic for converting small_unorm to unorm8. + * + * This misses some opportunities for optimizations (like skipping mask + * for the highest channel for instance, or doing bit scaling in parallel + * for channels with the same bit width) but it should be passable for + * all arithmetic formats. + */ + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + util_format_fits_8unorm(format_desc) && + type.width == 8 && type.norm == 1 && type.sign == 0 && + type.fixed == 0 && type.floating == 0) { + LLVMValueRef packed, res, chans[4], rgba[4]; + LLVMTypeRef dst_vec_type, conv_vec_type; + struct lp_type fetch_type, conv_type; + struct lp_build_context bld_conv; + unsigned j; + + fetch_type = lp_type_uint(type.width*4); + conv_type = lp_type_int_vec(type.width*4, type.width * type.length); + dst_vec_type = lp_build_vec_type(gallivm, type); + conv_vec_type = lp_build_vec_type(gallivm, conv_type); + lp_build_context_init(&bld_conv, gallivm, conv_type); + + packed = lp_build_gather(gallivm, type.length/4, + format_desc->block.bits, fetch_type, + aligned, base_ptr, offset, TRUE); + + assert(format_desc->block.bits * type.length / 4 <= + type.width * type.length); + + packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, ""); + + for (j = 0; j < format_desc->nr_channels; ++j) { + unsigned mask = 0; + unsigned sa = format_desc->channel[j].shift; + + mask = (1 << format_desc->channel[j].size) - 1; + + /* Extract bits from source */ + chans[j] = LLVMBuildLShr(builder, packed, + lp_build_const_int_vec(gallivm, conv_type, sa), + ""); + + chans[j] = LLVMBuildAnd(builder, chans[j], + lp_build_const_int_vec(gallivm, conv_type, mask), + ""); + + /* Scale bits */ + if (type.norm) { + chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size, + type.width, chans[j], conv_type); + } + } + /* + * This is a hacked lp_build_format_swizzle_soa() since we need a + * normalized 1 but only 8 bits in a 32bit vector... + */ + for (j = 0; j < 4; ++j) { + enum pipe_swizzle swizzle = format_desc->swizzle[j]; + if (swizzle == PIPE_SWIZZLE_1) { + rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1); + } else { + rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle); + } + if (j == 0) { + res = rgba[j]; + } else { + rgba[j] = LLVMBuildShl(builder, rgba[j], + lp_build_const_int_vec(gallivm, conv_type, + j * type.width), ""); + res = LLVMBuildOr(builder, res, rgba[j], ""); + } + } + res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, ""); + + return res; + } + /* * Bit arithmetic */ @@ -416,16 +608,25 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && + /* XXX this shouldn't be needed */ util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && !format_desc->is_mixed && (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED || - format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) { + format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) && + !format_desc->channel[0].pure_integer) { LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; - LLVMValueRef res; - unsigned k; + LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128]; + struct lp_type conv_type; + unsigned k, num_conv_src, num_conv_dst; + + /* + * Note this path is generally terrible for fetching multiple pixels. + * We should make sure we cannot hit this code path for anything but + * single pixels. + */ /* * Unpack a pixel at a time into a <4 x float> RGBA vector @@ -434,11 +635,12 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, for (k = 0; k < num_pixels; ++k) { LLVMValueRef packed; - packed = lp_build_gather_elem(builder, num_pixels, - format_desc->block.bits, 32, - base_ptr, offset, k); + packed = lp_build_gather_elem(gallivm, num_pixels, + format_desc->block.bits, 32, aligned, + base_ptr, offset, k, FALSE); - tmps[k] = lp_build_unpack_arith_rgba_aos(builder, format_desc, + tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm, + format_desc, packed); } @@ -449,12 +651,50 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, * integer conversions. */ - lp_build_conv(builder, - lp_float32_vec4_type(), - type, - tmps, num_pixels, &res, 1); + if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) { + debug_printf("%s: unpacking %s with floating point\n", + __FUNCTION__, format_desc->short_name); + } + + conv_type = lp_float32_vec4_type(); + num_conv_src = num_pixels; + num_conv_dst = 1; + + if (num_pixels % 8 == 0) { + lp_build_concat_n(gallivm, lp_float32_vec4_type(), + tmps, num_pixels, tmps, num_pixels / 2); + conv_type.length *= num_pixels / 4; + num_conv_src = 4 * num_pixels / 8; + if (type.width == 8 && type.floating == 0 && type.fixed == 0) { + /* + * FIXME: The fast float->unorm path (which is basically + * skipping the MIN/MAX which are extremely pointless in any + * case) requires that there's 2 destinations... + * In any case, we really should make sure we don't hit this + * code with multiple pixels for unorm8 dst types, it's + * completely hopeless even if we do hit the right conversion. + */ + type.length /= num_pixels / 4; + num_conv_dst = num_pixels / 4; + } + } + + lp_build_conv(gallivm, conv_type, type, + tmps, num_conv_src, res, num_conv_dst); + + if (num_pixels % 8 == 0 && + (type.width == 8 && type.floating == 0 && type.fixed == 0)) { + lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1); + } - return lp_build_format_swizzle_aos(format_desc, &bld, res); + return lp_build_format_swizzle_aos(format_desc, &bld, res[0]); + } + + /* If all channels are of same type and we are not using half-floats */ + if (format_desc->is_array && + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) { + assert(!format_desc->is_mixed); + return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset); } /* @@ -470,20 +710,48 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, tmp_type.length = num_pixels * 4; tmp_type.norm = TRUE; - tmp = lp_build_fetch_subsampled_rgba_aos(builder, + tmp = lp_build_fetch_subsampled_rgba_aos(gallivm, format_desc, num_pixels, base_ptr, offset, i, j); - lp_build_conv(builder, + lp_build_conv(gallivm, tmp_type, type, &tmp, 1, &tmp, 1); return tmp; } + /* + * s3tc rgb formats + */ + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) { + struct lp_type tmp_type; + LLVMValueRef tmp; + + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = num_pixels * 4; + tmp_type.norm = TRUE; + + tmp = lp_build_fetch_cached_texels(gallivm, + format_desc, + num_pixels, + base_ptr, + offset, + i, j, + cache); + + lp_build_conv(gallivm, + tmp_type, type, + &tmp, 1, &tmp, 1); + + return tmp; + } + /* * Fallback to util_format_description::fetch_rgba_8unorm(). */ @@ -499,47 +767,52 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, * or incentive to optimize. */ - LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); - char name[256]; - LLVMTypeRef i8t = LLVMInt8Type(); + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); - LLVMTypeRef i32t = LLVMInt32Type(); + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef function; LLVMValueRef tmp_ptr; LLVMValueRef tmp; LLVMValueRef res; unsigned k; - util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm", - format_desc->short_name); + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n", + __FUNCTION__, format_desc->short_name); + } /* * Declare and bind format_desc->fetch_rgba_8unorm(). */ - function = LLVMGetNamedFunction(module, name); - if (!function) { + { + /* + * Function to call looks like: + * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) + */ LLVMTypeRef ret_type; LLVMTypeRef arg_types[4]; LLVMTypeRef function_type; - ret_type = LLVMVoidType(); + ret_type = LLVMVoidTypeInContext(gallivm->context); arg_types[0] = pi8t; arg_types[1] = pi8t; - arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8); - function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); - function = LLVMAddFunction(module, name, function_type); - - LLVMSetFunctionCallConv(function, LLVMCCallConv); - LLVMSetLinkage(function, LLVMExternalLinkage); - - assert(LLVMIsDeclaration(function)); - - LLVMAddGlobalMapping(lp_build_engine, function, - func_to_pointer((func_pointer)format_desc->fetch_rgba_8unorm)); + arg_types[2] = i32t; + arg_types[3] = i32t; + function_type = LLVMFunctionType(ret_type, arg_types, + ARRAY_SIZE(arg_types), 0); + + /* make const pointer for the C fetch_rgba_8unorm function */ + function = lp_build_const_int_pointer(gallivm, + func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm)); + + /* cast the callee pointer to the function's type */ + function = LLVMBuildBitCast(builder, function, + LLVMPointerType(function_type, 0), + "cast callee"); } - tmp_ptr = lp_build_alloca(builder, i32t, ""); + tmp_ptr = lp_build_alloca(gallivm, i32t, ""); res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels)); @@ -549,11 +822,11 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, */ for (k = 0; k < num_pixels; ++k) { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); + LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef args[4]; args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); - args[1] = lp_build_gather_elem_ptr(builder, num_pixels, + args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels, base_ptr, offset, k); if (num_pixels == 1) { @@ -565,7 +838,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, args[3] = LLVMBuildExtractElement(builder, j, index, ""); } - LLVMBuildCall(builder, function, args, Elements(args), ""); + LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), ""); tmp = LLVMBuildLoad(builder, tmp_ptr, ""); @@ -583,7 +856,6 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, return res; } - /* * Fallback to util_format_description::fetch_rgba_float(). */ @@ -598,47 +870,48 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, * or incentive to optimize. */ - LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); - char name[256]; - LLVMTypeRef f32t = LLVMFloatType(); + LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context); LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4); LLVMTypeRef pf32t = LLVMPointerType(f32t, 0); + LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef function; LLVMValueRef tmp_ptr; LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; LLVMValueRef res; unsigned k; - util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float", - format_desc->short_name); + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n", + __FUNCTION__, format_desc->short_name); + } /* * Declare and bind format_desc->fetch_rgba_float(). */ - function = LLVMGetNamedFunction(module, name); - if (!function) { + { + /* + * Function to call looks like: + * fetch(float *dst, const uint8_t *src, unsigned i, unsigned j) + */ LLVMTypeRef ret_type; LLVMTypeRef arg_types[4]; - LLVMTypeRef function_type; - ret_type = LLVMVoidType(); + ret_type = LLVMVoidTypeInContext(gallivm->context); arg_types[0] = pf32t; - arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0); - arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8); - function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); - function = LLVMAddFunction(module, name, function_type); - - LLVMSetFunctionCallConv(function, LLVMCCallConv); - LLVMSetLinkage(function, LLVMExternalLinkage); - - assert(LLVMIsDeclaration(function)); - - LLVMAddGlobalMapping(lp_build_engine, function, - func_to_pointer((func_pointer)format_desc->fetch_rgba_float)); + arg_types[1] = pi8t; + arg_types[2] = i32t; + arg_types[3] = i32t; + + function = lp_build_const_func_pointer(gallivm, + func_to_pointer((func_pointer) format_desc->fetch_rgba_float), + ret_type, + arg_types, ARRAY_SIZE(arg_types), + format_desc->short_name); } - tmp_ptr = lp_build_alloca(builder, f32x4t, ""); + tmp_ptr = lp_build_alloca(gallivm, f32x4t, ""); /* * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result @@ -649,7 +922,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, LLVMValueRef args[4]; args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, ""); - args[1] = lp_build_gather_elem_ptr(builder, num_pixels, + args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels, base_ptr, offset, k); if (num_pixels == 1) { @@ -657,17 +930,17 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, args[3] = j; } else { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); + LLVMValueRef index = lp_build_const_int32(gallivm, k); args[2] = LLVMBuildExtractElement(builder, i, index, ""); args[3] = LLVMBuildExtractElement(builder, j, index, ""); } - LLVMBuildCall(builder, function, args, Elements(args), ""); + LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), ""); tmps[k] = LLVMBuildLoad(builder, tmp_ptr, ""); } - lp_build_conv(builder, + lp_build_conv(gallivm, lp_float32_vec4_type(), type, tmps, num_pixels, &res, 1); @@ -675,6 +948,8 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, return res; } + assert(!util_format_is_pure_integer(format_desc->format)); + assert(0); - return lp_build_undef(type); + return lp_build_undef(gallivm, type); }