X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Fgallivm%2Flp_bld_format_aos.c;h=2f723857f4b7d2916b9a52626d41a776342de5a5;hb=48f4baf63f5a50503907e8b0ea475fd1465c0b1d;hp=e5c93f0c91852b943eb4fba035207ee900c0881b;hpb=e7bbd9c243c26c1e9da2c86d332e4f91474abac5;p=mesa.git diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index e5c93f0c918..2f723857f4b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -38,6 +38,7 @@ #include "util/u_math.h" #include "util/u_pointer.h" #include "util/u_string.h" +#include "util/u_cpu_detect.h" #include "lp_bld_arit.h" #include "lp_bld_init.h" @@ -49,12 +50,15 @@ #include "lp_bld_gather.h" #include "lp_bld_debug.h" #include "lp_bld_format.h" +#include "lp_bld_pack.h" #include "lp_bld_intr.h" +#include "lp_bld_logic.h" +#include "lp_bld_bitarit.h" /** * Basic swizzling. Rearrange the order of the unswizzled array elements - * according to the format description. PIPE_SWIZZLE_ZERO/ONE are supported + * according to the format description. PIPE_SWIZZLE_0/ONE are supported * too. * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}. */ @@ -69,16 +73,16 @@ lp_build_format_swizzle_aos(const struct util_format_description *desc, assert(bld->type.length % 4 == 0); for (chan = 0; chan < 4; ++chan) { - enum util_format_swizzle swizzle; + enum pipe_swizzle swizzle; if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { /* * For ZS formats do RGBA = ZZZ1 */ if (chan == 3) { - swizzle = UTIL_FORMAT_SWIZZLE_1; - } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) { - swizzle = UTIL_FORMAT_SWIZZLE_0; + swizzle = PIPE_SWIZZLE_1; + } else if (desc->swizzle[0] == PIPE_SWIZZLE_NONE) { + swizzle = PIPE_SWIZZLE_0; } else { swizzle = desc->swizzle[0]; } @@ -95,7 +99,7 @@ lp_build_format_swizzle_aos(const struct util_format_description *desc, /** * Whether the format matches the vector type, apart of swizzles. */ -static INLINE boolean +static inline boolean format_matches_type(const struct util_format_description *desc, struct lp_type type) { @@ -137,16 +141,83 @@ format_matches_type(const struct util_format_description *desc, return TRUE; } +/* + * Do rounding when converting small unorm values to larger ones. + * Not quite 100% accurate, as it's done by appending MSBs, but + * should be good enough. + */ + +static inline LLVMValueRef +scale_bits_up(struct gallivm_state *gallivm, + int src_bits, + int dst_bits, + LLVMValueRef src, + struct lp_type src_type) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef result = src; + + if (src_bits == 1 && dst_bits > 1) { + /* + * Useful for a1 - we'd need quite some repeated copies otherwise. + */ + struct lp_build_context bld; + LLVMValueRef dst_mask; + lp_build_context_init(&bld, gallivm, src_type); + dst_mask = lp_build_const_int_vec(gallivm, src_type, + (1 << dst_bits) - 1), + result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src, + lp_build_const_int_vec(gallivm, src_type, 0)); + result = lp_build_andnot(&bld, dst_mask, result); + } + else if (dst_bits > src_bits) { + /* Scale up bits */ + int db = dst_bits - src_bits; + + /* Shift left by difference in bits */ + result = LLVMBuildShl(builder, + src, + lp_build_const_int_vec(gallivm, src_type, db), + ""); + + if (db <= src_bits) { + /* Enough bits in src to fill the remainder */ + LLVMValueRef lower = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, + src_bits - db), + ""); + + result = LLVMBuildOr(builder, result, lower, ""); + } else if (db > src_bits) { + /* Need to repeatedly copy src bits to fill remainder in dst */ + unsigned n; + + for (n = src_bits; n < dst_bits; n *= 2) { + LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n); + + result = LLVMBuildOr(builder, + result, + LLVMBuildLShr(builder, result, shuv, ""), + ""); + } + } + } else { + assert (dst_bits == src_bits); + } + + return result; +} /** - * Unpack a single pixel into its RGBA components. + * Unpack a single pixel into its XYZW components. * * @param desc the pixel format for the packed pixel value * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM * - * @return RGBA in a float[4] or ubyte[4] or ushort[4] vector. + * @return XYZW in a float[4] or ubyte[4] or ushort[4] vector. */ -static INLINE LLVMValueRef +static inline LLVMValueRef lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *desc, LLVMValueRef packed) @@ -156,10 +227,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef shifts[4]; LLVMValueRef masks[4]; LLVMValueRef scales[4]; + LLVMTypeRef vec32_type; boolean normalized; boolean needs_uitofp; - unsigned shift; unsigned i; /* TODO: Support more formats */ @@ -172,33 +243,27 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, * matches floating point size */ assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context)); -#ifdef PIPE_ARCH_BIG_ENDIAN - packed = lp_build_bswap(gallivm, packed, lp_type_uint(32)); -#endif + vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); /* Broadcast the packed value to all four channels * before: packed = BGRA * after: packed = {BGRA, BGRA, BGRA, BGRA} */ - packed = LLVMBuildInsertElement(builder, - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), - packed, + packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed, LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)), ""); - packed = LLVMBuildShuffleVector(builder, - packed, - LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), - LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)), + packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type), + LLVMConstNull(vec32_type), ""); /* Initialize vector constants */ normalized = FALSE; needs_uitofp = FALSE; - shift = 0; /* Loop over 4 color components */ for (i = 0; i < 4; ++i) { unsigned bits = desc->channel[i].size; + unsigned shift = desc->channel[i].shift; if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) { shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); @@ -224,15 +289,45 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, else scales[i] = lp_build_const_float(gallivm, 1.0); } - - shift += bits; } - /* Ex: convert packed = {BGRA, BGRA, BGRA, BGRA} - * into masked = {B, G, R, A} + /* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW} + * into masked = {X, Y, Z, W} */ - shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); - masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); + if (desc->block.bits < 32 && normalized) { + /* + * Note: we cannot do the shift below on x86 natively until AVX2. + * + * Old llvm versions will resort to scalar extract/shift insert, + * which is definitely terrible, new versions will just do + * several vector shifts and shuffle/blend results together. + * We could turn this into a variable left shift plus a constant + * right shift, and llvm would then turn the variable left shift + * into a mul for us (albeit without sse41 the mul needs emulation + * too...). However, since we're going to do a float mul + * anyway, we just adjust that mul instead (plus the mask), skipping + * the shift completely. + * We could also use a extra mul when the format isn't normalized and + * we don't have AVX2 support, but don't bother for now. Unfortunately, + * this strategy doesn't work for 32bit formats (such as rgb10a2 or even + * rgba8 if it ends up here), as that would require UIToFP, albeit that + * would be fixable with easy 16bit shuffle (unless there's channels + * crossing 16bit boundaries). + */ + for (i = 0; i < 4; ++i) { + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { + unsigned bits = desc->channel[i].size; + unsigned shift = desc->channel[i].shift; + unsigned long long mask = ((1ULL << bits) - 1) << shift; + scales[i] = lp_build_const_float(gallivm, 1.0 / mask); + masks[i] = lp_build_const_int32(gallivm, mask); + } + } + masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), ""); + } else { + shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); + masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); + } if (!needs_uitofp) { /* UIToFP can't be expressed in SSE2 */ @@ -241,8 +336,10 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state *gallivm, casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), ""); } - /* At this point 'casted' may be a vector of floats such as - * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized + /* + * At this point 'casted' may be a vector of floats such as + * {255.0, 255.0, 255.0, 255.0}. (Normalized values may be multiplied + * by powers of two). Next, if the pixel values are normalized * we'll scale this to {1.0, 1.0, 1.0, 1.0}. */ @@ -276,7 +373,6 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef shifts[4]; LLVMValueRef scales[4]; boolean normalized; - unsigned shift; unsigned i, j; assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); @@ -302,9 +398,9 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm, LLVMConstVector(swizzles, 4), ""); normalized = FALSE; - shift = 0; for (i = 0; i < 4; ++i) { unsigned bits = desc->channel[i].size; + unsigned shift = desc->channel[i].shift; if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) { shifts[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); @@ -325,8 +421,6 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm, else scales[i] = lp_build_const_float(gallivm, 1.0); } - - shift += bits; } if (normalized) @@ -366,6 +460,7 @@ lp_build_pack_rgba_aos(struct gallivm_state *gallivm, * Fetch a pixel into a 4 float AoS. * * \param format_desc describes format of the image we're fetching from + * \param aligned whether the data is guaranteed to be aligned * \param ptr address of the pixel block (or the texel if uncompressed) * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0, 0). @@ -375,10 +470,12 @@ LLVMValueRef lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, + boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, - LLVMValueRef j) + LLVMValueRef j, + LLVMValueRef cache) { LLVMBuilderRef builder = gallivm->builder; unsigned num_pixels = type.length / 4; @@ -398,9 +495,11 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, if (format_matches_type(format_desc, type) && format_desc->block.bits <= type.width * 4 && + /* XXX this shouldn't be needed */ util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type); + struct lp_type fetch_type; unsigned vec_len = type.width * type.length; /* @@ -408,21 +507,98 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, * scaling or converting. */ + fetch_type = lp_type_uint(type.width*4); packed = lp_build_gather(gallivm, type.length/4, - format_desc->block.bits, type.width*4, - base_ptr, offset); + format_desc->block.bits, fetch_type, + aligned, base_ptr, offset, TRUE); assert(format_desc->block.bits <= vec_len); + (void) vec_len; /* silence unused var warning for non-debug build */ packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, ""); -#ifdef PIPE_ARCH_BIG_ENDIAN - if (type.floating) - packed = lp_build_bswap_vec(gallivm, packed, type, - lp_type_float_vec(type.width, vec_len)); -#endif return lp_build_format_swizzle_aos(format_desc, &bld, packed); } + /* + * Bit arithmetic for converting small_unorm to unorm8. + * + * This misses some opportunities for optimizations (like skipping mask + * for the highest channel for instance, or doing bit scaling in parallel + * for channels with the same bit width) but it should be passable for + * all arithmetic formats. + */ + if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB && + util_format_fits_8unorm(format_desc) && + type.width == 8 && type.norm == 1 && type.sign == 0 && + type.fixed == 0 && type.floating == 0) { + LLVMValueRef packed, res, chans[4], rgba[4]; + LLVMTypeRef dst_vec_type, conv_vec_type; + struct lp_type fetch_type, conv_type; + struct lp_build_context bld_conv; + unsigned j; + + fetch_type = lp_type_uint(type.width*4); + conv_type = lp_type_int_vec(type.width*4, type.width * type.length); + dst_vec_type = lp_build_vec_type(gallivm, type); + conv_vec_type = lp_build_vec_type(gallivm, conv_type); + lp_build_context_init(&bld_conv, gallivm, conv_type); + + packed = lp_build_gather(gallivm, type.length/4, + format_desc->block.bits, fetch_type, + aligned, base_ptr, offset, TRUE); + + assert(format_desc->block.bits * type.length / 4 <= + type.width * type.length); + + packed = LLVMBuildBitCast(gallivm->builder, packed, conv_vec_type, ""); + + for (j = 0; j < format_desc->nr_channels; ++j) { + unsigned mask = 0; + unsigned sa = format_desc->channel[j].shift; + + mask = (1 << format_desc->channel[j].size) - 1; + + /* Extract bits from source */ + chans[j] = LLVMBuildLShr(builder, packed, + lp_build_const_int_vec(gallivm, conv_type, sa), + ""); + + chans[j] = LLVMBuildAnd(builder, chans[j], + lp_build_const_int_vec(gallivm, conv_type, mask), + ""); + + /* Scale bits */ + if (type.norm) { + chans[j] = scale_bits_up(gallivm, format_desc->channel[j].size, + type.width, chans[j], conv_type); + } + } + /* + * This is a hacked lp_build_format_swizzle_soa() since we need a + * normalized 1 but only 8 bits in a 32bit vector... + */ + for (j = 0; j < 4; ++j) { + enum pipe_swizzle swizzle = format_desc->swizzle[j]; + if (swizzle == PIPE_SWIZZLE_1) { + rgba[j] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1); + } else { + rgba[j] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle); + } + if (j == 0) { + res = rgba[j]; + } else { + rgba[j] = LLVMBuildShl(builder, rgba[j], + lp_build_const_int_vec(gallivm, conv_type, + j * type.width), ""); + res = LLVMBuildOr(builder, res, rgba[j], ""); + } + } + res = LLVMBuildBitCast(gallivm->builder, res, dst_vec_type, ""); + + return res; + } + /* * Bit arithmetic */ @@ -432,6 +608,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && + /* XXX this shouldn't be needed */ util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && @@ -441,8 +618,15 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, !format_desc->channel[0].pure_integer) { LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; - LLVMValueRef res; - unsigned k; + LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128]; + struct lp_type conv_type; + unsigned k, num_conv_src, num_conv_dst; + + /* + * Note this path is generally terrible for fetching multiple pixels. + * We should make sure we cannot hit this code path for anything but + * single pixels. + */ /* * Unpack a pixel at a time into a <4 x float> RGBA vector @@ -452,8 +636,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef packed; packed = lp_build_gather_elem(gallivm, num_pixels, - format_desc->block.bits, 32, - base_ptr, offset, k); + format_desc->block.bits, 32, aligned, + base_ptr, offset, k, FALSE); tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm, format_desc, @@ -472,12 +656,38 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, __FUNCTION__, format_desc->short_name); } - lp_build_conv(gallivm, - lp_float32_vec4_type(), - type, - tmps, num_pixels, &res, 1); + conv_type = lp_float32_vec4_type(); + num_conv_src = num_pixels; + num_conv_dst = 1; + + if (num_pixels % 8 == 0) { + lp_build_concat_n(gallivm, lp_float32_vec4_type(), + tmps, num_pixels, tmps, num_pixels / 2); + conv_type.length *= num_pixels / 4; + num_conv_src = 4 * num_pixels / 8; + if (type.width == 8 && type.floating == 0 && type.fixed == 0) { + /* + * FIXME: The fast float->unorm path (which is basically + * skipping the MIN/MAX which are extremely pointless in any + * case) requires that there's 2 destinations... + * In any case, we really should make sure we don't hit this + * code with multiple pixels for unorm8 dst types, it's + * completely hopeless even if we do hit the right conversion. + */ + type.length /= num_pixels / 4; + num_conv_dst = num_pixels / 4; + } + } + + lp_build_conv(gallivm, conv_type, type, + tmps, num_conv_src, res, num_conv_dst); - return lp_build_format_swizzle_aos(format_desc, &bld, res); + if (num_pixels % 8 == 0 && + (type.width == 8 && type.floating == 0 && type.fixed == 0)) { + lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1); + } + + return lp_build_format_swizzle_aos(format_desc, &bld, res[0]); } /* If all channels are of same type and we are not using half-floats */ @@ -514,6 +724,34 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, return tmp; } + /* + * s3tc rgb formats + */ + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) { + struct lp_type tmp_type; + LLVMValueRef tmp; + + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = num_pixels * 4; + tmp_type.norm = TRUE; + + tmp = lp_build_fetch_cached_texels(gallivm, + format_desc, + num_pixels, + base_ptr, + offset, + i, j, + cache); + + lp_build_conv(gallivm, + tmp_type, type, + &tmp, 1, &tmp, 1); + + return tmp; + } + /* * Fallback to util_format_description::fetch_rgba_8unorm(). */ @@ -562,7 +800,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, arg_types[2] = i32t; arg_types[3] = i32t; function_type = LLVMFunctionType(ret_type, arg_types, - Elements(arg_types), 0); + ARRAY_SIZE(arg_types), 0); /* make const pointer for the C fetch_rgba_8unorm function */ function = lp_build_const_int_pointer(gallivm, @@ -600,7 +838,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, args[3] = LLVMBuildExtractElement(builder, j, index, ""); } - LLVMBuildCall(builder, function, args, Elements(args), ""); + LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), ""); tmp = LLVMBuildLoad(builder, tmp_ptr, ""); @@ -669,7 +907,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, function = lp_build_const_func_pointer(gallivm, func_to_pointer((func_pointer) format_desc->fetch_rgba_float), ret_type, - arg_types, Elements(arg_types), + arg_types, ARRAY_SIZE(arg_types), format_desc->short_name); } @@ -697,7 +935,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, args[3] = LLVMBuildExtractElement(builder, j, index, ""); } - LLVMBuildCall(builder, function, args, Elements(args), ""); + LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), ""); tmps[k] = LLVMBuildLoad(builder, tmp_ptr, ""); } @@ -710,6 +948,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, return res; } + assert(!util_format_is_pure_integer(format_desc->format)); + assert(0); return lp_build_undef(gallivm, type); }