#include "util/u_math.h"
#include "util/u_pointer.h"
#include "util/u_string.h"
+#include "util/u_cpu_detect.h"
#include "lp_bld_arit.h"
#include "lp_bld_init.h"
#include "lp_bld_gather.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
+#include "lp_bld_pack.h"
#include "lp_bld_intr.h"
LLVMValueRef shifts[4];
LLVMValueRef masks[4];
LLVMValueRef scales[4];
+ LLVMTypeRef vec32_type;
boolean normalized;
boolean needs_uitofp;
* matches floating point size */
assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
+ vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+
/* Broadcast the packed value to all four channels
* before: packed = BGRA
* after: packed = {BGRA, BGRA, BGRA, BGRA}
*/
- packed = LLVMBuildInsertElement(builder,
- LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
- packed,
+ packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,
LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
"");
- packed = LLVMBuildShuffleVector(builder,
- packed,
- LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
- LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
+ packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
+ LLVMConstNull(vec32_type),
"");
/* Initialize vector constants */
/* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
* into masked = {X, Y, Z, W}
*/
- /* Note: we cannot do this shift on x86 natively until AVX2. */
- shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
- masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+ if (desc->block.bits < 32 && normalized) {
+ /*
+ * Note: we cannot do the shift below on x86 natively until AVX2.
+ *
+ * Old llvm versions will resort to scalar extract/shift insert,
+ * which is definitely terrible, new versions will just do
+ * several vector shifts and shuffle/blend results together.
+ * We could turn this into a variable left shift plus a constant
+ * right shift, and llvm would then turn the variable left shift
+ * into a mul for us (albeit without sse41 the mul needs emulation
+ * too...). However, since we're going to do a float mul
+ * anyway, we just adjust that mul instead (plus the mask), skipping
+ * the shift completely.
+ * We could also use an extra mul when the format isn't normalized and
+ * we don't have AVX2 support, but don't bother for now. Unfortunately,
+ * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
+ * rgba8 if it ends up here), as that would require UIToFP, albeit that
+ * would be fixable with an easy 16bit shuffle (unless there are channels
+ * crossing 16bit boundaries).
+ */
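+ /*
+ * E.g. for rgb565's green channel (bits == 6, shift == 5) this gives
+ * mask == ((1ULL << 6) - 1) << 5 == 0x7e0 and a scale of 1.0/0x7e0,
+ * so (packed & 0x7e0) * scale is already the normalized value, with
+ * no shift required.
+ */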
+ for (i = 0; i < 4; ++i) {
+ if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+ unsigned bits = desc->channel[i].size;
+ unsigned shift = desc->channel[i].shift;
+ unsigned long long mask = ((1ULL << bits) - 1) << shift;
+ scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
+ masks[i] = lp_build_const_int32(gallivm, mask);
+ }
+ }
+ masked = LLVMBuildAnd(builder, packed, LLVMConstVector(masks, 4), "");
+ } else {
+ shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
+ masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+ }
if (!needs_uitofp) {
/* UIToFP can't be expressed in SSE2, but the masked values fit in
* 31 bits here, so a signed conversion gives the same result */
casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
} else {
casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4), "");
}
- /* At this point 'casted' may be a vector of floats such as
- * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized
+ /*
+ * At this point 'casted' may be a vector of floats such as
+ * {255.0, 255.0, 255.0, 255.0}. (In the normalized path above the mask
+ * keeps the channel shift, so values may additionally be scaled by
+ * powers of two.) Next, if the pixel values are normalized
* we'll scale this to {1.0, 1.0, 1.0, 1.0}.
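+ * (E.g. plain rgba8 gives scales of 1/255 here, so 255.0 maps to 1.0,
+ * while the masked rgb565 case above maps its shifted channel ranges
+ * to 1.0 via the adjusted scales.)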
*/
if (format_matches_type(format_desc, type) &&
format_desc->block.bits <= type.width * 4 &&
+ /* XXX this shouldn't be needed */
util_is_power_of_two(format_desc->block.bits)) {
LLVMValueRef packed;
LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
format_desc->block.width == 1 &&
format_desc->block.height == 1 &&
+ /* XXX this shouldn't be needed */
util_is_power_of_two(format_desc->block.bits) &&
format_desc->block.bits <= 32 &&
format_desc->is_bitmask &&
!format_desc->channel[0].pure_integer) {
LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
- LLVMValueRef res;
- unsigned k;
+ LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
+ struct lp_type conv_type;
+ unsigned k, num_conv_src, num_conv_dst;
+
+ /*
+ * XXX: We end up here for the AoS unorm8 sampling (if the format wasn't some
+ * 888(8) variant), so things like rgb565. This is _really_ suboptimal.
+ * Not only do we handle a single pixel at a time, but we convert to float,
+ * do a normalize mul, an un-normalize mul, convert back to int, and finally
+ * pack down to 8 bits. At the end we throw in a couple of shifts/ands/ors
+ * for the aos swizzle (well, rgb565 is ok, but bgrx5551 isn't, for
+ * instance) for good measure. (And if we're not extra careful we also get
+ * some pointless min/max for clamping values to range.) This is a disaster
+ * of epic proportions; simply forcing SoA sampling would be way faster
+ * (even when we don't have AVX support).
+ * We should make sure we cannot hit this code path for anything but single
+ * pixels.
+ */
/*
* Unpack a pixel at a time into a <4 x float> RGBA vector
*/
__FUNCTION__, format_desc->short_name);
}
- lp_build_conv(gallivm,
- lp_float32_vec4_type(),
- type,
- tmps, num_pixels, &res, 1);
+ conv_type = lp_float32_vec4_type();
+ num_conv_src = num_pixels;
+ num_conv_dst = 1;
+
+ if (num_pixels % 8 == 0) {
+ lp_build_concat_n(gallivm, lp_float32_vec4_type(),
+ tmps, num_pixels, tmps, num_pixels / 2);
+ conv_type.length *= num_pixels / 4;
+ num_conv_src = 4 * num_pixels / 8;
+ if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
+ /*
+ * FIXME: The fast float->unorm path (which is basically
+ * skipping the MIN/MAX which are extremely pointless in any
+ * case) requires that there's 2 destinations...
+ * In any case, we really should make sure we don't hit this
+ * code with multiple pixels for unorm8 dst types, it's
+ * completely hopeless even if we do hit the right conversion.
+ */
+ type.length /= num_pixels / 4;
+ num_conv_dst = num_pixels / 4;
+ }
+ }
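+
+ /*
+ * E.g. for num_pixels == 8: the eight <4 x float> pixels get
+ * concatenated into four <8 x float> vectors (num_conv_src == 4), and
+ * for a unorm8 destination type.length is halved so num_conv_dst == 2,
+ * which is the two-destination shape the fast float->unorm path wants;
+ * the two results are concatenated back into one below.
+ */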
+
+ lp_build_conv(gallivm, conv_type, type,
+ tmps, num_conv_src, res, num_conv_dst);
+
+ if (num_pixels % 8 == 0 &&
+ (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
+ lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
+ }
- return lp_build_format_swizzle_aos(format_desc, &bld, res);
+ return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
}
/* If all channels are of same type and we are not using half-floats */