+
+
+
+
+/**
+ * Fetch one or more pixels into an AoS (RGBARGBA...) vector of type \p type.
+ *
+ * The number of pixels fetched is type.length / 4. Several strategies are
+ * tried in order, and the first one whose preconditions hold is used:
+ *  1. trivial case: the format already matches \p type (modulo swizzle);
+ *  2. small unorm channels expanded to unorm8 with integer bit arithmetic;
+ *  3. generic bit arithmetic, unpacking via <4 x float> per pixel;
+ *  4. array formats, delegated to lp_build_fetch_rgba_aos_array();
+ *  5. subsampled (YUV) formats;
+ *  6. S3TC formats, only when \p cache is non-NULL;
+ *  7. call to util_format_description::fetch_rgba_8unorm (unorm8 dst only);
+ *  8. call to util_format_description::fetch_rgba_float.
+ *
+ * \param gallivm      the gallivm state supplying the LLVM builder/context
+ * \param format_desc  describes format of the image we're fetching from
+ * \param type         type of the returned vector; its length must be a
+ *                     multiple of 4 and at most LP_MAX_VECTOR_LENGTH
+ * \param aligned      whether the data is guaranteed to be aligned
+ * \param base_ptr     base address the pixel blocks (or the texels if
+ *                     uncompressed) are fetched from; handed to the gather
+ *                     helpers together with \p offset
+ * \param offset       offset(s) relative to \p base_ptr, as expected by the
+ *                     gather helpers (presumably one per pixel -- confirm
+ *                     against lp_build_gather())
+ * \param i, j         the sub-block pixel coordinates. For non-compressed
+ *                     formats these will always be (0, 0). When more than
+ *                     one pixel is fetched they are indexed per pixel with
+ *                     LLVMBuildExtractElement, so they must be vectors.
+ * \param cache        texel cache passed to lp_build_fetch_cached_texels()
+ *                     for S3TC formats; when NULL the S3TC path is skipped
+ * \return a vector of type \p type with the pixels' RGBA values. If no
+ *         strategy applies, an assertion fires and an undefined value of
+ *         type \p type is returned.
+ */
+LLVMValueRef
+lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
+                        const struct util_format_description *format_desc,
+                        struct lp_type type,
+                        boolean aligned,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
+                        LLVMValueRef i,
+                        LLVMValueRef j,
+                        LLVMValueRef cache)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned num_pixels = type.length / 4;
+   struct lp_build_context bld;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(type.length % 4 == 0);
+
+   lp_build_context_init(&bld, gallivm, type);
+
+   /*
+    * Trivial case
+    *
+    * The format matches the type (apart from a swizzle) so no need for
+    * scaling or converting.
+    */
+
+   if (format_matches_type(format_desc, type) &&
+       format_desc->block.bits <= type.width * 4 &&
+       /* XXX this shouldn't be needed */
+       util_is_power_of_two(format_desc->block.bits)) {
+      LLVMValueRef packed;
+      LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
+      struct lp_type fetch_type;
+      unsigned vec_len = type.width * type.length;
+
+      /* Gather each pixel as one integer four channels wide. */
+      fetch_type = lp_type_uint(type.width*4);
+      packed = lp_build_gather(gallivm, num_pixels,
+                               format_desc->block.bits, fetch_type,
+                               aligned, base_ptr, offset, TRUE);
+
+      assert(format_desc->block.bits <= vec_len);
+      (void) vec_len; /* silence unused var warning for non-debug build */
+
+      packed = LLVMBuildBitCast(builder, packed, dst_vec_type, "");
+      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
+   }
+
+   /*
+    * Bit arithmetic for converting small_unorm to unorm8.
+    *
+    * This misses some opportunities for optimizations (like skipping mask
+    * for the highest channel for instance, or doing bit scaling in parallel
+    * for channels with the same bit width) but it should be passable for
+    * all arithmetic formats.
+    */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+       util_format_fits_8unorm(format_desc) &&
+       type.width == 8 && type.norm == 1 && type.sign == 0 &&
+       type.fixed == 0 && type.floating == 0) {
+      LLVMValueRef packed, res, chans[4], rgba[4];
+      LLVMTypeRef dst_vec_type, conv_vec_type;
+      struct lp_type fetch_type, conv_type;
+      struct lp_build_context bld_conv;
+      unsigned chan; /* channel index; must not shadow the 'j' parameter */
+
+      fetch_type = lp_type_uint(type.width*4);
+      conv_type = lp_type_int_vec(type.width*4, type.width * type.length);
+      dst_vec_type = lp_build_vec_type(gallivm, type);
+      conv_vec_type = lp_build_vec_type(gallivm, conv_type);
+      lp_build_context_init(&bld_conv, gallivm, conv_type);
+
+      packed = lp_build_gather(gallivm, num_pixels,
+                               format_desc->block.bits, fetch_type,
+                               aligned, base_ptr, offset, TRUE);
+
+      assert(format_desc->block.bits * type.length / 4 <=
+             type.width * type.length);
+
+      packed = LLVMBuildBitCast(builder, packed, conv_vec_type, "");
+
+      for (chan = 0; chan < format_desc->nr_channels; ++chan) {
+         unsigned sa = format_desc->channel[chan].shift;
+         unsigned mask = (1 << format_desc->channel[chan].size) - 1;
+
+         /* Extract bits from source */
+         chans[chan] = LLVMBuildLShr(builder, packed,
+                                     lp_build_const_int_vec(gallivm, conv_type, sa),
+                                     "");
+
+         chans[chan] = LLVMBuildAnd(builder, chans[chan],
+                                    lp_build_const_int_vec(gallivm, conv_type, mask),
+                                    "");
+
+         /*
+          * Scale bits. This is unconditional: the branch is only entered
+          * when type.norm is set.
+          */
+         chans[chan] = scale_bits_up(gallivm, format_desc->channel[chan].size,
+                                     type.width, chans[chan], conv_type);
+      }
+      /*
+       * This is a hacked lp_build_format_swizzle_soa() since we need a
+       * normalized 1 but only 8 bits in a 32bit vector...
+       */
+      for (chan = 0; chan < 4; ++chan) {
+         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
+         if (swizzle == PIPE_SWIZZLE_1) {
+            rgba[chan] = lp_build_const_int_vec(gallivm, conv_type, (1 << type.width) - 1);
+         } else {
+            rgba[chan] = lp_build_swizzle_soa_channel(&bld_conv, chans, swizzle);
+         }
+         if (chan == 0) {
+            res = rgba[chan];
+         } else {
+            /* Move the channel to its byte position and merge it in. */
+            rgba[chan] = LLVMBuildShl(builder, rgba[chan],
+                                      lp_build_const_int_vec(gallivm, conv_type,
+                                                             chan * type.width), "");
+            res = LLVMBuildOr(builder, res, rgba[chan], "");
+         }
+      }
+      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+
+      return res;
+   }
+
+   /*
+    * Bit arithmetic
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
+        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
+       format_desc->block.width == 1 &&
+       format_desc->block.height == 1 &&
+       /* XXX this shouldn't be needed */
+       util_is_power_of_two(format_desc->block.bits) &&
+       format_desc->block.bits <= 32 &&
+       format_desc->is_bitmask &&
+       !format_desc->is_mixed &&
+       (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
+        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED) &&
+       !format_desc->channel[0].pure_integer) {
+
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res[LP_MAX_VECTOR_WIDTH / 128];
+      struct lp_type conv_type;
+      unsigned k, num_conv_src, num_conv_dst;
+
+      /*
+       * Note this path is generally terrible for fetching multiple pixels.
+       * We should make sure we cannot hit this code path for anything but
+       * single pixels.
+       */
+
+      /*
+       * Unpack a pixel at a time into a <4 x float> RGBA vector
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef packed;
+
+         packed = lp_build_gather_elem(gallivm, num_pixels,
+                                       format_desc->block.bits, 32, aligned,
+                                       base_ptr, offset, k, FALSE);
+
+         tmps[k] = lp_build_unpack_arith_rgba_aos(gallivm,
+                                                  format_desc,
+                                                  packed);
+      }
+
+      /*
+       * Type conversion.
+       *
+       * TODO: We could avoid floating conversion for integer to
+       * integer conversions.
+       */
+
+      if ((gallivm_debug & GALLIVM_DEBUG_PERF) && !type.floating) {
+         debug_printf("%s: unpacking %s with floating point\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
+
+      conv_type = lp_float32_vec4_type();
+      num_conv_src = num_pixels;
+      num_conv_dst = 1;
+
+      if (num_pixels % 8 == 0) {
+         /* Pair up the per-pixel vectors in place before converting. */
+         lp_build_concat_n(gallivm, lp_float32_vec4_type(),
+                           tmps, num_pixels, tmps, num_pixels / 2);
+         conv_type.length *= num_pixels / 4;
+         num_conv_src = 4 * num_pixels / 8;
+         if (type.width == 8 && type.floating == 0 && type.fixed == 0) {
+            /*
+             * FIXME: The fast float->unorm path (which is basically
+             * skipping the MIN/MAX which are extremely pointless in any
+             * case) requires that there's 2 destinations...
+             * In any case, we really should make sure we don't hit this
+             * code with multiple pixels for unorm8 dst types, it's
+             * completely hopeless even if we do hit the right conversion.
+             */
+            /*
+             * Note: this shrinks the local 'type' only; 'bld' was
+             * initialized with the original type and is left untouched.
+             */
+            type.length /= num_pixels / 4;
+            num_conv_dst = num_pixels / 4;
+         }
+      }
+
+      lp_build_conv(gallivm, conv_type, type,
+                    tmps, num_conv_src, res, num_conv_dst);
+
+      if (num_pixels % 8 == 0 &&
+          (type.width == 8 && type.floating == 0 && type.fixed == 0)) {
+         /* Glue the split destinations back into a single vector. */
+         lp_build_concat_n(gallivm, type, res, num_conv_dst, res, 1);
+      }
+
+      return lp_build_format_swizzle_aos(format_desc, &bld, res[0]);
+   }
+
+   /* If all channels are of same type and we are not using half-floats */
+   if (format_desc->is_array &&
+       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) {
+      assert(!format_desc->is_mixed);
+      return lp_build_fetch_rgba_aos_array(gallivm, format_desc, type, base_ptr, offset);
+   }
+
+   /*
+    * YUV / subsampled formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      /* Intermediate type: unsigned normalized 8 bit, 4 channels per pixel */
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_subsampled_rgba_aos(gallivm,
+                                               format_desc,
+                                               num_pixels,
+                                               base_ptr,
+                                               offset,
+                                               i, j);
+
+      lp_build_conv(gallivm,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+      return tmp;
+   }
+
+   /*
+    * s3tc rgb formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      /* Intermediate type: unsigned normalized 8 bit, 4 channels per pixel */
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_cached_texels(gallivm,
+                                         format_desc,
+                                         num_pixels,
+                                         base_ptr,
+                                         offset,
+                                         i, j,
+                                         cache);
+
+      lp_build_conv(gallivm,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+      return tmp;
+   }
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_8unorm().
+    */
+
+   if (format_desc->fetch_rgba_8unorm &&
+       !type.floating && type.width == 8 && !type.sign && type.norm) {
+      /*
+       * Fallback to calling util_format_description::fetch_rgba_8unorm.
+       *
+       * This is definitely not the most efficient way of fetching pixels, as
+       * we miss the opportunity to do vectorization, but it is convenient
+       * for formats or scenarios for which there was no opportunity or
+       * incentive to optimize.
+       */
+
+      LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+      LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+      LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
+      LLVMValueRef tmp;
+      LLVMValueRef res;
+      unsigned k;
+
+      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+         debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
+
+      /*
+       * Declare and bind format_desc->fetch_rgba_8unorm().
+       */
+
+      {
+         /*
+          * Function to call looks like:
+          *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+          */
+         LLVMTypeRef ret_type;
+         LLVMTypeRef arg_types[4];
+         LLVMTypeRef function_type;
+
+         ret_type = LLVMVoidTypeInContext(gallivm->context);
+         arg_types[0] = pi8t;
+         arg_types[1] = pi8t;
+         arg_types[2] = i32t;
+         arg_types[3] = i32t;
+         function_type = LLVMFunctionType(ret_type, arg_types,
+                                          ARRAY_SIZE(arg_types), 0);
+
+         /* make const pointer for the C fetch_rgba_8unorm function */
+         function = lp_build_const_int_pointer(gallivm,
+            func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
+
+         /* cast the callee pointer to the function's type */
+         function = LLVMBuildBitCast(builder, function,
+                                     LLVMPointerType(function_type, 0),
+                                     "cast callee");
+      }
+
+      /* One i32 of scratch space: the callee writes 4 x uint8 RGBA into it. */
+      tmp_ptr = lp_build_alloca(gallivm, i32t, "");
+
+      res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
+
+      /*
+       * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert
+       * the packed result into the AoS result vector.
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef index = lp_build_const_int32(gallivm, k);
+         LLVMValueRef args[4];
+
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
+         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
+                                            base_ptr, offset, k);
+
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
+
+         tmp = LLVMBuildLoad(builder, tmp_ptr, "");
+
+         if (num_pixels == 1) {
+            res = tmp;
+         }
+         else {
+            res = LLVMBuildInsertElement(builder, res, tmp, index, "");
+         }
+      }
+
+      /* Bitcast from <n x i32> to <4n x i8> */
+      res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
+
+      return res;
+   }
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_float().
+    */
+
+   if (format_desc->fetch_rgba_float) {
+      /*
+       * Fallback to calling util_format_description::fetch_rgba_float.
+       *
+       * This is definitely not the most efficient way of fetching pixels, as
+       * we miss the opportunity to do vectorization, but it is convenient
+       * for formats or scenarios for which there was no opportunity or
+       * incentive to optimize.
+       */
+
+      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
+      LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
+      LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
+      LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
+      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+      LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res;
+      unsigned k;
+
+      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+         debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
+
+      /*
+       * Declare and bind format_desc->fetch_rgba_float().
+       */
+
+      {
+         /*
+          * Function to call looks like:
+          *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
+          */
+         LLVMTypeRef ret_type;
+         LLVMTypeRef arg_types[4];
+
+         ret_type = LLVMVoidTypeInContext(gallivm->context);
+         arg_types[0] = pf32t;
+         arg_types[1] = pi8t;
+         arg_types[2] = i32t;
+         arg_types[3] = i32t;
+
+         function = lp_build_const_func_pointer(gallivm,
+                                                func_to_pointer((func_pointer) format_desc->fetch_rgba_float),
+                                                ret_type,
+                                                arg_types, ARRAY_SIZE(arg_types),
+                                                format_desc->short_name);
+      }
+
+      /* Scratch <4 x float> the callee writes one RGBA pixel into. */
+      tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
+
+      /*
+       * Invoke format_desc->fetch_rgba_float() for each pixel and collect
+       * the per-pixel <4 x float> results.
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef args[4];
+
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
+         args[1] = lp_build_gather_elem_ptr(gallivm, num_pixels,
+                                            base_ptr, offset, k);
+
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            LLVMValueRef index = lp_build_const_int32(gallivm, k);
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
+
+         tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
+      }
+
+      /* Convert the float pixels into a single vector of the requested type. */
+      lp_build_conv(gallivm,
+                    lp_float32_vec4_type(),
+                    type,
+                    tmps, num_pixels, &res, 1);
+
+      return res;
+   }
+
+   assert(!util_format_is_pure_integer(format_desc->format));
+
+   /* No fetch strategy applies to this format/type combination. */
+   assert(0);
+   return lp_build_undef(gallivm, type);
+}