}
+/**
+ * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
+ *
+ * Returns the number of dsts created from src
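+ *
+ * e.g. with SSE2, 8 srcs of 4 x f32 converted to unorm8 yield 2 dsts of
+ * 16 x u8, and 2 is returned.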
+ */
+int lp_build_conv_auto(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type* dst_type,
+ const LLVMValueRef *src,
+ unsigned num_srcs,
+ LLVMValueRef *dst)
+{
+ int i;
+ int num_dsts = num_srcs;
+
+ if (src_type.floating == dst_type->floating &&
+ src_type.width == dst_type->width &&
+ src_type.length == dst_type->length &&
+ src_type.fixed == dst_type->fixed &&
+ src_type.norm == dst_type->norm &&
+ src_type.sign == dst_type->sign)
+ return num_dsts;
+
+ /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
+ */
+ if (src_type.floating == 1 &&
+ src_type.fixed == 0 &&
+ src_type.sign == 1 &&
+ src_type.norm == 0 &&
+ src_type.width == 32 &&
+
+ dst_type->floating == 0 &&
+ dst_type->fixed == 0 &&
+ dst_type->sign == 0 &&
+ dst_type->norm == 1 &&
+ dst_type->width == 8)
+ {
+ /* Special case 4x4f --> 1x16ub */
+ if (src_type.length == 4 && util_cpu_caps.has_sse2)
+ {
+ assert((num_srcs % 4) == 0);
+
+ num_dsts = num_srcs / 4;
+ dst_type->length = 16;
+
+ lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
+ return num_dsts;
+ }
+
+ /* Special case 2x8f --> 1x16ub */
+ if (src_type.length == 8 && util_cpu_caps.has_avx)
+ {
+ assert((num_srcs % 2) == 0);
+
+ num_dsts = num_srcs / 2;
+ dst_type->length = 16;
+
+ lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
+ return num_dsts;
+ }
+ }
+
+ /* lp_build_resize does not support M:N conversion, so when the widths
+  * differ convert the vectors one at a time.
+  */
+ if (src_type.width == dst_type->width) {
+ lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
+ } else {
+ for (i = 0; i < num_srcs; ++i) {
+ lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
+ }
+ }
+
+ return num_dsts;
+}
+
+
/**
* Generic type conversion.
*
const LLVMValueRef *srcs, unsigned num_srcs,
LLVMValueRef *dsts, unsigned num_dsts);
+
+int
+lp_build_conv_auto(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ struct lp_type* dst_type,
+ const LLVMValueRef *src,
+ unsigned num_srcs,
+ LLVMValueRef *dst);
+
+
void
lp_build_conv_mask(struct gallivm_state *gallivm,
struct lp_type src_type,
lp_build_select_aos(struct lp_build_context *bld,
unsigned mask,
LLVMValueRef a,
- LLVMValueRef b)
+ LLVMValueRef b,
+ unsigned num_channels)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
- for(j = 0; j < n; j += 4)
- for(i = 0; i < 4; ++i)
+ for(j = 0; j < n; j += num_channels)
+ for(i = 0; i < num_channels; ++i)
shuffles[j + i] = LLVMConstInt(elem_type,
(mask & (1 << i) ? 0 : n) + j + i,
0);
return LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles, n), "");
}
else {
- LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, 4);
+ LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, num_channels);
return lp_build_select(bld, mask_vec, a, b);
}
}
lp_build_select_aos(struct lp_build_context *bld,
unsigned mask,
LLVMValueRef a,
- LLVMValueRef b);
+ LLVMValueRef b,
+ unsigned num_channels);
LLVMValueRef
return tmp[0];
}
+
+/**
+ * Combines vectors to reduce from num_srcs to num_dsts.
+ * Returns the number of src vectors concatenated in a single dst.
+ *
+ * num_srcs must be exactly divisible by num_dsts.
+ *
+ * e.g. For num_srcs = 4 and src = [x, y, z, w]
+ * num_dsts = 1 dst = [xyzw] return = 4
+ * num_dsts = 2 dst = [xy, zw] return = 2
+ */
+int
+lp_build_concat_n(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef *src,
+ unsigned num_srcs,
+ LLVMValueRef *dst,
+ unsigned num_dsts)
+{
+ int size = num_srcs / num_dsts;
+ int i;
+
+ assert(num_srcs >= num_dsts);
+ assert((num_srcs % size) == 0);
+
+ if (num_srcs == num_dsts)
+ return 1;
+
+ for (i = 0; i < num_dsts; ++i) {
+ dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
+ }
+
+ return size;
+}
+
+
/**
* Interleave vector elements.
*
struct lp_type src_type,
unsigned num_vectors);
+int
+lp_build_concat_n(struct gallivm_state *gallivm,
+ struct lp_type src_type,
+ LLVMValueRef *src,
+ unsigned num_srcs,
+ LLVMValueRef *dst,
+ unsigned num_dsts);
+
+
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
struct lp_type src_type,
#include "lp_bld_const.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_quad.h"
+#include "lp_bld_pack.h"
static const unsigned char
return LLVMBuildSub(builder, vec2, vec1, "ddxddyddxddy");
}
+
+/**
+ * Twiddle from quad format to row format
+ *
+ * src0 src1
+ * ######### ######### #################
+ * # 0 | 1 # # 4 | 5 # # 0 | 1 | 4 | 5 # src0
+ * #---+---# #---+---# -> #################
+ * # 2 | 3 # # 6 | 7 # # 2 | 3 | 6 | 7 # src1
+ * ######### ######### #################
+ *
+ */
+void
+lp_bld_quad_twiddle(struct gallivm_state *gallivm,
+ struct lp_type lp_dst_type,
+ const LLVMValueRef* src,
+ unsigned src_count,
+ LLVMValueRef* dst)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMTypeRef dst_type_ref;
+ LLVMTypeRef type2_ref;
+ struct lp_type type2;
+ unsigned i;
+
+ assert((src_count % 2) == 0);
+
+ /* Create a type with only 2 elements: each element then covers one 2x1
+  * row of a quad, so interleaving pairs up matching rows of the two quads.
+  */
+ type2 = lp_dst_type;
+ type2.width = (lp_dst_type.width * lp_dst_type.length) / 2;
+ type2.length = 2;
+ type2.floating = 0;
+
+ type2_ref = lp_build_vec_type(gallivm, type2);
+ dst_type_ref = lp_build_vec_type(gallivm, lp_dst_type);
+
+ for (i = 0; i < src_count; i += 2) {
+ LLVMValueRef src0, src1;
+
+ src0 = LLVMBuildBitCast(builder, src[i + 0], type2_ref, "");
+ src1 = LLVMBuildBitCast(builder, src[i + 1], type2_ref, "");
+
+ dst[i + 0] = lp_build_interleave2(gallivm, type2, src0, src1, 0);
+ dst[i + 1] = lp_build_interleave2(gallivm, type2, src0, src1, 1);
+
+ dst[i + 0] = LLVMBuildBitCast(builder, dst[i + 0], dst_type_ref, "");
+ dst[i + 1] = LLVMBuildBitCast(builder, dst[i + 1], dst_type_ref, "");
+ }
+}
lp_build_packed_ddx_ddy_onecoord(struct lp_build_context *bld,
LLVMValueRef a);
+/*
+ * Twiddle from quad format to row format
+ */
+void
+lp_bld_quad_twiddle(struct gallivm_state *gallivm,
+ struct lp_type lp_dst_type,
+ const LLVMValueRef* src,
+ unsigned src_count,
+ LLVMValueRef* dst);
#endif /* LP_BLD_QUAD_H_ */
offset1 = LLVMBuildLoad(builder, offset1, "");
offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
}
- offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0);
+ offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
}
else {
unsigned i;
stride1 = LLVMBuildLoad(builder, stride1, "");
stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
}
- stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0);
+ stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
}
else {
LLVMValueRef stride1;
*out_width = size;
}
else if (bld->num_lods == num_quads) {
- *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0);
+ *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
if (dims >= 2) {
- *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1);
+ *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
if (dims == 3) {
- *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2);
+ *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
}
}
}
signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");
- arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0);
- arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1);
- arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2);
+ arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0, 4);
+ arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1, 4);
+ arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2, 4);
/*
* select x if x >= y else select y
* snewz = signrz * rx;
* tnewz = -ry;
*/
- signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0);
+ signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0, 4);
snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
tnewx = ryneg;
- signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1);
+ signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1, 4);
snewy = rx;
tnewy = LLVMBuildXor(builder, signrys, rz, "");
- signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2);
+ signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2, 4);
snewz = LLVMBuildXor(builder, signrzs, rx, "");
tnewz = ryneg;
/**
- * Swizzle one channel into all other three channels.
+ * Broadcast one channel of each pixel into its other channels.
*/
LLVMValueRef
lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
LLVMValueRef a,
- unsigned channel)
+ unsigned channel,
+ unsigned num_channels)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const struct lp_type type = bld->type;
const unsigned n = type.length;
unsigned i, j;
- if(a == bld->undef || a == bld->zero || a == bld->one)
+ if(a == bld->undef || a == bld->zero || a == bld->one || num_channels == 1)
return a;
+ assert(num_channels == 2 || num_channels == 4);
+
/* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
* using shuffles here actually causes worst results. More investigation is
* needed. */
LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
- for(j = 0; j < n; j += 4)
- for(i = 0; i < 4; ++i)
+ for(j = 0; j < n; j += num_channels)
+ for(i = 0; i < num_channels; ++i)
shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0);
return LLVMBuildShuffleVector(builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
}
+ else if (num_channels == 2) {
+ /*
+ * Bit mask and shifts
+ *
+       * XY XY .... XY  <= input
+       * 0Y 0Y .... 0Y  <= masked
+       * Y0 Y0 .... Y0  <= shifted
+       * YY YY .... YY  <= output
+ */
+ struct lp_type type2;
+ LLVMValueRef tmp = NULL;
+ int shift;
+
+ a = LLVMBuildAnd(builder, a,
+ lp_build_const_mask_aos(bld->gallivm,
+ type, 1 << channel, num_channels), "");
+
+ type2 = type;
+ type2.floating = FALSE;
+ type2.width *= 2;
+ type2.length /= 2;
+
+ a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type2), "");
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+ shift = channel == 0 ? 1 : -1;
+#else
+ shift = channel == 0 ? -1 : 1;
+#endif
+
+ if (shift > 0) {
+ tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type2, shift * type.width), "");
+ } else if (shift < 0) {
+ tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type2, -shift * type.width), "");
+ }
+
+ assert(tmp);
+ if (tmp) {
+ a = LLVMBuildOr(builder, a, tmp, "");
+ }
+
+ return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), "");
+ }
else {
/*
* Bit mask and recursive shifts
}
+/**
+ * Swizzle a vector consisting of an array of XYZW structs.
+ *
+ * This fills a vector of dst_len length with the swizzled channels from src.
+ *
+ * e.g. with swizzles = { 2, 1, 0 }, num_swizzles = 3 and dst_len = 8,
+ *      src = RGBA RGBA gives BGR BGR BG
+ *
+ * @param swizzles the swizzle array
+ * @param num_swizzles the number of elements in swizzles
+ * @param dst_len the length of the result
+ */
+LLVMValueRef
+lp_build_swizzle_aos_n(struct gallivm_state* gallivm,
+ LLVMValueRef src,
+ const unsigned char* swizzles,
+ unsigned num_swizzles,
+ unsigned dst_len)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH];
+ unsigned i;
+
+ assert(dst_len < LP_MAX_VECTOR_WIDTH);
+
+ for (i = 0; i < dst_len; ++i) {
+ int swizzle = swizzles[i % num_swizzles];
+
+ if (swizzle == LP_BLD_SWIZZLE_DONTCARE) {
+ shuffles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ } else {
+ shuffles[i] = lp_build_const_int32(gallivm, swizzle);
+ }
+ }
+
+ return LLVMBuildShuffleVector(builder, src, LLVMGetUndef(LLVMTypeOf(src)), LLVMConstVector(shuffles, dst_len), "");
+}
+
+
LLVMValueRef
lp_build_swizzle_aos(struct lp_build_context *bld,
LLVMValueRef a,
case PIPE_SWIZZLE_GREEN:
case PIPE_SWIZZLE_BLUE:
case PIPE_SWIZZLE_ALPHA:
- return lp_build_swizzle_scalar_aos(bld, a, swizzles[0]);
+ return lp_build_swizzle_scalar_aos(bld, a, swizzles[0], 4);
case PIPE_SWIZZLE_ZERO:
return bld->zero;
case PIPE_SWIZZLE_ONE:
cond |= 1 << chan;
}
}
- res = lp_build_select_aos(bld, cond, bld->one, bld->zero);
+ res = lp_build_select_aos(bld, cond, bld->one, bld->zero, 4);
/*
* Build a type where each element is an integer that cover the four
}
+/**
+ * Transpose between AoS and SoA layouts for num_srcs vectors.
+ */
+void
+lp_build_transpose_aos_n(struct gallivm_state *gallivm,
+ struct lp_type type,
+ const LLVMValueRef* src,
+ unsigned num_srcs,
+ LLVMValueRef* dst)
+{
+ switch (num_srcs) {
+ case 1:
+ dst[0] = src[0];
+ break;
+
+ case 2:
+ {
+      /* Note: we must use a temporary in case src == dst */
+ LLVMValueRef lo, hi;
+
+ lo = lp_build_interleave2_half(gallivm, type, src[0], src[1], 0);
+ hi = lp_build_interleave2_half(gallivm, type, src[0], src[1], 1);
+
+ dst[0] = lo;
+ dst[1] = hi;
+ break;
+ }
+
+ case 4:
+ lp_build_transpose_aos(gallivm, type, src, dst);
+ break;
+
+ default:
+ assert(0);
+   }
+}
+
+
/**
* Pack n-th element of aos values,
* pad out to destination size.
/**
- * Broadcast one channel of a vector composed of arrays of XYZW structures into
- * all four channel.
+ * Broadcast one channel of a vector composed of arrays of XYZ... structures
+ * into all channels: XXX...
*/
LLVMValueRef
lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
- LLVMValueRef a,
- unsigned channel);
+ LLVMValueRef a,
+ unsigned channel,
+ unsigned num_channels);
/**
const unsigned char swizzles[4]);
+LLVMValueRef
+lp_build_swizzle_aos_n(struct gallivm_state* gallivm,
+ LLVMValueRef src,
+ const unsigned char* swizzles,
+ unsigned num_swizzles,
+ unsigned dst_len);
+
+
LLVMValueRef
lp_build_swizzle_soa_channel(struct lp_build_context *bld,
const LLVMValueRef *unswizzled,
LLVMValueRef dst[4]);
+void
+lp_build_transpose_aos_n(struct gallivm_state *gallivm,
+ struct lp_type type,
+ const LLVMValueRef* src,
+ unsigned num_srcs,
+ LLVMValueRef* dst);
+
+
LLVMValueRef
lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
struct lp_type src_type,
unsigned chan)
{
chan = bld->swizzles[chan];
- return lp_build_swizzle_scalar_aos(&bld->bld_base.base, a, chan);
+ return lp_build_swizzle_scalar_aos(&bld->bld_base.base, a, chan, 4);
}
case TGSI_OPCODE_EX2:
src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
- tmp0 = lp_build_swizzle_scalar_aos(&bld->bld_base.base, src0, TGSI_SWIZZLE_X);
+ tmp0 = lp_build_swizzle_scalar_aos(&bld->bld_base.base, src0, TGSI_SWIZZLE_X, TGSI_NUM_CHANNELS);
dst0 = lp_build_exp2(&bld->bld_base.base, tmp0);
break;
struct lp_type type,
unsigned rt,
LLVMValueRef src,
+ LLVMValueRef src_alpha,
LLVMValueRef dst,
LLVMValueRef mask,
LLVMValueRef const_,
- const unsigned char swizzle[4]);
+ LLVMValueRef const_alpha,
+ const unsigned char swizzle[4],
+ int nr_channels);
void
struct lp_build_blend_aos_context
{
struct lp_build_context base;
-
+
LLVMValueRef src;
+ LLVMValueRef src_alpha;
LLVMValueRef dst;
LLVMValueRef const_;
+ LLVMValueRef const_alpha;
LLVMValueRef inv_src;
+ LLVMValueRef inv_src_alpha;
LLVMValueRef inv_dst;
LLVMValueRef inv_const;
+ LLVMValueRef inv_const_alpha;
LLVMValueRef saturate;
LLVMValueRef rgb_src_factor;
unsigned factor,
boolean alpha)
{
+ LLVMValueRef src_alpha = bld->src_alpha ? bld->src_alpha : bld->src;
+ LLVMValueRef const_alpha = bld->const_alpha ? bld->const_alpha : bld->const_;
+
switch (factor) {
case PIPE_BLENDFACTOR_ZERO:
return bld->base.zero;
case PIPE_BLENDFACTOR_ONE:
return bld->base.one;
case PIPE_BLENDFACTOR_SRC_COLOR:
- case PIPE_BLENDFACTOR_SRC_ALPHA:
return bld->src;
+ case PIPE_BLENDFACTOR_SRC_ALPHA:
+ return src_alpha;
case PIPE_BLENDFACTOR_DST_COLOR:
case PIPE_BLENDFACTOR_DST_ALPHA:
return bld->dst;
if(!bld->inv_dst)
bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
if(!bld->saturate)
- bld->saturate = lp_build_min(&bld->base, bld->src, bld->inv_dst);
+ bld->saturate = lp_build_min(&bld->base, src_alpha, bld->inv_dst);
return bld->saturate;
}
case PIPE_BLENDFACTOR_CONST_COLOR:
- case PIPE_BLENDFACTOR_CONST_ALPHA:
return bld->const_;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ return const_alpha;
case PIPE_BLENDFACTOR_SRC1_COLOR:
case PIPE_BLENDFACTOR_SRC1_ALPHA:
/* TODO */
assert(0);
return bld->base.zero;
case PIPE_BLENDFACTOR_INV_SRC_COLOR:
- case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
if(!bld->inv_src)
bld->inv_src = lp_build_comp(&bld->base, bld->src);
return bld->inv_src;
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ if(!bld->inv_src_alpha)
+ bld->inv_src_alpha = lp_build_comp(&bld->base, src_alpha);
+ return bld->inv_src_alpha;
case PIPE_BLENDFACTOR_INV_DST_COLOR:
case PIPE_BLENDFACTOR_INV_DST_ALPHA:
if(!bld->inv_dst)
bld->inv_dst = lp_build_comp(&bld->base, bld->dst);
return bld->inv_dst;
case PIPE_BLENDFACTOR_INV_CONST_COLOR:
- case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
if(!bld->inv_const)
bld->inv_const = lp_build_comp(&bld->base, bld->const_);
return bld->inv_const;
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+ if(!bld->inv_const_alpha)
+ bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha);
+ return bld->inv_const_alpha;
case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
/* TODO */
LLVMValueRef rgb,
LLVMValueRef alpha,
enum lp_build_blend_swizzle rgb_swizzle,
- unsigned alpha_swizzle)
+ unsigned alpha_swizzle,
+ unsigned num_channels)
{
LLVMValueRef swizzled_rgb;
swizzled_rgb = rgb;
break;
case LP_BUILD_BLEND_SWIZZLE_AAAA:
- swizzled_rgb = lp_build_swizzle_scalar_aos(&bld->base, rgb, alpha_swizzle);
+ swizzled_rgb = lp_build_swizzle_scalar_aos(&bld->base, rgb, alpha_swizzle, num_channels);
break;
default:
assert(0);
if (rgb != alpha) {
swizzled_rgb = lp_build_select_aos(&bld->base, 1 << alpha_swizzle,
- alpha, swizzled_rgb);
+ alpha, swizzled_rgb,
+ num_channels);
}
return swizzled_rgb;
}
-
/**
* @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
*/
lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
unsigned rgb_factor,
unsigned alpha_factor,
- unsigned alpha_swizzle)
+ unsigned alpha_swizzle,
+ unsigned num_channels)
{
LLVMValueRef rgb_factor_, alpha_factor_;
enum lp_build_blend_swizzle rgb_swizzle;
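+
+   /* If alpha is in channel 0 (e.g. an A8 colour buffer) there are no
+    * separate colour channels, so the alpha factor applies directly.
+    */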
+ if (alpha_swizzle == 0) {
+ return lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
+ }
+
rgb_factor_ = lp_build_blend_factor_unswizzled(bld, rgb_factor, FALSE);
if (alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) {
rgb_swizzle = lp_build_blend_factor_swizzle(rgb_factor);
alpha_factor_ = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
- return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle);
+ return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle, num_channels);
} else {
return rgb_factor_;
}
struct lp_type type,
unsigned rt,
LLVMValueRef src,
+ LLVMValueRef src_alpha,
LLVMValueRef dst,
LLVMValueRef mask,
LLVMValueRef const_,
- const unsigned char swizzle[4])
+ LLVMValueRef const_alpha,
+ const unsigned char swizzle[4],
+ int nr_channels)
{
const struct pipe_rt_blend_state * state = &blend->rt[rt];
const struct util_format_description * desc;
struct lp_build_blend_aos_context bld;
LLVMValueRef src_factor, dst_factor;
LLVMValueRef result;
- unsigned alpha_swizzle = swizzle[3];
- boolean fullcolormask;
+ unsigned alpha_swizzle = UTIL_FORMAT_SWIZZLE_NONE;
+ unsigned i;
desc = util_format_description(cbuf_format[rt]);
bld.src = src;
bld.dst = dst;
bld.const_ = const_;
-
- if (swizzle[3] > UTIL_FORMAT_SWIZZLE_W || swizzle[3] == swizzle[0])
- alpha_swizzle = UTIL_FORMAT_SWIZZLE_NONE;
+ bld.src_alpha = src_alpha;
+ bld.const_alpha = const_alpha;
+
+   /* Find the alpha channel if not provided separately */
+ if (!src_alpha) {
+ for (i = 0; i < 4; ++i) {
+ if (swizzle[i] == 3) {
+ alpha_swizzle = i;
+ }
+ }
+ }
if (!state->blend_enable) {
result = src;
} else {
- boolean rgb_alpha_same = state->rgb_src_factor == state->rgb_dst_factor && state->alpha_src_factor == state->alpha_dst_factor;
- assert(rgb_alpha_same || alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
+      boolean rgb_alpha_same =
+         (state->rgb_src_factor == state->rgb_dst_factor &&
+          state->alpha_src_factor == state->alpha_dst_factor) ||
+         nr_channels == 1;
src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor,
- state->alpha_src_factor, alpha_swizzle);
+ state->alpha_src_factor,
+ alpha_swizzle,
+ nr_channels);
+
dst_factor = lp_build_blend_factor(&bld, state->rgb_dst_factor,
- state->alpha_dst_factor, alpha_swizzle);
+ state->alpha_dst_factor,
+ alpha_swizzle,
+ nr_channels);
result = lp_build_blend(&bld.base,
state->rgb_func,
rgb_alpha_same,
false);
- if(state->rgb_func != state->alpha_func && alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) {
+ if(state->rgb_func != state->alpha_func && nr_channels > 1 && alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) {
LLVMValueRef alpha;
alpha = lp_build_blend(&bld.base,
result,
alpha,
LP_BUILD_BLEND_SWIZZLE_RGBA,
- alpha_swizzle);
+ alpha_swizzle,
+ nr_channels);
}
}
/* Check if color mask is necessary */
- fullcolormask = util_format_colormask_full(util_format_description(cbuf_format[rt]), state->colormask);
-
- if (!fullcolormask) {
+ if (!util_format_colormask_full(desc, state->colormask)) {
LLVMValueRef color_mask;
- color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, state.colormask, desc->nr_channels, swizzle);
+ color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, state->colormask, nr_channels, swizzle);
lp_build_name(color_mask, "color_mask");
/* Combine with input mask if necessary */
if (mask) {
+ /* We can be blending floating values but masks are always integer... */
+ unsigned floating = bld.base.type.floating;
+ bld.base.type.floating = 0;
+
mask = lp_build_and(&bld.base, color_mask, mask);
+
+ bld.base.type.floating = floating;
} else {
mask = color_mask;
}
elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatTypeInContext(lc);
elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] =
elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32TypeInContext(lc);
- elem_types[LP_JIT_CTX_BLEND_COLOR] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0);
+ elem_types[LP_JIT_CTX_U8_BLEND_COLOR] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0);
+ elem_types[LP_JIT_CTX_F_BLEND_COLOR] = LLVMPointerType(LLVMFloatTypeInContext(lc), 0);
elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type,
PIPE_MAX_SAMPLERS);
LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_back,
gallivm->target, context_type,
LP_JIT_CTX_STENCIL_REF_BACK);
- LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
+ LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, u8_blend_color,
gallivm->target, context_type,
- LP_JIT_CTX_BLEND_COLOR);
+ LP_JIT_CTX_U8_BLEND_COLOR);
+ LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, f_blend_color,
+ gallivm->target, context_type,
+ LP_JIT_CTX_F_BLEND_COLOR);
LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
gallivm->target, context_type,
LP_JIT_CTX_TEXTURES);
uint32_t stencil_ref_front, stencil_ref_back;
- /* FIXME: store (also?) in floats */
- uint8_t *blend_color;
+ uint8_t *u8_blend_color;
+ float *f_blend_color;
struct lp_jit_texture textures[PIPE_MAX_SAMPLERS];
};
LP_JIT_CTX_ALPHA_REF,
LP_JIT_CTX_STENCIL_REF_FRONT,
LP_JIT_CTX_STENCIL_REF_BACK,
- LP_JIT_CTX_BLEND_COLOR,
+ LP_JIT_CTX_U8_BLEND_COLOR,
+ LP_JIT_CTX_F_BLEND_COLOR,
LP_JIT_CTX_TEXTURES,
LP_JIT_CTX_COUNT
};
#define lp_jit_context_stencil_ref_back_value(_gallivm, _ptr) \
lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_STENCIL_REF_BACK, "stencil_ref_back")
-#define lp_jit_context_blend_color(_gallivm, _ptr) \
- lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_BLEND_COLOR, "blend_color")
+#define lp_jit_context_u8_blend_color(_gallivm, _ptr) \
+ lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_U8_BLEND_COLOR, "u8_blend_color")
+
+#define lp_jit_context_f_blend_color(_gallivm, _ptr) \
+ lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_F_BLEND_COLOR, "f_blend_color")
#define lp_jit_context_textures(_gallivm, _ptr) \
lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_TEXTURES, "textures")
+/**
+ * typedef for fragment shader function
+ *
+ * @param context jit context
+ * @param x block start x
+ * @param y block start y
+ * @param facing is front facing
+ * @param a0 shader input a0
+ * @param dadx shader input dadx
+ * @param dady shader input dady
+ * @param color color buffer
+ * @param depth depth buffer
+ * @param mask mask of visible pixels in block
+ * @param thread_data task thread data
+ * @param stride color buffer row stride in bytes
+ */
typedef void
(*lp_jit_frag_func)(const struct lp_jit_context *context,
uint32_t x,
uint8_t **color,
void *depth,
uint32_t mask,
- uint32_t *counter);
+ uint32_t *counter,
+ unsigned *stride);
void
const union lp_rast_cmd_arg arg)
{
const struct lp_scene *scene = task->scene;
- const uint8_t *clear_color = arg.clear_color;
+ uint8_t clear_color[4];
unsigned i;
+ boolean gray;
- LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__,
+ for (i = 0; i < 4; ++i) {
+ clear_color[i] = float_to_ubyte(arg.clear_color[i]);
+ }
+
+ LP_DBG(DEBUG_RAST, "%s 0x%x,0x%x,0x%x,0x%x\n", __FUNCTION__,
clear_color[0],
clear_color[1],
clear_color[2],
clear_color[3]);
- if (clear_color[0] == clear_color[1] &&
- clear_color[1] == clear_color[2] &&
- clear_color[2] == clear_color[3]) {
- /* clear to grayscale value {x, x, x, x} */
- for (i = 0; i < scene->fb.nr_cbufs; i++) {
- uint8_t *ptr =
- lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
- memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4);
- }
- }
- else {
- /* Non-gray color.
- * Note: if the swizzled tile layout changes (see TILE_PIXEL) this code
- * will need to change. It'll be pretty obvious when clearing no longer
- * works.
- */
- const unsigned chunk = TILE_SIZE / 4;
- for (i = 0; i < scene->fb.nr_cbufs; i++) {
- uint8_t *c =
- lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
+ gray =
+ clear_color[0] == clear_color[1] &&
+ clear_color[1] == clear_color[2] &&
+ clear_color[2] == clear_color[3];
+
+ for (i = 0; i < scene->fb.nr_cbufs; i++) {
+      if (scene->cbufs[i].unswizzled) {
+         union util_color uc;
+
+ util_pack_color(arg.clear_color,
+ scene->fb.cbufs[i]->format, &uc);
+
+ util_fill_rect(scene->cbufs[i].map,
+ scene->fb.cbufs[i]->format,
+ scene->cbufs[i].stride,
+ task->x,
+ task->y,
+ TILE_SIZE,
+ TILE_SIZE,
+ &uc);
+ } else {
+ const unsigned chunk = TILE_SIZE / 4;
+ uint8_t *ptr;
unsigned j;
- for (j = 0; j < 4 * TILE_SIZE; j++) {
- memset(c, clear_color[0], chunk);
- c += chunk;
- memset(c, clear_color[1], chunk);
- c += chunk;
- memset(c, clear_color[2], chunk);
- c += chunk;
- memset(c, clear_color[3], chunk);
- c += chunk;
+ ptr = lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
+
+ if (gray) {
+ /* clear to grayscale value {x, x, x, x} */
+
+ memset(ptr, clear_color[0], TILE_SIZE * TILE_SIZE * 4);
+ } else {
+ /* Non-gray color.
+ * Note: if the swizzled tile layout changes (see TILE_PIXEL) this code
+ * will need to change. It'll be pretty obvious when clearing no longer
+ * works.
+ */
+
+ for (j = 0; j < 4 * TILE_SIZE; j++) {
+ memset(ptr, clear_color[0], chunk);
+ ptr += chunk;
+ memset(ptr, clear_color[1], chunk);
+ ptr += chunk;
+ memset(ptr, clear_color[2], chunk);
+ ptr += chunk;
+ memset(ptr, clear_color[3], chunk);
+ ptr += chunk;
+ }
}
}
}
const unsigned level = cbuf->u.tex.level;
struct llvmpipe_resource *lpt = llvmpipe_resource(cbuf->texture);
- if (!task->color_tiles[buf])
+ if (scene->cbufs[buf].unswizzled || !task->color_tiles[buf])
continue;
llvmpipe_unswizzle_cbuf_tile(lpt,
for (y = 0; y < TILE_SIZE; y += 4){
for (x = 0; x < TILE_SIZE; x += 4) {
uint8_t *color[PIPE_MAX_COLOR_BUFS];
+ unsigned stride[PIPE_MAX_COLOR_BUFS];
uint32_t *depth;
unsigned i;
/* color buffer */
- for (i = 0; i < scene->fb.nr_cbufs; i++)
- color[i] = lp_rast_get_color_block_pointer(task, i,
- tile_x + x, tile_y + y);
+         for (i = 0; i < scene->fb.nr_cbufs; i++) {
+ stride[i] = scene->cbufs[i].stride;
+
+ if (scene->cbufs[i].unswizzled) {
+ color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, tile_x + x, tile_y + y);
+ } else {
+ color[i] = lp_rast_get_color_block_pointer(task, i, tile_x + x, tile_y + y);
+ }
+ }
/* depth buffer */
depth = lp_rast_get_depth_block_pointer(task, tile_x + x, tile_y + y);
color,
depth,
0xffff,
- &task->vis_counter);
+ &task->vis_counter,
+ stride);
END_JIT_CALL();
}
}
/* this will prevent converting the layout from tiled to linear */
for (i = 0; i < scene->fb.nr_cbufs; i++) {
- (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
+ if (!scene->cbufs[i].unswizzled) {
+ (void)lp_rast_get_color_tile_pointer(task, i, LP_TEX_USAGE_WRITE_ALL);
+ }
}
lp_rast_shade_tile(task, arg);
struct lp_fragment_shader_variant *variant = state->variant;
const struct lp_scene *scene = task->scene;
uint8_t *color[PIPE_MAX_COLOR_BUFS];
+ unsigned stride[PIPE_MAX_COLOR_BUFS];
void *depth;
unsigned i;
/* color buffer */
for (i = 0; i < scene->fb.nr_cbufs; i++) {
- color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
- assert(lp_check_alignment(color[i], 16));
+ stride[i] = scene->cbufs[i].stride;
+
+ if (scene->cbufs[i].unswizzled) {
+ color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, x, y);
+ } else {
+ color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
+ }
}
/* depth buffer */
depth = lp_rast_get_depth_block_pointer(task, x, y);
- assert(lp_check_alignment(state->jit_context.blend_color, 16));
+ assert(lp_check_alignment(state->jit_context.u8_blend_color, 16));
/* run shader on 4x4 block */
BEGIN_JIT_CALL(state, task);
color,
depth,
mask,
- &task->vis_counter);
+ &task->vis_counter,
+ stride);
END_JIT_CALL();
}
unsigned plane_mask;
} triangle;
const struct lp_rast_state *set_state;
- uint8_t clear_color[4];
+ float clear_color[4];
struct {
uint32_t value;
uint32_t mask;
assert(task->x % TILE_SIZE == 0);
assert(task->y % TILE_SIZE == 0);
assert(buf < scene->fb.nr_cbufs);
+ assert(scene->cbufs[buf].unswizzled == 0);
if (!task->color_tiles[buf]) {
struct pipe_surface *cbuf = scene->fb.cbufs[buf];
}
+/**
+ * Get pointer to the unswizzled color tile
+ */
+static INLINE uint8_t *
+lp_rast_get_unswizzled_color_tile_pointer(struct lp_rasterizer_task *task,
+ unsigned buf, enum lp_texture_usage usage)
+{
+ const struct lp_scene *scene = task->scene;
+ unsigned format_bytes;
+
+ assert(task->x < scene->tiles_x * TILE_SIZE);
+ assert(task->y < scene->tiles_y * TILE_SIZE);
+ assert(task->x % TILE_SIZE == 0);
+ assert(task->y % TILE_SIZE == 0);
+ assert(buf < scene->fb.nr_cbufs);
+ assert(scene->cbufs[buf].unswizzled);
+
+ if (!task->color_tiles[buf]) {
+ struct pipe_surface *cbuf = scene->fb.cbufs[buf];
+ assert(cbuf);
+
+ format_bytes = util_format_description(cbuf->format)->block.bits / 8;
+      task->color_tiles[buf] = scene->cbufs[buf].map +
+                               scene->cbufs[buf].stride * task->y +
+                               format_bytes * task->x;
+ }
+
+ return task->color_tiles[buf];
+}
+
+
/**
* Get the pointer to a 4x4 color block (within a 64x64 tile).
* We'll map the color buffer on demand here.
assert(y < task->scene->tiles_y * TILE_SIZE);
assert((x % TILE_VECTOR_WIDTH) == 0);
assert((y % TILE_VECTOR_HEIGHT) == 0);
+ assert(buf < task->scene->fb.nr_cbufs);
+ assert(task->scene->cbufs[buf].unswizzled == 0);
color = lp_rast_get_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE);
assert(color);
}
+/**
+ * Get the pointer to an unswizzled 4x4 color block (within an unswizzled 64x64 tile).
+ * \param x, y location of 4x4 block in window coords
+ */
+static INLINE uint8_t *
+lp_rast_get_unswizzled_color_block_pointer(struct lp_rasterizer_task *task,
+ unsigned buf, unsigned x, unsigned y)
+{
+ unsigned px, py, pixel_offset, format_bytes;
+ uint8_t *color;
+
+ assert(x < task->scene->tiles_x * TILE_SIZE);
+ assert(y < task->scene->tiles_y * TILE_SIZE);
+ assert((x % TILE_VECTOR_WIDTH) == 0);
+ assert((y % TILE_VECTOR_HEIGHT) == 0);
+ assert(buf < task->scene->fb.nr_cbufs);
+ assert(task->scene->cbufs[buf].unswizzled);
+
+ format_bytes = util_format_description(task->scene->fb.cbufs[buf]->format)->block.bits / 8;
+
+ color = lp_rast_get_unswizzled_color_tile_pointer(task, buf, LP_TEX_USAGE_READ_WRITE);
+ assert(color);
+
+ px = x % TILE_SIZE;
+ py = y % TILE_SIZE;
+ pixel_offset = px * format_bytes + py * task->scene->cbufs[buf].stride;
+
+ color = color + pixel_offset;
+
+ assert(lp_check_alignment(color, llvmpipe_get_format_alignment(task->scene->fb.cbufs[buf]->format)));
+ return color;
+}
+
+
/**
* Shade all pixels in a 4x4 block. The fragment code omits the
const struct lp_rast_state *state = task->state;
struct lp_fragment_shader_variant *variant = state->variant;
uint8_t *color[PIPE_MAX_COLOR_BUFS];
+ unsigned stride[PIPE_MAX_COLOR_BUFS];
void *depth;
unsigned i;
/* color buffer */
- for (i = 0; i < scene->fb.nr_cbufs; i++)
- color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
+ for (i = 0; i < scene->fb.nr_cbufs; i++) {
+ stride[i] = scene->cbufs[i].stride;
+
+ if (scene->cbufs[i].unswizzled) {
+ color[i] = lp_rast_get_unswizzled_color_block_pointer(task, i, x, y);
+ } else {
+ color[i] = lp_rast_get_color_block_pointer(task, i, x, y);
+ }
+ }
depth = lp_rast_get_depth_block_pointer(task, x, y);
color,
depth,
0xffff,
- &task->vis_counter );
+ &task->vis_counter,
+ stride );
END_JIT_CALL();
}
cbuf->u.tex.first_layer,
LP_TEX_USAGE_READ_WRITE,
LP_TEX_LAYOUT_LINEAR);
+
+ scene->cbufs[i].unswizzled = llvmpipe_is_format_unswizzled(cbuf->format);
}
if (fb->zsbuf) {
uint8_t *map;
unsigned stride;
unsigned blocksize;
+ unsigned unswizzled;
} zsbuf, cbufs[PIPE_MAX_COLOR_BUFS];
/** the framebuffer to render the scene into */
if (flags & PIPE_CLEAR_COLOR) {
for (i = 0; i < 4; i++)
- color_arg.clear_color[i] = float_to_ubyte(color[i]);
+ color_arg.clear_color[i] = color[i];
}
if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
if(setup->dirty & LP_SETUP_NEW_BLEND_COLOR) {
uint8_t *stored;
+ float* fstored;
unsigned i, j;
+ unsigned size;
+
-      stored = lp_scene_alloc_aligned(scene, 4 * 16, 16);
+      /* Alloc u8_blend_color (4 x 16 x u8) and f_blend_color (4 or 8 x f32) */
+      size = 4 * 16 * sizeof(uint8_t);
+      size += (LP_MAX_VECTOR_LENGTH / 4) * sizeof(float);
+      stored = lp_scene_alloc_aligned(scene, size, LP_MAX_VECTOR_LENGTH);
if (!stored) {
assert(!new_scene);
return FALSE;
}
+ /* Store floating point colour */
+ fstored = (float*)(stored + 4*16);
+ for (i = 0; i < (LP_MAX_VECTOR_LENGTH / 4); ++i) {
+ fstored[i] = setup->blend_color.current.color[i % 4];
+ }
+
/* smear each blend color component across 16 ubyte elements */
for (i = 0; i < 4; ++i) {
uint8_t c = float_to_ubyte(setup->blend_color.current.color[i]);
}
setup->blend_color.stored = stored;
- setup->fs.current.jit_context.blend_color = setup->blend_color.stored;
+ setup->fs.current.jit_context.u8_blend_color = stored;
+ setup->fs.current.jit_context.f_blend_color = fstored;
setup->dirty |= LP_SETUP_NEW_FS;
}
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_pack.h"
+#include "gallivm/lp_bld_format.h"
+#include "gallivm/lp_bld_quad.h"
#include "lp_bld_alpha.h"
#include "lp_bld_blend.h"
vec_type = lp_build_vec_type(gallivm, type);
- const_ptr = lp_jit_context_blend_color(gallivm, context_ptr);
+ const_ptr = lp_jit_context_u8_blend_color(gallivm, context_ptr);
const_ptr = LLVMBuildBitCast(builder, const_ptr,
LLVMPointerType(vec_type, 0), "");
}
+/**
+ * This function will reorder pixels from the fragment shader SoA layout
+ * to the memory AoS layout.
+ *
+ * Fragment Shader outputs pixels in small 2x2 blocks
+ * e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
+ *
+ * However in memory pixels are stored in rows
+ * e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
+ *
+ * @param type fragment shader type (4x or 8x float)
+ * @param num_fs number of fs_src
+ * @param dst_channels number of output channels
+ * @param fs_src output from fragment shader
+ * @param dst pointer to store result
+ * @param pad_inline is channel padding inline or at end of row
+ * @return the number of dsts
+ */
+static int
+generate_fs_twiddle(struct gallivm_state *gallivm,
+ struct lp_type type,
+ unsigned num_fs,
+ unsigned dst_channels,
+ LLVMValueRef fs_src[][4],
+ LLVMValueRef* dst,
+ bool pad_inline)
+{
+ LLVMValueRef src[16];
+
+ bool swizzle_pad;
+ bool twiddle;
+ bool split;
+
+ unsigned pixels = num_fs == 4 ? 1 : 2;
+ unsigned reorder_group;
+ unsigned src_channels;
+ unsigned src_count;
+ unsigned i;
+
+ src_channels = dst_channels < 3 ? dst_channels : 4;
+ src_count = num_fs * src_channels;
+
+ assert(pixels == 2 || num_fs == 4);
+ assert(num_fs * src_channels <= Elements(src));
+
+ /*
+ * Transpose from SoA -> AoS
+ */
+ for (i = 0; i < num_fs; ++i) {
+ lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]);
+ }
+
+ /*
+ * Pick transformation options
+ */
+ swizzle_pad = false;
+ twiddle = false;
+ split = false;
+ reorder_group = 0;
+
+ if (dst_channels == 1) {
+ twiddle = true;
+
+ if (pixels == 2) {
+ split = true;
+ }
+ } else if (dst_channels == 2) {
+ if (pixels == 1) {
+ reorder_group = 1;
+ }
+ } else if (dst_channels > 2) {
+ if (pixels == 1) {
+ reorder_group = 2;
+ } else {
+ twiddle = true;
+ }
+
+ if (!pad_inline && dst_channels == 3 && pixels > 1) {
+ swizzle_pad = true;
+ }
+ }
+
+ /*
+ * Split the src in half
+ */
+ if (split) {
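+      /* Walk backwards so each source vector is read before the in-place
+       * expansion overwrites it.
+       */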
+ for (i = num_fs; i > 0; --i) {
+ src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
+ src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
+ }
+
+ src_count *= 2;
+ type.length = 4;
+ }
+
+ /*
+ * Ensure pixels are in memory order
+ */
+ if (reorder_group) {
+ /* Twiddle pixels by reordering the array, e.g.:
+ *
+ * src_count = 8 -> 0 2 1 3 4 6 5 7
+ * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
+ */
+ const unsigned reorder_sw[] = { 0, 2, 1, 3 };
+
+ for (i = 0; i < src_count; ++i) {
+ unsigned group = i / reorder_group;
+ unsigned block = (group / 4) * 4 * reorder_group;
+ unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
+ dst[i] = src[j];
+ }
+ } else if (twiddle) {
+ /* Twiddle pixels across elements of array */
+ lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
+ } else {
+ /* Do nothing */
+ memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
+ }
+
+ /*
+ * Moves any padding between pixels to the end
+ * e.g. RGBXRGBX -> RGBRGBXX
+ */
+ if (swizzle_pad) {
+ unsigned char swizzles[16];
+ unsigned elems = pixels * dst_channels;
+
+ for (i = 0; i < type.length; ++i) {
+ if (i < elems)
+ swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
+ else
+ swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
+ }
+
+ for (i = 0; i < src_count; ++i) {
+ dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length);
+ }
+ }
+
+ return src_count;
+}
+
+
+/**
+ * Load an unswizzled block of pixels from memory
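+ *
+ * e.g. a 4x4 block of 32-bit pixels with dst_count == 4 loads one row
+ * of 4 pixels into each dst vector.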
+ */
+static void
+load_unswizzled_block(struct gallivm_state *gallivm,
+ LLVMValueRef base_ptr,
+ LLVMValueRef stride,
+ unsigned block_width,
+ unsigned block_height,
+ LLVMValueRef* dst,
+ struct lp_type dst_type,
+ unsigned dst_count)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ unsigned row_size = dst_count / block_height;
+ unsigned i;
+
+ /* Ensure block exactly fits into dst */
+ assert((block_width * block_height) % dst_count == 0);
+
+ for (i = 0; i < dst_count; ++i) {
+ unsigned x = i % row_size;
+ unsigned y = i / row_size;
+
+ LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
+ LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
+
+ LLVMValueRef gep[2];
+ LLVMValueRef dst_ptr;
+
+ gep[0] = lp_build_const_int32(gallivm, 0);
+ gep[1] = LLVMBuildAdd(builder, bx, by, "");
+
+ dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
+ dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
+
+ dst[i] = LLVMBuildLoad(builder, dst_ptr, "");
+
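+      /* Vectors with a multiple-of-3 length (e.g. 3 x i8) are never
+       * naturally aligned, so force per-element alignment on the load.
+       */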
+ if ((dst_type.length % 3) == 0) {
+ lp_set_load_alignment(dst[i], dst_type.width / 8);
+ }
+ }
+}
+
+
+/**
+ * Store an unswizzled block of pixels to memory
+ */
+static void
+store_unswizzled_block(struct gallivm_state *gallivm,
+ LLVMValueRef base_ptr,
+ LLVMValueRef stride,
+ unsigned block_width,
+ unsigned block_height,
+ LLVMValueRef* src,
+ struct lp_type src_type,
+ unsigned src_count)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ unsigned row_size = src_count / block_height;
+ unsigned i;
+
+ /* Ensure src exactly fits into block */
+ assert((block_width * block_height) % src_count == 0);
+
+ for (i = 0; i < src_count; ++i) {
+ unsigned x = i % row_size;
+ unsigned y = i / row_size;
+
+ LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
+ LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
+
+ LLVMValueRef gep[2];
+ LLVMValueRef src_ptr;
+
+ gep[0] = lp_build_const_int32(gallivm, 0);
+ gep[1] = LLVMBuildAdd(builder, bx, by, "");
+
+ src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
+ src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
+
+ src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
+
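+      /* As with loads, multiple-of-3 length vectors are not naturally
+       * aligned, so force per-element alignment on the store.
+       */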
+ if ((src_type.length % 3) == 0) {
+ lp_set_store_alignment(src_ptr, src_type.width / 8);
+ }
+ }
+}
+
+
+/**
+ * Checks if a format description is an arithmetic format
+ *
+ * i.e. a format with irregular channel sizes, such as R3_G3_B2 or R5_G6_B5,
+ * or channels that are not a whole number of bytes.
+ */
+static INLINE boolean
+is_arithmetic_format(const struct util_format_description *format_desc)
+{
+ boolean arith = false;
+ unsigned i;
+
+ for (i = 0; i < format_desc->nr_channels; ++i) {
+ arith |= format_desc->channel[i].size != format_desc->channel[0].size;
+ arith |= (format_desc->channel[i].size % 8) != 0;
+ }
+
+ return arith;
+}
+
+
+/**
+ * Retrieves the type representing the memory layout for a format
+ *
+ * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
+ */
+static INLINE void
+lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
+ struct lp_type* type)
+{
+ int i;
+
+ memset(type, 0, sizeof(struct lp_type));
+ type->floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+ type->fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+ type->sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+ type->norm = format_desc->channel[0].normalized;
+
+ if (is_arithmetic_format(format_desc)) {
+ type->width = 0;
+ type->length = 1;
+
+ for (i = 0; i < format_desc->nr_channels; ++i) {
+ type->width += format_desc->channel[i].size;
+ }
+ } else {
+ type->width = format_desc->channel[0].size;
+ type->length = format_desc->nr_channels;
+ }
+}
+
+
+/**
+ * Retrieves the type for a format which is usable in the blending code.
+ *
+ * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte (padded to 4 channels)
+ */
+static INLINE void
+lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
+ struct lp_type* type)
+{
+ int i;
+
+ memset(type, 0, sizeof(struct lp_type));
+ type->floating = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT;
+ type->fixed = format_desc->channel[0].type == UTIL_FORMAT_TYPE_FIXED;
+ type->sign = format_desc->channel[0].type != UTIL_FORMAT_TYPE_UNSIGNED;
+ type->norm = format_desc->channel[0].normalized;
+ type->width = format_desc->channel[0].size;
+ type->length = format_desc->nr_channels;
+
+ for (i = 1; i < format_desc->nr_channels; ++i) {
+ if (format_desc->channel[i].size > type->width)
+ type->width = format_desc->channel[i].size;
+ }
+
+ if (type->floating) {
+ type->width = 32;
+ } else {
+ if (type->width <= 8) {
+ type->width = 8;
+ } else if (type->width <= 16) {
+ type->width = 16;
+ } else {
+ type->width = 32;
+ }
+ }
+
+ if (is_arithmetic_format(format_desc) && type->length == 3) {
+ type->length = 4;
+ }
+}
+
+
+/**
+ * Scale a normalised value from src_bits to dst_bits
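+ *
+ * e.g. scaling a 5-bit value up to 8 bits: (x << 3) | (x >> 2),
+ * replicating the top source bits into the low result bits.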
+ */
+static INLINE LLVMValueRef
+scale_bits(struct gallivm_state *gallivm,
+ int src_bits,
+ int dst_bits,
+ LLVMValueRef src,
+ struct lp_type src_type)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef result = src;
+
+ if (dst_bits < src_bits) {
+ /* Scale down by LShr */
+ result = LLVMBuildLShr(builder,
+ src,
+ lp_build_const_int_vec(gallivm, src_type, src_bits - dst_bits),
+ "");
+ } else if (dst_bits > src_bits) {
+ /* Scale up bits */
+ int db = dst_bits - src_bits;
+
+ /* Shift left by difference in bits */
+ result = LLVMBuildShl(builder,
+ src,
+ lp_build_const_int_vec(gallivm, src_type, db),
+ "");
+
+ if (db < src_bits) {
+ /* Enough bits in src to fill the remainder */
+ LLVMValueRef lower = LLVMBuildLShr(builder,
+ src,
+ lp_build_const_int_vec(gallivm, src_type, src_bits - db),
+ "");
+
+ result = LLVMBuildOr(builder, result, lower, "");
+ } else if (db > src_bits) {
+      /* Need to repeatedly copy src bits to fill remainder in dst */
+ unsigned n;
+
+ for (n = src_bits; n < dst_bits; n *= 2) {
+ LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
+
+ result = LLVMBuildOr(builder,
+ result,
+ LLVMBuildLShr(builder, result, shuv, ""),
+ "");
+ }
+ }
+ }
+
+ return result;
+}
+
+
+/**
+ * Convert from memory format to blending format
+ *
+ * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
+ */
+static void
+convert_to_blend_type(struct gallivm_state *gallivm,
+ const struct util_format_description *src_fmt,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ LLVMValueRef* src,
+ unsigned num_srcs,
+ LLVMValueRef* dst)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_type blend_type;
+ struct lp_type mem_type;
+ unsigned i, j, k;
+ unsigned pixels = 16 / num_srcs;
+ bool is_arith;
+
+   memcpy(dst, src, sizeof(LLVMValueRef) * num_srcs);
+
+ lp_mem_type_from_format_desc(src_fmt, &mem_type);
+ lp_blend_type_from_format_desc(src_fmt, &blend_type);
+
+   /* The format is arithmetic if the blend and memory layouts differ in
+    * total bit size; half-floats are handled by conversion below instead.
+    */
+ is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
+ is_arith &= !(mem_type.width == 16 && mem_type.floating);
+
+ /* Pad if necessary */
+ if (!is_arith && src_type.length < dst_type.length) {
+ for (i = 0; i < num_srcs; ++i) {
+ dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
+ }
+
+ src_type.length = dst_type.length;
+ }
+
+ /* Special case for half-floats */
+ if (mem_type.width == 16 && mem_type.floating) {
+ assert(blend_type.width == 32 && blend_type.floating);
+ lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
+ is_arith = false;
+ }
+
+ if (!is_arith) {
+ return;
+ }
+
+ src_type.width = blend_type.width * blend_type.length;
+ blend_type.length *= pixels;
+ src_type.length *= pixels / (src_type.length / mem_type.length);
+
+ for (i = 0; i < num_srcs; ++i) {
+ LLVMValueRef chans[4];
+ LLVMValueRef res;
+ unsigned sa = 0;
+
+ dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
+
+ for (j = 0; j < src_fmt->nr_channels; ++j) {
+ unsigned mask = 0;
+
+ for (k = 0; k < src_fmt->channel[j].size; ++k) {
+ mask |= 1 << k;
+ }
+
+ /* Extract bits from source */
+ chans[j] = LLVMBuildLShr(builder,
+ dst[i],
+ lp_build_const_int_vec(gallivm, src_type, sa),
+ "");
+
+ chans[j] = LLVMBuildAnd(builder,
+ chans[j],
+ lp_build_const_int_vec(gallivm, src_type, mask),
+ "");
+
+ /* Scale bits */
+ chans[j] = scale_bits(gallivm, src_fmt->channel[j].size, blend_type.width, chans[j], src_type);
+
+ /* Insert bits into correct position */
+ chans[j] = LLVMBuildShl(builder,
+ chans[j],
+ lp_build_const_int_vec(gallivm, src_type, j * blend_type.width),
+ "");
+
+ sa += src_fmt->channel[j].size;
+
+ if (j == 0) {
+ res = chans[j];
+ } else {
+ res = LLVMBuildOr(builder, res, chans[j], "");
+ }
+ }
+
+ dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
+ }
+}
+
+
+/**
+ * Convert from blending format to memory format
+ *
+ * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
+ */
+static void
+convert_from_blend_type(struct gallivm_state *gallivm,
+ const struct util_format_description *src_fmt,
+ struct lp_type src_type,
+ struct lp_type dst_type,
+ LLVMValueRef* src,
+ unsigned num_srcs,
+ LLVMValueRef* dst)
+{
+ unsigned i, j, k;
+ struct lp_type mem_type;
+ struct lp_type blend_type;
+ LLVMBuilderRef builder = gallivm->builder;
+ unsigned pixels = 16 / num_srcs;
+ bool is_arith;
+
+   memcpy(dst, src, sizeof(LLVMValueRef) * num_srcs);
+
+ lp_mem_type_from_format_desc(src_fmt, &mem_type);
+ lp_blend_type_from_format_desc(src_fmt, &blend_type);
+
+ is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
+
+ /* Special case for half-floats */
+ if (mem_type.width == 16 && mem_type.floating) {
+ int length = dst_type.length;
+ assert(blend_type.width == 32 && blend_type.floating);
+
+ dst_type.length = src_type.length;
+
+ lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
+
+ dst_type.length = length;
+ is_arith = false;
+ }
+
+ /* Remove any padding */
+ if (!is_arith && (src_type.length % mem_type.length)) {
+ src_type.length -= (src_type.length % mem_type.length);
+
+ for (i = 0; i < num_srcs; ++i) {
+ dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
+ }
+ }
+
+   /* No bit arithmetic to do */
+ if (!is_arith) {
+ return;
+ }
+
+ src_type.length = pixels;
+ src_type.width = blend_type.length * blend_type.width;
+ dst_type.length = pixels;
+
+ for (i = 0; i < num_srcs; ++i) {
+ LLVMValueRef chans[4];
+ LLVMValueRef res;
+ unsigned sa = 0;
+
+ dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
+
+ for (j = 0; j < src_fmt->nr_channels; ++j) {
+ unsigned mask = 0;
+
+ assert(blend_type.width > src_fmt->channel[j].size);
+
+ for (k = 0; k < blend_type.width; ++k) {
+ mask |= 1 << k;
+ }
+
+ /* Extract bits */
+ chans[j] = LLVMBuildLShr(builder,
+ dst[i],
+ lp_build_const_int_vec(gallivm, src_type, j * blend_type.width),
+ "");
+
+ chans[j] = LLVMBuildAnd(builder,
+ chans[j],
+ lp_build_const_int_vec(gallivm, src_type, mask),
+ "");
+
+ /* Scale down bits */
+ chans[j] = scale_bits(gallivm, blend_type.width, src_fmt->channel[j].size, chans[j], src_type);
+
+ /* Insert bits */
+ chans[j] = LLVMBuildShl(builder,
+ chans[j],
+ lp_build_const_int_vec(gallivm, src_type, sa),
+ "");
+
+ sa += src_fmt->channel[j].size;
+
+ if (j == 0) {
+ res = chans[j];
+ } else {
+ res = LLVMBuildOr(builder, res, chans[j], "");
+ }
+ }
+
+   assert(dst_type.width != 24);
+
+ dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
+ }
+}
+
+
+/**
+ * Generates the blend function for unswizzled colour buffers.
+ * Also generates the read & write from the colour buffer.
+ */
+static void
+generate_unswizzled_blend(struct gallivm_state *gallivm,
+ unsigned rt,
+ struct lp_fragment_shader_variant *variant,
+ enum pipe_format out_format,
+ unsigned int num_fs,
+ struct lp_type fs_type,
+ LLVMValueRef* fs_mask,
+ LLVMValueRef fs_out_color[TGSI_NUM_CHANNELS][4],
+ LLVMValueRef context_ptr,
+ LLVMValueRef color_ptr,
+ LLVMValueRef stride,
+ unsigned partial_mask,
+ boolean do_branch)
+{
+ const unsigned alpha_channel = 3;
+ const unsigned block_width = 4;
+ const unsigned block_height = 4;
+ const unsigned block_size = block_width * block_height;
+ const unsigned lp_integer_vector_width = 128;
+
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
+ LLVMValueRef src_alpha[block_size];
+ LLVMValueRef src_mask[block_size];
+ LLVMValueRef src[block_size];
+ LLVMValueRef dst[block_size];
+ LLVMValueRef blend_color;
+ LLVMValueRef blend_alpha;
+ LLVMValueRef i32_zero;
+ LLVMValueRef check_mask;
+
+ struct lp_build_mask_context mask_ctx;
+ struct lp_type mask_type;
+ struct lp_type blend_type;
+ struct lp_type alpha_type;
+ struct lp_type row_type;
+ struct lp_type dst_type;
+
+ unsigned char swizzle[TGSI_NUM_CHANNELS];
+ unsigned vector_width;
+ unsigned dst_channels;
+ unsigned src_channels;
+ unsigned dst_count;
+ unsigned src_count;
+ unsigned i, j;
+
+ const struct util_format_description* out_format_desc = util_format_description(out_format);
+
+ bool pad_inline = is_arithmetic_format(out_format_desc);
+ bool has_alpha = false;
+
+ src_channels = TGSI_NUM_CHANNELS;
+ mask_type = lp_int32_vec4_type();
+ mask_type.length = fs_type.length;
+
+   /* Do not bother executing code when the mask is empty. */
+ if (do_branch) {
+ check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
+
+ for (i = 0; i < num_fs; ++i) {
+ check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
+ }
+
+ lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
+ lp_build_mask_check(&mask_ctx);
+ }
+
+ partial_mask |= !variant->opaque;
+ i32_zero = lp_build_const_int32(gallivm, 0);
+
+ /* Get type from output format */
+ lp_blend_type_from_format_desc(out_format_desc, &row_type);
+ lp_mem_type_from_format_desc(out_format_desc, &dst_type);
+
+ row_type.length = fs_type.length;
+ vector_width = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
+
+ /* Compute correct swizzle and count channels */
+ memset(swizzle, 0xFF, TGSI_NUM_CHANNELS);
+ dst_channels = 0;
+
+ for (i = 0; i < TGSI_NUM_CHANNELS; ++i) {
+ /* Ensure channel is used */
+ if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
+ continue;
+ }
+
+      /* Ensure not already written to (happens e.g. with GL_ALPHA) */
+ if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
+ continue;
+ }
+
+      /* Ensure we haven't already found all channels */
+ if (dst_channels >= out_format_desc->nr_channels) {
+ continue;
+ }
+
+ swizzle[out_format_desc->swizzle[i]] = i;
+ ++dst_channels;
+
+ if (i == alpha_channel) {
+ has_alpha = true;
+ }
+ }
+
+ /* If 3 channels then pad to include alpha for 4 element transpose */
+ if (dst_channels == 3 && !has_alpha) {
+ swizzle[3] = 3;
+
+ if (out_format_desc->nr_channels == 4) {
+ dst_channels = 4;
+ }
+ }
+
+ /*
+ * Load shader output
+ */
+ for (i = 0; i < num_fs; ++i) {
+ /* Always load alpha for use in blending */
+ LLVMValueRef alpha = LLVMBuildLoad(builder, fs_out_color[alpha_channel][i], "");
+
+ /* Load each channel */
+ for (j = 0; j < dst_channels; ++j) {
+ fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[swizzle[j]][i], "");
+ }
+
+ /* If 3 channels then pad to include alpha for 4 element transpose */
+ if (dst_channels == 3 && !has_alpha) {
+ fs_src[i][3] = alpha;
+ swizzle[3] = 3;
+ }
+
+      /* We split the row_mask and row_alpha as we want 128-bit interleave */
+ if (fs_type.length == 8) {
+ src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels);
+ src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels);
+
+ src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
+ src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+ } else {
+ src_mask[i] = fs_mask[i];
+ src_alpha[i] = alpha;
+ }
+ }
+
+
+ /*
+ * Pixel twiddle from fragment shader order to memory order
+ */
+ src_count = generate_fs_twiddle(gallivm, fs_type, num_fs, dst_channels, fs_src, src, pad_inline);
+ src_channels = dst_channels < 3 ? dst_channels : 4;
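+
+   /* generate_fs_twiddle may have split 8-wide vectors into two 4-wide
+    * halves; if so, shrink the row and fs types to match.
+    */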
+ if (src_count != num_fs * src_channels) {
+ unsigned ds = src_count / (num_fs * src_channels);
+ row_type.length /= ds;
+ fs_type.length = row_type.length;
+ }
+
+ blend_type = row_type;
+ alpha_type = fs_type;
+ alpha_type.length = 4;
+ mask_type.length = 4;
+
+ /* Convert src to row_type */
+ src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
+
+ /* If the rows are not an SSE vector, combine them to become SSE size! */
+ if ((row_type.width * row_type.length) % 128) {
+ unsigned bits = row_type.width * row_type.length;
+ unsigned combined;
+
+ dst_count = src_count / (vector_width / bits);
+ combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count);
+
+ row_type.length *= combined;
+ src_count /= combined;
+
+ bits = row_type.width * row_type.length;
+ assert(bits == 128 || bits == 256);
+ }
+
+
+ /*
+ * Blend Colour conversion
+ */
+ blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
+ blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
+ blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), "");
+
+ /* Convert */
+ lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
+
+ /* Extract alpha */
+ blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3));
+
+ /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
+ pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width;
+ if (pad_inline) {
+ /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
+ blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length);
+ } else {
+ /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
+ blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length);
+ }
+
+ /*
+ * Mask conversion
+ */
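+ /* The masks are still in fragment shader quad order; twiddle them into
+ * memory row order to match the twiddled colours above.
+ */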
+ lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], 4, &src_mask[0]);
+
+ if (src_count < block_height) {
+ lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
+ } else if (src_count > block_height) {
+ for (i = src_count; i > 0; --i) {
+ unsigned pixels = block_size / src_count;
+ unsigned idx = i - 1;
+
+ src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4], (idx * pixels) % 4, pixels);
+ }
+ }
+
+ assert(mask_type.width == 32);
+
+ for (i = 0; i < src_count; ++i) {
+ unsigned pixels = block_size / src_count;
+ unsigned pixel_width = row_type.width * dst_channels;
+
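+ /* Presumably because there is no legal 24-bit integer vector type to
+ * cast to, 24bpp pixels fall back to an 8-bit per-component mask.
+ */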
+ if (pixel_width == 24) {
+ mask_type.width = 8;
+ mask_type.length = vector_width / mask_type.width;
+ } else {
+ mask_type.length = pixels;
+ mask_type.width = row_type.width * dst_channels;
+
+ src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+
+ mask_type.length *= dst_channels;
+ mask_type.width /= dst_channels;
+ }
+
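+ /* Reinterpret as one mask lane per component, then pad out to the full
+ * row width; the padded lanes carry no pixel data.
+ */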
+ src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+ src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
+ }
+
+ /*
+ * Alpha conversion
+ */
+ if (!has_alpha) {
+ unsigned length = row_type.length;
+ row_type.length = alpha_type.length;
+
+ /* Twiddle the alpha to match pixels */
+ lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, 4, src_alpha);
+
+ for (i = 0; i < 4; ++i) {
+ lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
+ }
+
+ alpha_type = row_type;
+ row_type.length = length;
+
+ /* If there is only one channel we need just the single alpha value per pixel */
+ if (src_count == 1) {
+ assert(dst_channels == 1);
+
+ lp_build_concat_n(gallivm, alpha_type, src_alpha, 4, src_alpha, src_count);
+ } else {
+ /* If there are more srcs than rows, we need to split the alpha up */
+ if (src_count > block_height) {
+ for (i = src_count; i > 0; --i) {
+ unsigned pixels = block_size / src_count;
+ unsigned idx = i - 1;
+
+ src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4], (idx * pixels) % 4, pixels);
+ }
+ }
+
+ /* If there is a src for each pixel, broadcast the alpha across the whole row */
+ if (src_count == block_size) {
+ for (i = 0; i < src_count; ++i) {
+ src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
+ }
+ } else {
+ unsigned pixels = block_size / src_count;
+ unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
+ unsigned alpha_span = 1;
+
+ /* Check if we need 2 src_alphas for our shuffles */
+ if (pixels > alpha_type.length) {
+ alpha_span = 2;
+ }
+
+ /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
+ for (i = 0; i < src_count; ++i) {
+ LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+ unsigned idx1 = i, idx2 = i;
+
+ if (alpha_span > 1) {
+ idx1 *= alpha_span;
+ idx2 = idx1 + 1;
+ }
+
+ for (j = 0; j < row_type.length; ++j) {
+ if (j < pixels * channels) {
+ shuffles[j] = lp_build_const_int32(gallivm, j / channels);
+ } else {
+ shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+ }
+ }
+
+ src_alpha[i] = LLVMBuildShuffleVector(builder,
+ src_alpha[idx1],
+ src_alpha[idx2],
+ LLVMConstVector(shuffles, row_type.length),
+ "");
+ }
+ }
+ }
+ }
+
+
+ /*
+ * Load dst from memory
+ */
+ if (src_count < block_height) {
+ dst_count = block_height;
+ } else {
+ dst_count = src_count;
+ }
+
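+ /* A 4x4 block holds 16 pixels; widen each dst vector so that dst_count
+ * vectors together cover the whole block.
+ */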
+ dst_type.length *= 16 / dst_count;
+
+ load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, dst, dst_type, dst_count);
+
+
+ /*
+ * Convert from dst/output format to src/blending format.
+ *
+ * This is necessary as we can only read 1 row from memory at a time,
+ * so the minimum dst_count at this point is 4.
+ *
+ * With the R8 format, for example, all 16 pixels fit in a single 128-bit
+ * vector, so this takes the 4 dsts and combines them into 1 src, letting
+ * us blend all 16 pixels in that one vector at once.
+ */
+ if (dst_count > src_count) {
+ lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count);
+ }
+
+ /*
+ * Blending
+ */
+ convert_to_blend_type(gallivm, out_format_desc, dst_type, row_type, dst, src_count, dst);
+
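+ /* Blend each row. A separate alpha is only passed when the packed
+ * colours omit it (!has_alpha), and the mask is omitted entirely for
+ * fully opaque runs.
+ */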
+ for (i = 0; i < src_count; ++i) {
+ dst[i] = lp_build_blend_aos(gallivm,
+ &variant->key.blend,
+ variant->key.cbuf_format,
+ row_type,
+ rt,
+ src[i],
+ has_alpha ? NULL : src_alpha[i],
+ dst[i],
+ partial_mask ? src_mask[i] : NULL,
+ blend_color,
+ has_alpha ? NULL : blend_alpha,
+ swizzle,
+ pad_inline ? 4 : dst_channels);
+ }
+
+ convert_from_blend_type(gallivm, out_format_desc, row_type, dst_type, dst, src_count, dst);
+
+ /* Split the blend rows back to memory rows */
+ if (dst_count > src_count) {
+ row_type.length = dst_type.length * (dst_count / src_count);
+
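+ /* Each split below halves the vectors and doubles src_count; a single
+ * wide row needs one extra halving before the final split to 4 rows.
+ */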
+ if (src_count == 1) {
+ dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
+ dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
+
+ row_type.length /= 2;
+ src_count *= 2;
+ }
+
+ dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2);
+ dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
+ dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2);
+ dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
+
+ row_type.length /= 2;
+ src_count *= 2;
+ }
+
+
+ /*
+ * Store blend result to memory
+ */
+ store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, dst, dst_type, dst_count);
+
+ if (do_branch) {
+ lp_build_mask_end(&mask_ctx);
+ }
+}
+
+
/**
* Generate the runtime callable function for the whole fragment pipeline.
* Note that the function which we generate operates on a block of 16
struct lp_type blend_type;
LLVMTypeRef fs_elem_type;
LLVMTypeRef blend_vec_type;
- LLVMTypeRef arg_types[11];
+ LLVMTypeRef arg_types[12];
LLVMTypeRef func_type;
LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
LLVMValueRef dadx_ptr;
LLVMValueRef dady_ptr;
LLVMValueRef color_ptr_ptr;
+ LLVMValueRef stride_ptr;
LLVMValueRef depth_ptr;
LLVMValueRef mask_input;
LLVMValueRef counter = NULL;
arg_types[8] = LLVMPointerType(int8_type, 0); /* depth */
arg_types[9] = int32_type; /* mask_input */
arg_types[10] = LLVMPointerType(int32_type, 0); /* counter */
+ arg_types[11] = LLVMPointerType(int32_type, 0); /* stride */
func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
arg_types, Elements(arg_types), 0);
color_ptr_ptr = LLVMGetParam(function, 7);
depth_ptr = LLVMGetParam(function, 8);
mask_input = LLVMGetParam(function, 9);
+ stride_ptr = LLVMGetParam(function, 11);
lp_build_name(context_ptr, "context");
lp_build_name(x, "x");
lp_build_name(color_ptr_ptr, "color_ptr_ptr");
lp_build_name(depth_ptr, "depth");
lp_build_name(mask_input, "mask_input");
+ lp_build_name(stride_ptr, "stride_ptr");
if (key->occlusion_count) {
counter = LLVMGetParam(function, 10);
LLVMValueRef color_ptr;
LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
LLVMValueRef blend_in_color[TGSI_NUM_CHANNELS];
- unsigned rt;
-
- /*
- * Convert the fs's output color and mask to fit to the blending type.
- */
- for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
- LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
-
- for (i = 0; i < num_fs; i++) {
- fs_color_vals[i] =
- LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
- }
-
- lp_build_conv(gallivm, fs_type, blend_type,
- fs_color_vals,
- num_fs,
- &blend_in_color[chan], 1);
+ unsigned rt = key->blend.independent_blend_enable ? cbuf : 0;
- lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
- }
-
- if (partial_mask || !variant->opaque) {
- lp_build_conv_mask(variant->gallivm, fs_type, blend_type,
- fs_mask, num_fs,
- &blend_mask, 1);
- } else {
- blend_mask = lp_build_const_int_vec(variant->gallivm, blend_type, ~0);
- }
+ boolean do_branch = ((key->depth.enabled
+ || key->stencil[0].enabled
+ || key->alpha.enabled)
+ && !shader->info.base.uses_kill);
- color_ptr = LLVMBuildLoad(builder,
+ color_ptr = LLVMBuildLoad(builder,
LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""),
"");
+
lp_build_name(color_ptr, "color_ptr%d", cbuf);
- /* which blend/colormask state to use */
- rt = key->blend.independent_blend_enable ? cbuf : 0;
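+ /* Use the unswizzled (linear layout) path when this cbuf supports it,
+ * otherwise fall back to the original swizzled blend below.
+ */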
+ if (variant->unswizzled_cbufs & (1 << cbuf)) {
+ LLVMValueRef stride = LLVMBuildLoad(builder,
+ LLVMBuildGEP(builder, stride_ptr, &index, 1, ""),
+ "");
- /*
- * Blending.
- */
- {
- /* Could the 4x4 have been killed?
+ generate_unswizzled_blend(gallivm, rt, variant, key->cbuf_format[cbuf],
+ num_fs, fs_type, fs_mask, fs_out_color[cbuf],
+ context_ptr, color_ptr, stride, partial_mask, do_branch);
+ } else {
+ /*
+ * Convert the fs's output color and mask to fit to the blending type.
*/
- boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) &&
- !key->alpha.enabled &&
- !shader->info.base.uses_kill);
+ for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+ LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
+
+ for (i = 0; i < num_fs; i++) {
+ fs_color_vals[i] =
+ LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
+ }
+
+ lp_build_conv(gallivm, fs_type, blend_type,
+ fs_color_vals,
+ num_fs,
+ &blend_in_color[chan], 1);
+
+ lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
+ }
+
+ if (partial_mask || !variant->opaque) {
+ lp_build_conv_mask(gallivm, fs_type, blend_type,
+ fs_mask, num_fs,
+ &blend_mask, 1);
+ } else {
+ blend_mask = lp_build_const_int_vec(gallivm, blend_type, ~0);
+ }
- generate_blend(variant->gallivm,
+ generate_blend(gallivm,
&key->blend,
rt,
builder,
struct lp_fragment_shader_variant *variant;
const struct util_format_description *cbuf0_format_desc;
boolean fullcolormask;
+ unsigned i;
variant = CALLOC_STRUCT(lp_fragment_shader_variant);
if(!variant)
!shader->info.base.uses_kill
? TRUE : FALSE;
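+ /* Record which colour buffers can take the unswizzled path */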
+ for (i = 0; i < key->nr_cbufs; ++i) {
+ variant->unswizzled_cbufs |= llvmpipe_is_format_unswizzled(key->cbuf_format[i]) << i;
+ }
if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
lp_debug_fs_variant(variant);
lp_jit_frag_func jit_function[2];
+ /* Bitmask indicating which cbufs are unswizzled */
+ unsigned unswizzled_cbufs;
+
/* Total number of LLVM instructions generated */
unsigned nr_instrs;
dst = LLVMBuildLoad(builder, dst_ptr, "dst");
con = LLVMBuildLoad(builder, const_ptr, "const");
- res = lp_build_blend_aos(gallivm, blend, &format, type, rt, src, dst, NULL, con, swizzle);
+ res = lp_build_blend_aos(gallivm, blend, &format, type, rt, src, NULL, dst, NULL, con, NULL, swizzle, 4);
lp_build_name(res, "res");
return lp_setup_is_resource_referenced(llvmpipe->setup, presource);
}
+boolean
+llvmpipe_is_format_unswizzled( enum pipe_format format )
+{
+ const struct util_format_description *desc = util_format_description(format);
+ unsigned chan;
+
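+ /* B8G8R8A8/X8 are kept on the existing swizzled path for now,
+ * presumably because it is already well optimised for them.
+ */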
+ if (format == PIPE_FORMAT_B8G8R8X8_UNORM || format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+ return FALSE;
+ }
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+ desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
+ desc->block.width != 1 ||
+ desc->block.height != 1) {
+ return FALSE;
+ }
+
+ for (chan = 0; chan < desc->nr_channels; ++chan) {
+ if (desc->channel[chan].type == UTIL_FORMAT_TYPE_VOID && (chan + 1) == desc->nr_channels)
+ continue;
+
+ if (desc->channel[chan].type != desc->channel[0].type)
+ return FALSE;
+
+ if (desc->channel[chan].normalized != desc->channel[0].normalized)
+ return FALSE;
+
+ if (desc->channel[chan].pure_integer != desc->channel[0].pure_integer)
+ return FALSE;
+ }
+
+ /* All code assumes alpha is the last channel */
+ if (desc->nr_channels == 4 && desc->swizzle[3] < 3) {
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+
+/**
+ * Returns the largest possible alignment for a format in llvmpipe
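+ *
+ * e.g. R32G32B32A32 gives 16, R8G8B8A8 gives 4, R8G8B8 gives 1
+ * (worked examples of the rules below)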
+ */
+unsigned
+llvmpipe_get_format_alignment( enum pipe_format format )
+{
+ const struct util_format_description *desc = util_format_description(format);
+ unsigned size = 0;
+ unsigned bytes;
+ unsigned i;
+
+ for (i = 0; i < desc->nr_channels; ++i) {
+ size += desc->channel[i].size;
+ }
+
+ bytes = size / 8;
+
+ if (!util_is_power_of_two(bytes)) {
+ bytes /= desc->nr_channels;
+ }
+
+ if (bytes % 2 || bytes < 1) {
+ return 1;
+ } else {
+ return bytes;
+ }
+}
/**
struct pipe_resource *presource,
unsigned level, int layer);
+boolean
+llvmpipe_is_format_unswizzled(enum pipe_format format);
+
+unsigned
+llvmpipe_get_format_alignment(enum pipe_format format);
+
#endif /* LP_TEXTURE_H */