struct lp_type mask_type;
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
LLVMValueRef bits[16];
- LLVMValueRef mask;
+ LLVMValueRef mask, bits_vec;
int shift, i;
/*
bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
}
- mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), "");
+ bits_vec = LLVMConstVector(bits, fs_type.length);
+ mask = LLVMBuildAnd(builder, mask, bits_vec, "");
/*
- * mask = mask != 0 ? ~0 : 0
+ * mask = mask == bits ? ~0 : 0
*/
mask = lp_build_compare(gallivm,
- mask_type, PIPE_FUNC_NOTEQUAL,
- mask,
- lp_build_const_int_vec(gallivm, mask_type, 0));
+ mask_type, PIPE_FUNC_EQUAL,
+ mask, bits_vec);
return mask;
}
}
} else if (twiddle) {
/* Twiddle pixels across elements of array */
+ /*
+ * XXX: we should avoid this in some cases, but would need to tell
+ * lp_build_conv to reorder (or deal with it ourselves).
+ */
lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
} else {
/* Do nothing */
}
+/*
+ * Untwiddle and transpose, much like the above.
+ * However, this is after conversion, so we get packed vectors.
+ * At this time this only handles 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data;
+ * the vectors will look like:
+ * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may be swizzled
+ * here); see the scalar sketch after this function. Extending to 16bit
+ * should be trivial. Should also be extended to handle twice-wide
+ * vectors with AVX2...
+ */
+static void
+fs_twiddle_transpose(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef *src,
+ unsigned src_count,
+ LLVMValueRef *dst)
+{
+ unsigned i, j;
+ struct lp_type type64, type16, type32;
+ LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tmp[4], shuf[8];
+ for (j = 0; j < 2; j++) {
+ shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
+ shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
+ shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
+ shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
+ }
+
+ assert(src_count == 4 || src_count == 2 || src_count == 1);
+ assert(type.width == 8);
+ assert(type.length == 16);
+
+ type8_t = lp_build_vec_type(gallivm, type);
+
+ type64 = type;
+ type64.length /= 8;
+ type64.width *= 8;
+ type64_t = lp_build_vec_type(gallivm, type64);
+
+ type16 = type;
+ type16.length /= 2;
+ type16.width *= 2;
+ type16_t = lp_build_vec_type(gallivm, type16);
+
+ type32 = type;
+ type32.length /= 4;
+ type32.width *= 4;
+ type32_t = lp_build_vec_type(gallivm, type32);
+
+ lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
+
+ if (src_count == 1) {
+ /* transpose was no-op, just untwiddle */
+ LLVMValueRef shuf_vec;
+ shuf_vec = LLVMConstVector(shuf, 8);
+ tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
+ tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
+ dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
+ } else if (src_count == 2) {
+ LLVMValueRef shuf_vec;
+ shuf_vec = LLVMConstVector(shuf, 4);
+
+ for (i = 0; i < 2; i++) {
+ tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
+ tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
+ dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
+ }
+ } else {
+ for (j = 0; j < 2; j++) {
+ LLVMValueRef lo, hi, lo2, hi2;
+ /*
+ * Note that if we only really have 3 valid channels (rgb)
+ * and we don't need alpha, we could substitute an undef here
+ * for the respective channel (causing llvm to drop the conversion
+ * for alpha).
+ */
+ /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
+ lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
+ hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
+ lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
+ hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
+ dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
+ dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
+ }
+ }
+}
+
+
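+/*
+ * Purely illustrative aside (not used by the code): a scalar sketch of the
+ * untwiddle that fs_twiddle_transpose() does for the src_count == 1 case,
+ * assuming a hypothetical 16-byte r-only row arriving in quad order
+ * r0r1 r4r5 r2r3 r6r7 r8r9 r12r13 r10r11 r14r15 (the in/out/perm names are
+ * just for the sketch):
+ *
+ *    static const unsigned perm[8] = { 0, 2, 1, 3, 4, 6, 5, 7 };
+ *    for (i = 0; i < 8; i++) {
+ *       out[2*i + 0] = in[2*perm[i] + 0];
+ *       out[2*i + 1] = in[2*perm[i] + 1];
+ *    }
+ *
+ * i.e. exactly the 16-bit shuffle built from shuf[] above; the wider
+ * src_count cases only differ in element width and the extra interleave.
+ */
+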
/**
* Load an unswizzled block of pixels from memory
*/
gep[1] = LLVMBuildAdd(builder, bx, by, "");
dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
- dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
+ dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
+ LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
dst[i] = LLVMBuildLoad(builder, dst_ptr, "");
gep[1] = LLVMBuildAdd(builder, bx, by, "");
src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
- src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
+ src_ptr = LLVMBuildBitCast(builder, src_ptr,
+ LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
lp_build_const_int_vec(gallivm, src_type, db),
"");
- if (db < src_bits) {
+ if (db <= src_bits) {
/* Enough bits in src to fill the remainder */
LLVMValueRef lower = LLVMBuildLShr(builder,
src,
LLVMBuilderRef builder = gallivm->builder;
struct lp_type blend_type;
struct lp_type mem_type;
- unsigned i, j, k;
+ unsigned i, j;
unsigned pixels = block_size / num_srcs;
bool is_arith;
unsigned from_lsb = src_fmt->nr_channels - j - 1;
#endif
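+ /* all channel[j].size bits set, e.g. 0x1f for a 5-bit channel */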
- for (k = 0; k < src_fmt->channel[j].size; ++k) {
- mask |= 1 << k;
- }
+ mask = (1 << src_fmt->channel[j].size) - 1;
/* Extract bits from source */
chans[j] = LLVMBuildLShr(builder,
/* Extract bits */
chans[j] = LLVMBuildLShr(builder,
dst[i],
- lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
+ lp_build_const_int_vec(gallivm, src_type,
+ from_lsb * blend_type.width),
"");
chans[j] = LLVMBuildAnd(builder,
/* If there is a src for each pixel broadcast the alpha across whole row */
if (src_count == block_size) {
for (i = 0; i < src_count; ++i) {
- src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
+ src_alpha[i] = lp_build_broadcast(gallivm,
+ lp_build_vec_type(gallivm, row_type), src_alpha[i]);
}
} else {
unsigned pixels = block_size / src_count;
struct lp_type blend_type;
struct lp_type row_type;
struct lp_type dst_type;
+ struct lp_type ls_type;
unsigned char swizzle[TGSI_NUM_CHANNELS];
unsigned vector_width;
util_blend_state_is_dual(&variant->key.blend, 0);
const boolean is_1d = variant->key.resource_1d;
+ boolean twiddle_after_convert = FALSE;
unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
LLVMValueRef fpstate = 0;
}
/* If 3 channels then pad to include alpha for 4 element transpose */
- if (dst_channels == 3 && !has_alpha) {
+ if (dst_channels == 3) {
+ assert(!has_alpha);
for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
if (swizzle[i] > TGSI_NUM_CHANNELS)
swizzle[i] = 3;
}
if (out_format_desc->nr_channels == 4) {
dst_channels = 4;
+ /*
+ * We use the alpha from the color conversion, not a separate one.
+ * We had to include it for the transpose, hence it will get converted
+ * too (albeit when doing the transpose after conversion, that would
+ * not necessarily be the case any longer).
+ * (It works only with 4 channel dsts, e.g. rgbx formats, because
+ * otherwise we really have padding, not alpha, included.)
+ */
+ has_alpha = true;
}
}
/*
* XXX If we include that here maybe could actually use it instead of
* separate alpha for blending?
+ * (Difficult though, since we actually convert pad channels, not alpha.)
*/
if (dst_channels == 3 && !has_alpha) {
fs_src[i][3] = alpha;
/* We split the row_mask and row_alpha as we want 128bit interleave */
if (fs_type.length == 8) {
- src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels);
- src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels);
+ src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i],
+ 0, src_channels);
+ src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i],
+ src_channels, src_channels);
src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
- src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+ src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
+ src_channels, src_channels);
} else {
src_mask[i] = fs_mask[i];
src_alpha[i] = alpha;
}
if (fs_type.length == 8) {
src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
- src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+ src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
+ src_channels, src_channels);
} else {
src1_alpha[i] = alpha;
}
}
}
+ /*
+ * We actually should generally do conversion first (for non-1d cases)
+ * when the blend format is 8 or 16 bits. The reason is obvious:
+ * there are 2 or 4 times fewer vectors to deal with for the interleave...
+ * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
+ * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
+ * unpack only with 128bit vectors).
+ * Note: 16bit sizes would really need matching pack conversion code.
+ */
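+ /*
+ * To put numbers on the above (illustrative): for a 16-pixel block with
+ * 4 channels, twiddling before conversion has to interleave
+ * 4 * 4 = 16 vectors of 4x32bit (or 8 vectors of 8x32bit with AVX),
+ * while after conversion to 8-bit the same 64 values fit into just
+ * 4 vectors of 16x8bit - hence the 2 or 4 times fewer vectors.
+ */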
+ if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
+ twiddle_after_convert = TRUE;
+ }
+
/*
* Pixel twiddle from fragment shader order to memory order
*/
- src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
- dst_channels, fs_src, src, pad_inline);
- if (dual_source_blend) {
- generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
- fs_src1, src1, pad_inline);
+ if (!twiddle_after_convert) {
+ src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
+ dst_channels, fs_src, src, pad_inline);
+ if (dual_source_blend) {
+ generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
+ fs_src1, src1, pad_inline);
+ }
+ } else {
+ src_count = num_fullblock_fs * dst_channels;
+ /*
+ * We reorder things a bit here, so the cases for 4-wide and 8-wide
+ * (AVX) turn out the same later when untwiddling/transpose (albeit
+ * for true AVX2 path untwiddle needs to be different).
+ * For now just order by colors first (so we can use unpack later).
+ */
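+ /*
+ * E.g. with two 8-wide fs vectors and rgba this gives
+ * src = { r(fs0), r(fs1), g(fs0), g(fs1), b(fs0), b(fs1), a(fs0), a(fs1) }.
+ */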
+ for (j = 0; j < num_fullblock_fs; j++) {
+ for (i = 0; i < dst_channels; i++) {
+ src[i*num_fullblock_fs + j] = fs_src[j][i];
+ if (dual_source_blend) {
+ src1[i*num_fullblock_fs + j] = fs_src1[j][i];
+ }
+ }
+ }
}
src_channels = dst_channels < 3 ? dst_channels : 4;
assert(bits == 128 || bits == 256);
}
+ if (twiddle_after_convert) {
+ fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
+ if (dual_source_blend) {
+ fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
+ }
+ }
/*
* Blend Colour conversion
*/
blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
- blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
- blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), "");
+ blend_color = LLVMBuildPointerCast(builder, blend_color,
+ LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
+ blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color,
+ &i32_zero, 1, ""), "");
/* Convert */
lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
mask_type.length = pixels;
mask_type.width = row_type.width * dst_channels;
- src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+ /*
+ * If mask_type width is smaller than 32bit, this doesn't quite
+ * generate the most efficient code (could use some pack).
+ */
+ src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
+ lp_build_int_vec_type(gallivm, mask_type), "");
mask_type.length *= dst_channels;
mask_type.width /= dst_channels;
}
- src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+ src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
+ lp_build_int_vec_type(gallivm, mask_type), "");
src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
}
*/
dst_alignment = MIN2(16, dst_alignment);
+ ls_type = dst_type;
+
+ if (dst_count > src_count) {
+ if ((dst_type.width == 8 || dst_type.width == 16) &&
+ util_is_power_of_two(dst_type.length) &&
+ dst_type.length * dst_type.width < 128) {
+ /*
+ * Never try to load values as 4xi8 which we will then
+ * concatenate to larger vectors. This gives llvm a real
+ * headache (the problem is the type legalizer (?) will
+ * try to load that as 4xi8 zext to 4xi32 to fill the vector,
+ * then the shuffles to concatenate are more or less impossible
+ * - llvm is easily capable of generating a sequence of 32
+ * pextrb/pinsrb instructions for that. Albeit it appears to
+ * be fixed in llvm 4.0. So, load and concatenate with 32bit
+ * width to avoid the trouble (16bit seems not as bad - llvm
+ * probably recognizes the load+shuffle since only one shuffle
+ * is necessary there - but we can do just the same anyway).
+ */
+ ls_type.length = dst_type.length * dst_type.width / 32;
+ ls_type.width = 32;
+ }
+ }
+
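+ /*
+ * Illustrative numbers: a dst_type of 8x8bit yields ls_type = 2x32bit,
+ * so each row is loaded as two i32 lanes rather than eight i8 lanes;
+ * a 4x8bit dst_type becomes a single i32 (ls_type.length == 1).
+ */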
if (is_1d) {
load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
- dst, dst_type, dst_count / 4, dst_alignment);
+ dst, ls_type, dst_count / 4, dst_alignment);
for (i = dst_count / 4; i < dst_count; i++) {
- dst[i] = lp_build_undef(gallivm, dst_type);
+ dst[i] = lp_build_undef(gallivm, ls_type);
}
}
else {
load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
- dst, dst_type, dst_count, dst_alignment);
+ dst, ls_type, dst_count, dst_alignment);
}
* on all 16 pixels in that single vector at once.
*/
if (dst_count > src_count) {
- lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count);
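+ /*
+ * ls_type.length == 1 means the rows were loaded as bare scalars
+ * (lp_build_vec_type() returns the element type for length 1), so
+ * bitcast them to single-element vectors before concatenating.
+ */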
+ if (ls_type.length != dst_type.length && ls_type.length == 1) {
+ LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
+ LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
+ for (i = 0; i < dst_count; i++) {
+ dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
+ }
+ }
+
+ lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
+
+ if (ls_type.length != dst_type.length) {
+ struct lp_type tmp_type = dst_type;
+ tmp_type.length = dst_type.length * 4 / src_count;
+ for (i = 0; i < src_count; i++) {
+ dst[i] = LLVMBuildBitCast(builder, dst[i],
+ lp_build_vec_type(gallivm, tmp_type), "");
+ }
+ }
}
/*
* It seems some cleanup could be done here (like skipping conversion/blend
* when not needed).
*/
- convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, row_type, dst, src_count);
+ convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
+ row_type, dst, src_count);
/*
* FIXME: Really should get logic ops / masks out of generic blend / row
pad_inline ? 4 : dst_channels);
}
- convert_from_blend_type(gallivm, block_size, out_format_desc, row_type, dst_type, dst, src_count);
+ convert_from_blend_type(gallivm, block_size, out_format_desc,
+ row_type, dst_type, dst, src_count);
/* Split the blend rows back to memory rows */
if (dst_count > src_count) {
for (i = 0; i < key->nr_cbufs; ++i) {
debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
}
- if (key->depth.enabled) {
+ if (key->depth.enabled || key->stencil[0].enabled) {
debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
+ }
+ if (key->depth.enabled) {
debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
debug_printf("depth.writemask = %u\n", key->depth.writemask);
}
static void
llvmpipe_set_constant_buffer(struct pipe_context *pipe,
- uint shader, uint index,
+ enum pipe_shader_type shader, uint index,
const struct pipe_constant_buffer *cb)
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);