From f4821daed1ec185d16f9ee3cc0951d306ce6e2b9 Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Thu, 22 Dec 2016 03:55:17 +0100 Subject: [PATCH] llvmpipe: do transpose/untwiddle after conversion for 8bit formats Generally we should do tranpose after conversion, if the format has less than 32 bits per channel (if it has 32 bits, conversion is going to be a no-op anyway...). This is obviously because there's less vectors to deal with. Though the advantage for 16 bit formats isn't that big, and in fact with AVX there isn't really any (as the 32bit unpacks can be done with 256bit, but the smaller ones cannot, although that would change again with proper AVX2 support). Only makes sense for 2d and not 1d cases. And to keep things easy, only handle 1,2 and 4 channels (rgbx is just fine). For rgba unorm8 format the backend conversion sums up to these instruction totals (not counting the movs for SSE2 due to 2-op syntax - generally every 2 unpacks need an additional mov). SSE2 AVX transpose: 32 unpack 16 unpack untwiddle: 0 8 (128bit low/high permutes) convert: 16 mul + 16 cvt 8 mul + 8 cvt 32->8bit: 12 pack 8 (128bit extract) + 12 pack When doing transpose/untwiddle afterwards we get: convert: 16 mul + 16 cvt 8 mul + 8 cvt 32->8bit: 12 pack 8 (128bit extract) + 12 pack transpose/untwiddle 12 unpack 12 unpack So for SSE2, this drops 20 unpacks (total instruction count 76->56) whereas for AVX it replaces the 16 256bit unpacks with 8 128bit ones and drops the 8 lo/hi permutes (in total 60->48). (Albeit to be fair, the permutes could be dropped even when doing the transpose first, they are extremely pointless but we'd need to be able to tell lp_build_conv to reorder the vectors, for AVX2 we're going to need to be able to tell lp_build_conv about ordering in any case.) (With different ordering going into conversion, it would be possible to do 4 unpacks + 4 pshufbs instead of 12 unpacks, but that might not be better, and not all cpus can do it. Proper AVX2 support should eliminate the 8 128bit extracts, reduce these 12 packs to 6 and the 12 unpacks to 2 pshufb + 2 permq ideally (+ 2 final 128bit extracts).) Reviewed-by: Jose Fonseca --- src/gallium/drivers/llvmpipe/lp_state_fs.c | 150 ++++++++++++++++++++- 1 file changed, 143 insertions(+), 7 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 2c0339cad60..af47b5280ce 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -734,6 +734,10 @@ generate_fs_twiddle(struct gallivm_state *gallivm, } } else if (twiddle) { /* Twiddle pixels across elements of array */ + /* + * XXX: we should avoid this in some cases, but would need to tell + * lp_build_conv to reorder (or deal with it ourselves). + */ lp_bld_quad_twiddle(gallivm, type, src, src_count, dst); } else { /* Do nothing */ @@ -764,6 +768,94 @@ generate_fs_twiddle(struct gallivm_state *gallivm, } +/* + * Untwiddle and transpose, much like the above. + * However, this is after conversion, so we get packed vectors. + * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data, + * the vectors will look like: + * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may + * be swizzled here). Extending to 16bit should be trivial. + * Should also be extended to handle twice wide vectors with AVX2... + */ +static void +fs_twiddle_transpose(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef *src, + unsigned src_count, + LLVMValueRef *dst) +{ + unsigned i, j; + struct lp_type type64, type16, type32; + LLVMTypeRef type64_t, type8_t, type16_t, type32_t; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef tmp[4], shuf[8]; + for (j = 0; j < 2; j++) { + shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0); + shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2); + shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1); + shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3); + } + + assert(src_count == 4 || src_count == 2 || src_count == 1); + assert(type.width == 8); + assert(type.length == 16); + + type8_t = lp_build_vec_type(gallivm, type); + + type64 = type; + type64.length /= 8; + type64.width *= 8; + type64_t = lp_build_vec_type(gallivm, type64); + + type16 = type; + type16.length /= 2; + type16.width *= 2; + type16_t = lp_build_vec_type(gallivm, type16); + + type32 = type; + type32.length /= 4; + type32.width *= 4; + type32_t = lp_build_vec_type(gallivm, type32); + + lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp); + + if (src_count == 1) { + /* transpose was no-op, just untwiddle */ + LLVMValueRef shuf_vec; + shuf_vec = LLVMConstVector(shuf, 8); + tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, ""); + tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, ""); + dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, ""); + } else if (src_count == 2) { + LLVMValueRef shuf_vec; + shuf_vec = LLVMConstVector(shuf, 4); + + for (i = 0; i < 2; i++) { + tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, ""); + tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, ""); + dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, ""); + } + } else { + for (j = 0; j < 2; j++) { + LLVMValueRef lo, hi, lo2, hi2; + /* + * Note that if we only really have 3 valid channels (rgb) + * and we don't need alpha we could substitute a undef here + * for the respective channel (causing llvm to drop conversion + * for alpha). + */ + /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */ + lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, ""); + hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, ""); + lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0); + hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1); + dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, ""); + dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, ""); + } + } +} + + /** * Load an unswizzled block of pixels from memory */ @@ -1656,6 +1748,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, util_blend_state_is_dual(&variant->key.blend, 0); const boolean is_1d = variant->key.resource_1d; + boolean twiddle_after_convert = FALSE; unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs; LLVMValueRef fpstate = 0; @@ -1872,14 +1965,45 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, } } + /* + * We actually should generally do conversion first (for non-1d cases) + * when the blend format is 8 or 16 bits. The reason is obvious, + * there's 2 or 4 times less vectors to deal with for the interleave... + * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit + * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit + * unpack only with 128bit vectors). + * Note: for 16bit sizes really need matching pack conversion code + */ + if (!is_1d && dst_channels != 3 && dst_type.width == 8) { + twiddle_after_convert = TRUE; + } + /* * Pixel twiddle from fragment shader order to memory order */ - src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, - dst_channels, fs_src, src, pad_inline); - if (dual_source_blend) { - generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, - fs_src1, src1, pad_inline); + if (!twiddle_after_convert) { + src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, + dst_channels, fs_src, src, pad_inline); + if (dual_source_blend) { + generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, + fs_src1, src1, pad_inline); + } + } else { + src_count = num_fullblock_fs * dst_channels; + /* + * We reorder things a bit here, so the cases for 4-wide and 8-wide + * (AVX) turn out the same later when untwiddling/transpose (albeit + * for true AVX2 path untwiddle needs to be different). + * For now just order by colors first (so we can use unpack later). + */ + for (j = 0; j < num_fullblock_fs; j++) { + for (i = 0; i < dst_channels; i++) { + src[i*num_fullblock_fs + j] = fs_src[j][i]; + if (dual_source_blend) { + src1[i*num_fullblock_fs + j] = fs_src1[j][i]; + } + } + } } src_channels = dst_channels < 3 ? dst_channels : 4; @@ -1923,6 +2047,12 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, assert(bits == 128 || bits == 256); } + if (twiddle_after_convert) { + fs_twiddle_transpose(gallivm, row_type, src, src_count, src); + if (dual_source_blend) { + fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1); + } + } /* * Blend Colour conversion @@ -2008,13 +2138,19 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, mask_type.length = pixels; mask_type.width = row_type.width * dst_channels; - src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), ""); + /* + * If mask_type width is smaller than 32bit, this doesn't quite + * generate the most efficient code (could use some pack). + */ + src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], + lp_build_int_vec_type(gallivm, mask_type), ""); mask_type.length *= dst_channels; mask_type.width /= dst_channels; } - src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), ""); + src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], + lp_build_int_vec_type(gallivm, mask_type), ""); src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length); } -- 2.30.2