From: Roland Scheidegger Date: Thu, 22 Dec 2016 02:48:05 +0000 (+0100) Subject: llvmpipe: use scalar load instead of vectors for small vectors in fs backend X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=53c2d24a24a631a5be0a9c4df88f23dda1e8685c;p=mesa.git llvmpipe: use scalar load instead of vectors for small vectors in fs backend llvm has _huge_ problems trying to load things like <4 x i8> vectors and stitching such loads together to form 128bit vectors. My understanding of the problem is that the type legalizer tries to extend that to really a <4 x i32> vector and not a <16 x i8> vector with the 4 elements first then followed by padding, so the shuffles for combining things together are more or less impossible - you can in fact see the pmovzxd llvm generates. Pre-4.0 llvm just gives up on it completely and does a 30+ pextrb/pinsrb sequence instead. It looks like current llvm has fixed this behavior (my guess would be due to better shuffle combination and load/shuffle folds), but we can avoid this by just loading as <1 x i32> values, combining those, and only casting at the end. (I suspect it might also work if we'd pad the loaded vectors immediately before shuffling them together, instead of directly stitching 2 such vectors together pairwise before combining the pair. But this _might_ lose the ability to load the values directly into their right place in the vector with pinsrd.). But using 32bit values is probably easier for llvm as it will never give it funny ideas about what the vector should look like. (This is possibly only a problem for 1x8bit formats, since 2x8bit will end up fetching 64bit hence only two vectors are stitched together, not 4, but we use the same strategy anyway.) 
Reviewed-by: Jose Fonseca --- diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index e56ce1dc8df..b6f4c2a36c9 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -799,7 +799,8 @@ load_unswizzled_block(struct gallivm_state *gallivm, gep[1] = LLVMBuildAdd(builder, bx, by, ""); dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, ""); - dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), ""); + dst_ptr = LLVMBuildBitCast(builder, dst_ptr, + LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), ""); dst[i] = LLVMBuildLoad(builder, dst_ptr, ""); @@ -843,7 +844,8 @@ store_unswizzled_block(struct gallivm_state *gallivm, gep[1] = LLVMBuildAdd(builder, bx, by, ""); src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, ""); - src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), ""); + src_ptr = LLVMBuildBitCast(builder, src_ptr, + LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), ""); src_ptr = LLVMBuildStore(builder, src[i], src_ptr); @@ -1632,6 +1634,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, struct lp_type blend_type; struct lp_type row_type; struct lp_type dst_type; + struct lp_type ls_type; unsigned char swizzle[TGSI_NUM_CHANNELS]; unsigned vector_width; @@ -2057,17 +2060,41 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, */ dst_alignment = MIN2(16, dst_alignment); + ls_type = dst_type; + + if (dst_count > src_count) { + if ((dst_type.width == 8 || dst_type.width == 16) && + util_is_power_of_two(dst_type.length) && + dst_type.length * dst_type.width < 128) { + /* + * Never try to load values as 4xi8 which we will then + * concatenate to larger vectors. This gives llvm a real + * headache (the problem is the type legalizer (?) 
will + * try to load that as 4xi8 zext to 4xi32 to fill the vector, + * then the shuffles to concatenate are more or less impossible + * - llvm is easily capable of generating a sequence of 32 + * pextrb/pinsrb instructions for that. Albeit it appears to + * be fixed in llvm 4.0. So, load and concatenate with 32bit + * width to avoid the trouble (16bit seems not as bad, llvm + * probably recognizes the load+shuffle as only one shuffle + * is necessary, but we can do just the same anyway). + */ + ls_type.length = dst_type.length * dst_type.width / 32; + ls_type.width = 32; + } + } + if (is_1d) { load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, - dst, dst_type, dst_count / 4, dst_alignment); + dst, ls_type, dst_count / 4, dst_alignment); for (i = dst_count / 4; i < dst_count; i++) { - dst[i] = lp_build_undef(gallivm, dst_type); + dst[i] = lp_build_undef(gallivm, ls_type); } } else { load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, - dst, dst_type, dst_count, dst_alignment); + dst, ls_type, dst_count, dst_alignment); } @@ -2082,7 +2109,24 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, * on all 16 pixels in that single vector at once. */ if (dst_count > src_count) { - lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count); + if (ls_type.length != dst_type.length && ls_type.length == 1) { + LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type); + LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1); + for (i = 0; i < dst_count; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, ""); + } + } + + lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count); + + if (ls_type.length != dst_type.length) { + struct lp_type tmp_type = dst_type; + tmp_type.length = dst_type.length * 4 / src_count; + for (i = 0; i < src_count; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], + lp_build_vec_type(gallivm, tmp_type), ""); + } + } } /*