From: José Fonseca Date: Wed, 20 Nov 2013 08:32:52 +0000 (+0000) Subject: gallivm: Basic AVX2 support. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=e088390c7ddce4e64559a5dad6235ffc430ac736;p=mesa.git gallivm: Basic AVX2 support. v2: pblendb -> pblendvb Reviewed-by: Roland Scheidegger --- diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index c4e35a21d26..f5cacc460f2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -142,6 +142,20 @@ lp_build_min_simple(struct lp_build_context *bld, intrinsic = "llvm.ppc.altivec.vminfp"; intr_size = 128; } + } else if (HAVE_LLVM < 0x0309 && + util_cpu_caps.has_avx2 && type.length > 4) { + intr_size = 256; + switch (type.width) { + case 8: + intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b"; + break; + case 16: + intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w"; + break; + case 32: + intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d"; + break; + } } else if (HAVE_LLVM < 0x0309 && util_cpu_caps.has_sse2 && type.length >= 2) { intr_size = 128; @@ -346,6 +360,20 @@ lp_build_max_simple(struct lp_build_context *bld, intrinsic = "llvm.ppc.altivec.vmaxfp"; intr_size = 128; } + } else if (HAVE_LLVM < 0x0309 && + util_cpu_caps.has_avx2 && type.length > 4) { + intr_size = 256; + switch (type.width) { + case 8: + intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b"; + break; + case 16: + intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w"; + break; + case 32: + intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d"; + break; + } } else if (HAVE_LLVM < 0x0309 && util_cpu_caps.has_sse2 && type.length >= 2) { intr_size = 128; @@ -526,18 +554,27 @@ lp_build_add(struct lp_build_context *bld, if(a == bld->one || b == bld->one) return bld->one; - if (type.width * type.length == 128 && - !type.floating && !type.fixed) { - if(util_cpu_caps.has_sse2) { - if(type.width == 8) - intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; - if(type.width == 16) - intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; - } else if (util_cpu_caps.has_altivec) { - if(type.width == 8) - intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; - if(type.width == 16) - intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; + if (!type.floating && !type.fixed) { + if (type.width * type.length == 128) { + if(util_cpu_caps.has_sse2) { + if(type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; + if(type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; + } else if (util_cpu_caps.has_altivec) { + if(type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; + if(type.width == 16) + intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; + } + } + if (type.width * type.length == 256) { + if(util_cpu_caps.has_avx2) { + if(type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; + if(type.width == 16) + intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; + } } } @@ -818,18 +855,27 @@ lp_build_sub(struct lp_build_context *bld, if(b == bld->one) return bld->zero; - if (type.width * type.length == 128 && - !type.floating && !type.fixed) { - if (util_cpu_caps.has_sse2) { - if(type.width == 8) - intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; - if(type.width == 16) - intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; - } else if (util_cpu_caps.has_altivec) { - if(type.width == 8) - intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; - if(type.width == 16) - intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; + if (!type.floating && !type.fixed) { + if (type.width * type.length == 128) { + if (util_cpu_caps.has_sse2) { + if(type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; + if(type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; + } else if (util_cpu_caps.has_altivec) { + if(type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; + if(type.width == 16) + intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; + } + } + if (type.width * type.length == 256) { + if (util_cpu_caps.has_avx2) { + if(type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b"; + if(type.width == 16) + intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w"; + } } } @@ -1587,6 +1633,16 @@ lp_build_abs(struct lp_build_context *bld, return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); } } + else if (type.width*type.length == 256 && util_cpu_caps.has_avx2) { + switch(type.width) { + case 8: + return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); + case 16: + return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a); + case 32: + return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a); + } + } else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 && (gallivm_debug & GALLIVM_DEBUG_PERF) && (type.width == 8 || type.width == 16 || type.width == 32)) { diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 22340c081f2..7114cde4384 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -457,6 +457,11 @@ lp_build_init(void) util_cpu_caps.has_f16c = 0; util_cpu_caps.has_fma = 0; } + if (HAVE_LLVM < 0x0304 || !USE_MCJIT) { + /* AVX2 support has only been tested with LLVM 3.4, and it requires + * MCJIT. */ + util_cpu_caps.has_avx2 = 0; + } #ifdef PIPE_ARCH_PPC_64 /* Set the NJ bit in VSCR to 0 so denormalized values are handled as diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index 14bf2369482..1a50e82c241 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -348,7 +348,9 @@ lp_build_select(struct lp_build_context *bld, else if (((util_cpu_caps.has_sse4_1 && type.width * type.length == 128) || (util_cpu_caps.has_avx && - type.width * type.length == 256 && type.width >= 32)) && + type.width * type.length == 256 && type.width >= 32) || + (util_cpu_caps.has_avx2 && + type.width * type.length == 256)) && !LLVMIsConstant(a) && !LLVMIsConstant(b) && !LLVMIsConstant(mask)) { @@ -365,9 +367,13 @@ lp_build_select(struct lp_build_context *bld, intrinsic = "llvm.x86.avx.blendv.pd.256"; arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4); } - else { + else if (type.width == 32) { intrinsic = "llvm.x86.avx.blendv.ps.256"; arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8); + } else { + assert(util_cpu_caps.has_avx2); + intrinsic = "llvm.x86.avx2.pblendvb"; + arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32); } } else if (type.floating && diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index 6bf92c87c49..f91b761dc11 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -1409,6 +1409,9 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, LLVMValueRef mipoff1 = NULL; LLVMValueRef colors0; LLVMValueRef colors1; + boolean use_floats = util_cpu_caps.has_avx && + !util_cpu_caps.has_avx2 && + bld->coord_type.length > 4; /* sample the first mipmap level */ lp_build_mipmap_level_sizes(bld, ilevel0, @@ -1423,7 +1426,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, mipoff0 = lp_build_get_mip_offsets(bld, ilevel0); } - if (util_cpu_caps.has_avx && bld->coord_type.length > 4) { + if (use_floats) { if (img_filter == PIPE_TEX_FILTER_NEAREST) { lp_build_sample_image_nearest_afloat(bld, size0, @@ -1514,7 +1517,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld, mipoff1 = lp_build_get_mip_offsets(bld, ilevel1); } - if (util_cpu_caps.has_avx && bld->coord_type.length > 4) { + if (use_floats) { if (img_filter == PIPE_TEX_FILTER_NEAREST) { lp_build_sample_image_nearest_afloat(bld, size1,