From: George Kyriazis
Date: Tue, 20 Mar 2018 23:13:35 +0000 (-0500)
Subject: swr/rast: Lower PERMD and PERMPS to x86.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d68694016c7e924d7782a16577b3bc278dd681a6;p=mesa.git

swr/rast: Lower PERMD and PERMPS to x86.

Add support for providing an emulation callback function for
arch/width combinations that don't map cleanly to an x86 intrinsic.

Reviewed-by: Bruce Cherniak
---

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 68695c46c81..d8ec885308a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -555,7 +555,7 @@ namespace SwrJit
             // 256i - 0 1 2 3 4 5 6 7
             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-            Value* vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            Value* vi128XY = BITCAST(VPERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
             // after PERMD: move and pack xy components into each 128bit lane
             // 256i - 0 1 2 3 4 5 6 7
             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -565,7 +565,7 @@ namespace SwrJit
             if (info.numComps > 2)
             {
                 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
-                vi128ZW = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+                vi128ZW = BITCAST(VPERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
             }

             for (uint32_t i = 0; i < 4; i++)
@@ -644,7 +644,7 @@ namespace SwrJit
             // 256i - 0 1 2 3 4 5 6 7
             //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
-            Value* vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            Value* vi128XY = BITCAST(VPERMD(vShufResult, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
             // 256i - 0 1 2 3 4 5 6 7
             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
@@ -653,7 +653,7 @@ namespace SwrJit
             Value* vi128ZW = nullptr;
             if (info.numComps > 2)
             {
-                vi128ZW = BITCAST(PERMD(vShufResult, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+                vi128ZW = BITCAST(VPERMD(vShufResult, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
             }

             // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 54987c77246..aa9e2dddee8 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -601,76 +601,6 @@ namespace SwrJit
         return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
     }

-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
-    /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
-    /// platform, emulate it
-    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
-    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-    Value *Builder::PERMD(Value* a, Value* idx)
-    {
-        Value* res;
-        // use avx2 permute instruction if available
-        if(JM()->mArch.AVX2())
-        {
-            res = VPERMD(a, idx);
-        }
-        else
-        {
-            if (isa<Constant>(idx))
-            {
-                res = VSHUFFLE(a, a, idx);
-            }
-            else
-            {
-                res = VUNDEF_I();
-                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
-                {
-                    Value* pIndex = VEXTRACT(idx, C(l));
-                    Value* pVal = VEXTRACT(a, pIndex);
-                    res = VINSERT(res, pVal, C(l));
-                }
-            }
-        }
-        return res;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
-    /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
-    /// platform, emulate it
-    /// @param a - 256bit SIMD lane(8x32bit) of float values.
-    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-    Value *Builder::PERMPS(Value* a, Value* idx)
-    {
-        Value* res;
-        // use avx2 permute instruction if available
-        if (JM()->mArch.AVX2())
-        {
-            // llvm 3.6.0 swapped the order of the args to vpermd
-            res = VPERMPS(idx, a);
-        }
-        else
-        {
-            if (isa<Constant>(idx))
-            {
-                res = VSHUFFLE(a, a, idx);
-            }
-            else
-            {
-                res = VUNDEF_F();
-                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
-                {
-                    Value* pIndex = VEXTRACT(idx, C(l));
-                    Value* pVal = VEXTRACT(a, pIndex);
-                    res = VINSERT(res, pVal, C(l));
-                }
-            }
-        }
-
-        return res;
-    }
-
     //////////////////////////////////////////////////////////////////////////
     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
     /// in LLVM IR. If not supported on the underlying platform, emulate it
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 343a9b03670..7308821c89e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -113,8 +113,6 @@ Value *JOIN_16(Value *a, Value *b);
 Value *PSHUFB(Value* a, Value* b);
 Value *PMOVSXBD(Value* a);
 Value *PMOVSXWD(Value* a);
-Value *PERMD(Value* a, Value* idx);
-Value *PERMPS(Value* a, Value* idx);
 Value *CVTPH2PS(Value* a, const llvm::Twine& name = "");
 Value *CVTPS2PH(Value* a, Value* rounding);
 Value *PMAXSD(Value* a, Value* b);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index f9293aa3b4b..da6d982d0e3 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1955,8 +1955,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
     Value *vi128XY_hi = nullptr;
     if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
     {
-        vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
-        vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+        vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+        vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);

        // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
        // 256i - 0 1 2 3 4 5 6 7
@@ -1968,8 +1968,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
     Value *vi128ZW_hi = nullptr;
     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
     {
-        vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
-        vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+        vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+        vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
     }

     // init denormalize variables if needed
@@ -2306,8 +2306,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
         // 256i - 0 1 2 3 4 5 6 7
         //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-        vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-        vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);

        // after PERMD: move and pack xy components into each 128bit lane
        // 256i - 0 1 2 3 4 5 6 7
@@ -2325,8 +2325,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
         Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
         Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

-        vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-        vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
     }

     // init denormalize variables if needed
@@ -2547,7 +2547,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
         // 256i - 0 1 2 3 4 5 6 7
         //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
-        vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        vi128XY = BITCAST(VPERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
         // after PERMD: move and pack xy components into each 128bit lane
         // 256i - 0 1 2 3 4 5 6 7
         //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -2557,7 +2557,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     Value* vi128ZW = nullptr;
     if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
         Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
-        vi128ZW = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        vi128ZW = BITCAST(VPERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
     }

     // init denormalize variables if needed
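
The x86 lowering pass that now receives these direct VPERMD/VPERMPS calls is not included in this diff; the hunks above only switch the callers over and delete the old hand-rolled PERMD/PERMPS helpers. A minimal, purely hypothetical sketch of the "emulation callback for arch/width combinations" idea the commit message describes might look like the standalone C++ below; the names, types, and table layout are illustrative assumptions, not SWR's actual lowering API.

// Hypothetical sketch only: a lowering table keyed on (intrinsic, arch, width).
// Each entry either names a native x86 instruction or supplies an emulation
// callback for combinations with no clean intrinsic mapping.
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

enum class Arch  { AVX, AVX2, AVX512 };
enum class Width { W256, W512 };

struct Entry
{
    const char* intrinsic;                       // op being lowered, e.g. "PERMD"
    Arch        arch;
    Width       width;
    const char* native;                          // nullptr: no direct x86 mapping
    std::function<std::string(Width)> emulate;   // callback used when native is nullptr
};

// Stand-in for the per-lane VEXTRACT/VINSERT fallback the removed PERMD helper used.
static std::string EmulatePermute(Width w)
{
    return (w == Width::W256) ? "8 x (VEXTRACT, VINSERT)" : "16 x (VEXTRACT, VINSERT)";
}

int main()
{
    const std::vector<Entry> table =
    {
        // AVX2 and wider have a real 32-bit cross-lane permute.
        { "PERMD", Arch::AVX2,   Width::W256, "vpermd", nullptr },
        { "PERMD", Arch::AVX512, Width::W512, "vpermd", nullptr },
        // Plain AVX does not: route this combination through the callback.
        { "PERMD", Arch::AVX,    Width::W256, nullptr,  EmulatePermute },
    };

    for (const Entry& e : table)
    {
        if (e.native != nullptr)
            std::printf("%s -> native %s\n", e.intrinsic, e.native);
        else
            std::printf("%s -> emulated via %s\n", e.intrinsic, e.emulate(e.width).c_str());
    }
    return 0;
}

Keying each entry on both architecture and SIMD width lets AVX2 and wider map straight to vpermd while plain AVX falls back to a per-lane extract/insert loop, which is essentially what the deleted PERMD/PERMPS builder helpers were doing inline before this change.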