From 0cd9ad98a3364f9be04964069ec602bb0ba3119d Mon Sep 17 00:00:00 2001 From: George Kyriazis Date: Fri, 19 Jan 2018 15:47:06 -0600 Subject: [PATCH] swr/rast: AVX-512 changes to enable 16-wide VS Add a new define (USE_SIMD16_VS) to denote calling a 16-wide vertex shader. This is needed because the mesa driver can do 16-wide shaders, but rasty cannot yet, so we need to distinguish. Create a new VertexID entry (VertexID16) for the USE_SIMD16_VS case, since we need to format the vertex id in a way that is digestible by the 16-wide VS. Disabled for now. To be enabled in a future checkin when driver work is complete. Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/codegen/gen_llvm_types.py | 4 +++ .../drivers/swr/rasterizer/core/frontend.cpp | 28 +++++++++++++------ .../drivers/swr/rasterizer/core/knobs.h | 1 + .../drivers/swr/rasterizer/core/state.h | 4 +++ 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py index 398cde3ed92..a127976fd2d 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py @@ -60,6 +60,10 @@ def gen_llvm_type(type, name, idx, is_pointer, is_pointer_pointer, is_array, is_ llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)' elif type == 'simdscalari': llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)' + elif type == 'simd16scalar': + llvm_type = 'VectorType::get(Type::getFloatTy(ctx), 16)' + elif type == 'simd16scalari': + llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), 16)' elif type == '__m128i': llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), 4)' elif type == 'SIMD256::Float': diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 727b7105cdf..9600f7851ae 100644 --- 
a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -485,6 +485,13 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) return _simd_castps_si(_simd_vmask_ps(mask)); } +static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining) +{ + uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining; + uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; + return _simd16_castps_si(_simd16_vmask_ps(mask)); +} + ////////////////////////////////////////////////////////////////////////// /// @brief StreamOut - Streams vertex data out to SO buffers. /// Generally, we are only streaming out a SIMDs worth of triangles. @@ -1733,9 +1740,11 @@ void ProcessDraw( // forward fetch generated vertex IDs to the vertex shader #if USE_SIMD16_SHADERS -#if 0 - vsContext_lo.VertexID = _simd16_extract(fetchInfo_lo.VertexID, 0); - vsContext_hi.VertexID = _simd16_extract(fetchInfo_lo.VertexID, 1); +#if USE_SIMD16_VS + vsContext_lo.VertexID16 = _simd16_insert_si( + vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0); + vsContext_lo.VertexID16 = _simd16_insert_si( + vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1); #else vsContext_lo.VertexID = fetchInfo_lo.VertexID; vsContext_hi.VertexID = fetchInfo_lo.VertexID2; @@ -1746,20 +1755,19 @@ void ProcessDraw( #endif // Setup active mask for vertex shader. 
+#if USE_SIMD16_VS + vsContext_lo.mask16 = GenerateMask16(endVertex - i); +#else vsContext_lo.mask = GenerateMask(endVertex - i); vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH)); +#endif // forward cut mask to the PA if (IsIndexedT::value) { #if USE_SIMD16_SHADERS -#if 0 - *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(_simd16_extract(fetchInfo_lo.CutMask, 0))); - *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(_simd16_extract(fetchInfo_lo.CutMask, 1))); -#else *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2)); -#endif #else *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask)); @@ -1773,12 +1781,16 @@ void ProcessDraw( #endif { AR_BEGIN(FEVertexShader, pDC->drawId); +#if USE_SIMD16_VS + state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo); +#else state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo); if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH { state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi); } +#endif AR_END(FEVertexShader, 0); UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index e00e2da650f..b6ab382a561 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -42,6 +42,7 @@ #define USE_8x2_TILE_BACKEND 1 #define USE_SIMD16_FRONTEND 1 #define USE_SIMD16_SHADERS 0 // requires USE_SIMD16_FRONTEND +#define USE_SIMD16_VS 0 // requires USE_SIMD16_SHADERS /////////////////////////////////////////////////////////////////////////////// // Architecture validation diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index c93c37b4687..c8995b313f9 100644 --- 
a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -227,6 +227,10 @@ struct SWR_VS_CONTEXT simdscalari mask; // IN: Active mask for shader #if USE_SIMD16_FRONTEND uint32_t AlternateOffset; // IN: amount to offset for interleaving even/odd simd8 in simd16vertex output +#if USE_SIMD16_VS + simd16scalari mask16; // IN: Active mask for shader (16-wide) + simd16scalari VertexID16; // IN: Vertex ID (16-wide) +#endif #endif }; -- 2.30.2