From f64aea0959af955841bbde96885aebacb44b4aaf Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 8 May 2017 12:45:20 -0500 Subject: [PATCH] swr/rast: SIMD16 FE - interleaved simdvertex output in GS Eliminates conversion copies on GS output from simdvertex to simd16vertex. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/core/frontend.cpp | 22 ++++---------- src/gallium/drivers/swr/swr_shader.cpp | 29 +++++++++++++++++-- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 3886c64ccf6..e88246f478f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -717,10 +717,6 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num THREAD SWR_GS_CONTEXT tlsGsContext; -#if USE_SIMD16_FRONTEND -THREAD simd16vertex tempVertex_simd16[128]; - -#endif template struct GsBufferInfo { @@ -819,7 +815,11 @@ static void GeometryShaderStage( tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; } +#if USE_SIMD16_FRONTEND + const GsBufferInfo bufferInfo(state.gsState); +#else const GsBufferInfo bufferInfo(state.gsState); +#endif // record valid prims from the frontend to avoid over binning the newly generated // prims from the GS @@ -923,19 +923,7 @@ static void GeometryShaderStage( } #if USE_SIMD16_FRONTEND - // TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex - - SWR_ASSERT(numEmittedVerts <= 256); - - PackPairsOfSimdVertexIntoSimd16Vertex( - tempVertex_simd16, - reinterpret_cast(pBase), - numEmittedVerts, - SWR_VTX_NUM_SLOTS); - -#endif -#if USE_SIMD16_FRONTEND - PA_STATE_CUT gsPa(pDC, reinterpret_cast(tempVertex_simd16), numEmittedVerts, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); + PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); #else PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp index d55820eb754..2f495f59c23 100644 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -370,8 +370,13 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); +#if USE_SIMD16_FRONTEND + const uint32_t simdVertexStride = sizeof(simdvertex) * 2; + const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2); +#else const uint32_t simdVertexStride = sizeof(simdvertex); - const uint32_t numSimdBatches = (pGS->maxNumVerts + 7) / 8; + const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth; +#endif const uint32_t inputPrimStride = numSimdBatches * simdVertexStride; Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream }); @@ -388,8 +393,14 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base inputPrimStride * 6, inputPrimStride * 7 } ); - Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), 3); - Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), 7); +#if USE_SIMD16_FRONTEND + const uint32_t simdShift = log2(mVWidth * 2); + Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1); +#else + const uint32_t simdShift = log2(mVWidth); + Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1); +#endif + Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift); for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) { uint32_t attribSlot = attrib; @@ -400,10 +411,17 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER) attribSlot = VERTEX_RTAI_SLOT; +#if USE_SIMD16_FRONTEND + Value *vOffsetsAttrib = + ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2))); + vOffsetsAttrib = + ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2))); +#else Value *vOffsetsAttrib = ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex)))); vOffsetsAttrib = ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector)))); +#endif vOffsetsAttrib = ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float)))); @@ -416,8 +434,13 @@ BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base MASKED_SCATTER(vData, vPtrs, 32, vMask1); +#if USE_SIMD16_FRONTEND + vOffsetsAttrib = + ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2)); +#else vOffsetsAttrib = ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar))); +#endif } } } -- 2.30.2