From 08a71368483f2e35b135ebe56ec5746cc94ac452 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Thu, 6 Apr 2017 15:22:55 -0500 Subject: [PATCH] swr: [rasterizer core] SIMD16 Frontend WIP - Clipper Implement widened clipper for SIMD16. Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/common/simd16intrin.h | 41 +- .../drivers/swr/rasterizer/core/binner.cpp | 17 +- .../drivers/swr/rasterizer/core/clip.cpp | 91 +- .../drivers/swr/rasterizer/core/clip.h | 1027 +++++++++++++++-- .../drivers/swr/rasterizer/core/frontend.h | 29 +- 5 files changed, 1011 insertions(+), 194 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index e5c34c2c876..fee50d0637e 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -436,7 +436,7 @@ INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a) } template -INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b) +INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b) { simd16scalar result; @@ -446,12 +446,14 @@ INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b) return result; } -#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b) -#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b) -#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b) -#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b) -#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b) -#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b) +#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp(a, b) + +#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ) +#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ) +#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ) +#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ) +#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ) +#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ) SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps) SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps) @@ -525,8 +527,8 @@ SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32) INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) { - int lo = _mm256_testz_ps(a.lo, b.lo); - int hi = _mm256_testz_ps(a.hi, b.hi); + int lo = _simd_testz_ps(a.lo, b.lo); + int hi = _simd_testz_ps(a.hi, b.hi); return lo & hi; } @@ -912,19 +914,19 @@ INLINE int _simd16_movemask_epi8(simd16scalari a) template INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b) { - simd16mask k = _mm512_cmpeq_ps_mask(a, b); + simd16mask k = _mm512_cmp_ps_mask(a, b, comp); return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF))); } #define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp(a, b) -#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b) -#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b) -#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b) -#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b) -#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b) -#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b) +#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ) +#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ) +#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ) +#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ) +#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ) +#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ) #define _simd16_castsi_ps _mm512_castsi512_ps #define _simd16_castps_si _mm512_castps_si512 @@ -982,17 +984,14 @@ INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b) return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)); } -#if 0 INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) { - int lo = _mm256_testz_ps(a.lo, b.lo); - int hi = _mm256_testz_ps(a.hi, b.hi); + int lo = _simd_testz_ps(_simd16_extract_ps(a, 0), _simd16_extract_ps(b, 0)); + int hi = _simd_testz_ps(_simd16_extract_ps(a, 1), _simd16_extract_ps(b, 1)); return lo & hi; } -#endif - #define _simd16_unpacklo_ps _mm512_unpacklo_ps #define _simd16_unpackhi_ps _mm512_unpackhi_ps #define _simd16_unpacklo_pd _mm512_unpacklo_pd diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index eb1f20b5a35..239c49741b9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -1007,16 +1007,6 @@ endBinTriangles: } #if USE_SIMD16_FRONTEND -inline uint32_t GetPrimMaskLo(uint32_t primMask) -{ - return primMask & 255; -} - -inline uint32_t GetPrimMaskHi(uint32_t primMask) -{ - return (primMask >> 8) & 255; -} - template void BinTriangles_simd16( DRAW_CONTEXT *pDC, @@ -1105,8 +1095,8 @@ void BinTriangles_simd16( calcDeterminantIntVertical(vAi, vBi, vDet); // cull zero area - simd16mask maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si()))); - simd16mask maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si()))); + uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si()))); + uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si()))); uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2)); @@ -1133,7 +1123,6 @@ void BinTriangles_simd16( maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0]))); maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1]))); } - frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2)); // cull @@ -2756,7 +2745,7 @@ void BinLines_simd16( { viewportTransform<2>(prim, state.vpMatrices); } -} + } // adjust for pixel center location simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation]; diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index 6fc7e162b4f..6a5bf6ccde2 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -33,6 +33,9 @@ // Temp storage used by the clipper THREAD simdvertex tlsTempVertices[7]; +#if USE_SIMD16_FRONTEND +THREAD simd16vertex tlsTempVertices_simd16[7]; +#endif float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) { @@ -185,16 +188,6 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p } #if USE_SIMD16_FRONTEND -inline uint32_t GetPrimMaskLo(uint32_t primMask) -{ - return primMask & 255; -} - -inline uint32_t GetPrimMaskHi(uint32_t primMask) -{ - return (primMask >> 8) & 255; -} - void ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx) { SWR_CONTEXT *pContext = pDC->pContext; @@ -204,32 +197,8 @@ void ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, si Clipper clipper(workerId, pDC); - simdvector verts[VERTS_PER_PRIM]; - - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prims[i][j], 0); - } - } - pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0)); - - if (GetPrimMaskHi(primMask)) - { - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prims[i][j], 1); - } - } - - pa.useAlternateOffset = true; - clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1)); - } + clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); AR_END(FEClipTriangles, 1); } @@ -243,32 +212,8 @@ void ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16 Clipper clipper(workerId, pDC); - simdvector verts[VERTS_PER_PRIM]; - - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prims[i][j], 0); - } - } - pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0)); - - if (GetPrimMaskHi(primMask)) - { - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prims[i][j], 1); - } - } - - pa.useAlternateOffset = true; - clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1)); - } + clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); AR_END(FEClipLines, 1); } @@ -282,32 +227,8 @@ void ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd1 Clipper clipper(workerId, pDC); - simdvector verts[VERTS_PER_PRIM]; - - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prims[i][j], 0); - } - } - pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0)); - - if (GetPrimMaskHi(primMask)) - { - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prims[i][j], 1); - } - } - - pa.useAlternateOffset = true; - clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1)); - } + clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); AR_END(FEClipPoints, 1); } diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 017f5e795c4..39f29c16661 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -34,6 +34,9 @@ // Temp storage used by the clipper extern THREAD simdvertex tlsTempVertices[7]; +#if USE_SIMD16_FRONTEND +extern THREAD simd16vertex tlsTempVertices_simd16[7]; +#endif enum SWR_CLIPCODES { @@ -126,6 +129,76 @@ void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscal clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM)))); } +#if USE_SIMD16_FRONTEND +INLINE +void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari viewportIndexes) +{ + clipCodes = _simd16_setzero_ps(); + + // -w + simd16scalar vNegW = _simd16_mul_ps(vertex.w, _simd16_set1_ps(-1.0f)); + + // FRUSTUM_LEFT + simd16scalar vRes = _simd16_cmplt_ps(vertex.x, vNegW); + clipCodes = _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_LEFT))); + + // FRUSTUM_TOP + vRes = _simd16_cmplt_ps(vertex.y, vNegW); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_TOP)))); + + // FRUSTUM_RIGHT + vRes = _simd16_cmpgt_ps(vertex.x, vertex.w); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_RIGHT)))); + + // FRUSTUM_BOTTOM + vRes = _simd16_cmpgt_ps(vertex.y, vertex.w); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_BOTTOM)))); + + if (state.rastState.depthClipEnable) + { + // FRUSTUM_NEAR + // DX clips depth [0..w], GL clips [-w..w] + if (state.rastState.clipHalfZ) + { + vRes = _simd16_cmplt_ps(vertex.z, _simd16_setzero_ps()); + } + else + { + vRes = _simd16_cmplt_ps(vertex.z, vNegW); + } + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_NEAR)))); + + // FRUSTUM_FAR + vRes = _simd16_cmpgt_ps(vertex.z, vertex.w); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_FAR)))); + } + + // NEGW + vRes = _simd16_cmple_ps(vertex.w, _simd16_setzero_ps()); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(NEGW)))); + + // GUARDBAND_LEFT + simd16scalar gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4)); + vRes = _simd16_cmplt_ps(vertex.x, gbMult); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_LEFT)))); + + // GUARDBAND_TOP + gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4)); + vRes = _simd16_cmplt_ps(vertex.y, gbMult); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_TOP)))); + + // GUARDBAND_RIGHT + gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4)); + vRes = _simd16_cmpgt_ps(vertex.x, gbMult); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_RIGHT)))); + + // GUARDBAND_BOTTOM + gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4)); + vRes = _simd16_cmpgt_ps(vertex.y, gbMult); + clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_BOTTOM)))); +} + +#endif template class Clipper { @@ -144,6 +217,16 @@ public: } } +#if USE_SIMD16_FRONTEND + void ComputeClipCodes(simd16vector vertex[], simd16scalari viewportIndexes) + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + ::ComputeClipCodes(this->state, vertex[i], this->clipCodes_simd16[i], viewportIndexes); + } + } + +#endif simdscalar ComputeClipCodeIntersection() { simdscalar result = this->clipCodes[0]; @@ -154,6 +237,18 @@ public: return result; } +#if USE_SIMD16_FRONTEND + simd16scalar ComputeClipCodeIntersection_simd16() + { + simd16scalar result = this->clipCodes_simd16[0]; + for (uint32_t i = 1; i < NumVertsPerPrim; ++i) + { + result = _simd16_and_ps(result, this->clipCodes_simd16[i]); + } + return result; + } + +#endif simdscalar ComputeClipCodeUnion() { simdscalar result = this->clipCodes[0]; @@ -164,6 +259,18 @@ public: return result; } +#if USE_SIMD16_FRONTEND + simd16scalar ComputeClipCodeUnion_simd16() + { + simd16scalar result = this->clipCodes_simd16[0]; + for (uint32_t i = 1; i < NumVertsPerPrim; ++i) + { + result = _simd16_or_ps(result, this->clipCodes_simd16[i]); + } + return result; + } + +#endif int ComputeNegWMask() { simdscalar clipCodeUnion = ComputeClipCodeUnion(); @@ -178,6 +285,15 @@ public: return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps())); } +#if USE_SIMD16_FRONTEND + int ComputeClipMask_simd16() + { + simd16scalar clipUnion = ComputeClipCodeUnion_simd16(); + clipUnion = _simd16_and_ps(clipUnion, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_CLIP_MASK))); + return _simd16_movemask_ps(_simd16_cmpneq_ps(clipUnion, _simd16_setzero_ps())); + } + +#endif // clipper is responsible for culling any prims with NAN coordinates int ComputeNaNMask(simdvector prim[]) { @@ -193,6 +309,22 @@ public: return _simd_movemask_ps(vNanMask); } +#if USE_SIMD16_FRONTEND + int ComputeNaNMask(simd16vector prim[]) + { + simd16scalar vNanMask = _simd16_setzero_ps(); + for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + { + simd16scalar vNan01 = _simd16_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q); + vNanMask = _simd16_or_ps(vNanMask, vNan01); + simd16scalar vNan23 = _simd16_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q); + vNanMask = _simd16_or_ps(vNanMask, vNan23); + } + + return _simd16_movemask_ps(vNanMask); + } + +#endif int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[]) { uint8_t cullMask = this->state.rastState.cullDistanceMask; @@ -258,6 +390,74 @@ public: return _simd_movemask_ps(vClipCullMask); } +#if USE_SIMD16_FRONTEND + int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[]) + { + uint8_t cullMask = this->state.rastState.cullDistanceMask; + simd16scalar vClipCullMask = _simd16_setzero_ps(); + + simd16vector vClipCullDistLo[3]; + simd16vector vClipCullDistHi[3]; + + pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); + pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); + + DWORD index; + while (_BitScanForward(&index, cullMask)) + { + cullMask &= ~(1 << index); + uint32_t slot = index >> 2; + uint32_t component = index & 0x3; + + simd16scalar vCullMaskElem = _simd16_set1_ps(-1.0f); + for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + { + simd16scalar vCullComp; + if (slot == 0) + { + vCullComp = vClipCullDistLo[e][component]; + } + else + { + vCullComp = vClipCullDistHi[e][component]; + } + + // cull if cull distance < 0 || NAN + simd16scalar vCull = _simd16_cmp_ps(_simd16_setzero_ps(), vCullComp, _CMP_NLE_UQ); + vCullMaskElem = _simd16_and_ps(vCullMaskElem, vCull); + } + vClipCullMask = _simd16_or_ps(vClipCullMask, vCullMaskElem); + } + + // clipper should also discard any primitive with NAN clip distance + uint8_t clipMask = this->state.rastState.clipDistanceMask; + while (_BitScanForward(&index, clipMask)) + { + clipMask &= ~(1 << index); + uint32_t slot = index >> 2; + uint32_t component = index & 0x3; + + for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + { + simd16scalar vClipComp; + if (slot == 0) + { + vClipComp = vClipCullDistLo[e][component]; + } + else + { + vClipComp = vClipCullDistHi[e][component]; + } + + simd16scalar vClip = _simd16_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q); + vClipCullMask = _simd16_or_ps(vClipCullMask, vClip); + } + } + + return _simd16_movemask_ps(vClipCullMask); + } + +#endif // clip SIMD primitives void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx) { @@ -516,91 +716,396 @@ public: UPDATE_STAT_FE(CPrimitives, numClippedPrims); } - // execute the clipper stage - void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) +#if USE_SIMD16_FRONTEND + void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId, const simd16scalari& vViewportIdx) { - SWR_ASSERT(pa.pDC != nullptr); - SWR_CONTEXT* pContext = pa.pDC->pContext; + // input/output vertex store for clipper + simd16vertex vertices[7]; // maximum 7 verts generated per triangle - // set up binner based on PA state - PFN_PROCESS_PRIMS pfnBinner; - switch (pa.binTopology) + LONG constantInterpMask = this->state.backendState.constantInterpolationMask; + uint32_t provokingVertex = 0; + if (pa.binTopology == TOP_TRIANGLE_FAN) { - case TOP_POINT_LIST: - pfnBinner = BinPoints; - break; - case TOP_LINE_LIST: - case TOP_LINE_STRIP: - case TOP_LINE_LOOP: - case TOP_LINE_LIST_ADJ: - case TOP_LISTSTRIP_ADJ: - pfnBinner = BinLines; - break; - default: - pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0)); - break; - }; - - // update clipper invocations pipeline stat - uint32_t numInvoc = _mm_popcnt_u32(primMask); - UPDATE_STAT_FE(CInvocations, numInvoc); - - ComputeClipCodes(prim, viewportIdx); - - // cull prims with NAN coords - primMask &= ~ComputeNaNMask(prim); + provokingVertex = this->state.frontendState.provokingVertex.triFan; + } + ///@todo: line topology for wireframe? - // user cull distance cull - if (this->state.rastState.cullDistanceMask) + // assemble pos + simd16vector tmpVector[NumVertsPerPrim]; + pa.Assemble_simd16(VERTEX_POSITION_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { - primMask &= ~ComputeUserClipCullMask(pa, prim); + vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; } - // cull prims outside view frustum - simdscalar clipIntersection = ComputeClipCodeIntersection(); - int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps())); + // assemble attribs + const SWR_BACKEND_STATE& backendState = this->state.backendState; - // skip clipping for points - uint32_t clipMask = 0; - if (NumVertsPerPrim != 1) + int32_t maxSlot = -1; + for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot) { - clipMask = primMask & ComputeClipMask(); + // Compute absolute attrib slot in vertex array + uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot; + maxSlot = std::max(maxSlot, mapSlot); + uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot; + + pa.Assemble_simd16(inputSlot, tmpVector); + + // if constant interpolation enabled for this attribute, assign the provoking + // vertex values to all edges + if (_bittest(&constantInterpMask, slot)) + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[inputSlot] = tmpVector[provokingVertex]; + } + } + else + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[inputSlot] = tmpVector[i]; + } + } } - if (clipMask) + // assemble user clip distances if enabled + if (this->state.rastState.clipDistanceMask & 0xf) { - AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); - // we have to clip tris, execute the clipper, which will also - // call the binner - ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx); - AR_END(FEGuardbandClip, 1); + pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i]; + } } - else if (validMask) - { - // update CPrimitives pipeline state - UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); - // forward valid prims directly to binner - pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx); + if (this->state.rastState.clipDistanceMask & 0xf0) + { + pa.Assemble_simd16(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i]; + } } - } -private: - inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1) - { - return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1)); - } + uint32_t numAttribs = maxSlot + 1; - inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component) - { - const uint32_t simdVertexStride = sizeof(simdvertex); - const uint32_t componentStride = sizeof(simdscalar); - const uint32_t attribStride = sizeof(simdvector); - const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), - 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); + simd16scalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); - // step to the simdvertex - simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride)); + // set up new PA for binning clipped primitives + PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc = nullptr; + PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; + if (NumVertsPerPrim == 3) + { + pfnBinFunc = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0)); + clipTopology = TOP_TRIANGLE_FAN; + + // so that the binner knows to bloat wide points later + if (pa.binTopology == TOP_POINT_LIST) + clipTopology = TOP_POINT_LIST; + + } + else if (NumVertsPerPrim == 2) + { + pfnBinFunc = BinLines_simd16; + clipTopology = TOP_LINE_LIST; + } + else + { + SWR_ASSERT(0 && "Unexpected points in clipper."); + } + + uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; + uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; + uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx; + + const simdscalari vOffsets = _simd_set_epi32( + 0 * sizeof(simd16vertex), // unused lane + 6 * sizeof(simd16vertex), + 5 * sizeof(simd16vertex), + 4 * sizeof(simd16vertex), + 3 * sizeof(simd16vertex), + 2 * sizeof(simd16vertex), + 1 * sizeof(simd16vertex), + 0 * sizeof(simd16vertex)); + + // only need to gather 7 verts + // @todo dynamic mask based on actual # of verts generated per lane + const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1); + + uint32_t numClippedPrims = 0; + + // tranpose clipper output so that each lane's vertices are in SIMD order + // set aside space for 2 vertices, as the PA will try to read up to 16 verts + // for triangle fan + +#if defined(_DEBUG) + // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds + simd16vertex *transposedPrims = reinterpret_cast(malloc(sizeof(simd16vertex) * 2)); + +#else + simd16vertex transposedPrims[2]; + +#endif + for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) + { + uint32_t numEmittedVerts = pVertexCount[inputPrim]; + if (numEmittedVerts < NumVertsPerPrim) + { + continue; + } + SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper."); + + uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts); + numClippedPrims += numEmittedPrims; + + // tranpose clipper output so that each lane's vertices are in SIMD order + // set aside space for 2 vertices, as the PA will try to read up to 16 verts + // for triangle fan + + // transpose pos + uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; + +#if 0 + // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use dx11_clipping_03-09 failures to check for existence of bug + static const float *dummy = reinterpret_cast(pBase); +#endif + + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); + transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); + pBase += sizeof(simd16scalar); + } + + // transpose attribs + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); + transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); + pBase += sizeof(simd16scalar); + } + } + + // transpose user clip distances if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); + pBase += sizeof(simd16scalar); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); + pBase += sizeof(simd16scalar); + } + } + + PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); + + while (clipPa.GetNextStreamOutput()) + { + do + { + simd16vector attrib[NumVertsPerPrim]; + bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib); + + if (assemble) + { + static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff }; + + clipPa.useAlternateOffset = false; + pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]), _simd16_set1_epi32(pViewportIdx[inputPrim])); + } + + } while (clipPa.NextPrim()); + } + } + +#if defined(_DEBUG) + free(transposedPrims); + +#endif + // update global pipeline stat + UPDATE_STAT_FE(CPrimitives, numClippedPrims); + } + +#endif + // execute the clipper stage + void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) + { + SWR_ASSERT(pa.pDC != nullptr); + SWR_CONTEXT* pContext = pa.pDC->pContext; + + // set up binner based on PA state + PFN_PROCESS_PRIMS pfnBinner; + switch (pa.binTopology) + { + case TOP_POINT_LIST: + pfnBinner = BinPoints; + break; + case TOP_LINE_LIST: + case TOP_LINE_STRIP: + case TOP_LINE_LOOP: + case TOP_LINE_LIST_ADJ: + case TOP_LISTSTRIP_ADJ: + pfnBinner = BinLines; + break; + default: + pfnBinner = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0)); + break; + }; + + // update clipper invocations pipeline stat + uint32_t numInvoc = _mm_popcnt_u32(primMask); + UPDATE_STAT_FE(CInvocations, numInvoc); + + ComputeClipCodes(prim, viewportIdx); + + // cull prims with NAN coords + primMask &= ~ComputeNaNMask(prim); + + // user cull distance cull + if (this->state.rastState.cullDistanceMask) + { + primMask &= ~ComputeUserClipCullMask(pa, prim); + } + + // cull prims outside view frustum + simdscalar clipIntersection = ComputeClipCodeIntersection(); + int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps())); + + // skip clipping for points + uint32_t clipMask = 0; + if (NumVertsPerPrim != 1) + { + clipMask = primMask & ComputeClipMask(); + } + + if (clipMask) + { + AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); + // we have to clip tris, execute the clipper, which will also + // call the binner + ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx); + AR_END(FEGuardbandClip, 1); + } + else if (validMask) + { + // update CPrimitives pipeline state + UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); + + // forward valid prims directly to binner + pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx); + } + } + +#if USE_SIMD16_FRONTEND + void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx) + { + SWR_ASSERT(pa.pDC != nullptr); + SWR_CONTEXT* pContext = pa.pDC->pContext; + + // set up binner based on PA state + PFN_PROCESS_PRIMS_SIMD16 pfnBinner; + switch (pa.binTopology) + { + case TOP_POINT_LIST: + pfnBinner = BinPoints_simd16; + break; + case TOP_LINE_LIST: + case TOP_LINE_STRIP: + case TOP_LINE_LOOP: + case TOP_LINE_LIST_ADJ: + case TOP_LISTSTRIP_ADJ: + pfnBinner = BinLines_simd16; + break; + default: + pfnBinner = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0)); + break; + }; + + // update clipper invocations pipeline stat + uint32_t numInvoc = _mm_popcnt_u32(primMask); + UPDATE_STAT_FE(CInvocations, numInvoc); + + ComputeClipCodes(prim, viewportIdx); + + // cull prims with NAN coords + primMask &= ~ComputeNaNMask(prim); + + // user cull distance cull + if (this->state.rastState.cullDistanceMask) + { + primMask &= ~ComputeUserClipCullMask(pa, prim); + } + + // cull prims outside view frustum + simd16scalar clipIntersection = ComputeClipCodeIntersection_simd16(); + int validMask = primMask & _simd16_movemask_ps(_simd16_cmpeq_ps(clipIntersection, _simd16_setzero_ps())); + + // skip clipping for points + uint32_t clipMask = 0; + if (NumVertsPerPrim != 1) + { + clipMask = primMask & ComputeClipMask_simd16(); + } + + if (clipMask) + { + AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); + // we have to clip tris, execute the clipper, which will also + // call the binner + ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId, viewportIdx); + AR_END(FEGuardbandClip, 1); + } + else if (validMask) + { + // update CPrimitives pipeline state + UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); + + // forward valid prims directly to binner + pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx); + } + } + +#endif +private: + inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1) + { + return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1)); + } + +#if USE_SIMD16_FRONTEND + inline simd16scalar ComputeInterpFactor(simd16scalar boundaryCoord0, simd16scalar boundaryCoord1) + { + return _simd16_div_ps(boundaryCoord0, _simd16_sub_ps(boundaryCoord0, boundaryCoord1)); + } + +#endif + inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component) + { + const uint32_t simdVertexStride = sizeof(simdvertex); + const uint32_t componentStride = sizeof(simdscalar); + const uint32_t attribStride = sizeof(simdvector); + const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), + 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); + + // step to the simdvertex + simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride)); // step to the attribute and component vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component)); @@ -611,6 +1116,31 @@ private: return vOffsets; } +#if USE_SIMD16_FRONTEND + inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari vIndices, uint32_t component) + { + const uint32_t simdVertexStride = sizeof(simd16vertex); + const uint32_t componentStride = sizeof(simd16scalar); + const uint32_t attribStride = sizeof(simd16vector); + const simd16scalari vElemOffset = _simd16_set_epi32( + 15 * sizeof(float), 14 * sizeof(float), 13 * sizeof(float), 12 * sizeof(float), + 11 * sizeof(float), 10 * sizeof(float), 9 * sizeof(float), 8 * sizeof(float), + 7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), + 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); + + // step to the simdvertex + simd16scalari vOffsets = _simd16_mullo_epi32(vIndices, _simd16_set1_epi32(simdVertexStride)); + + // step to the attribute and component + vOffsets = _simd16_add_epi32(vOffsets, _simd16_set1_epi32(attribStride * attrib + componentStride * component)); + + // step to the lane + vOffsets = _simd16_add_epi32(vOffsets, vElemOffset); + + return vOffsets; + } + +#endif // gathers a single component for a given attribute for each SIMD lane inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component) { @@ -619,13 +1149,39 @@ private: return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1); } - inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc) +#if USE_SIMD16_FRONTEND + inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component) + { + simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component); + simd16scalar vSrc = _simd16_setzero_ps(); + return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, _simd16_castps_si(vMask), 1); + } + +#endif + inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc) + { + simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); + + uint32_t* pOffsets = (uint32_t*)&vOffsets; + float* pSrc = (float*)&vSrc; + uint32_t mask = _simd_movemask_ps(vMask); + DWORD lane; + while (_BitScanForward(&lane, mask)) + { + mask &= ~(1 << lane); + uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane]; + *(float*)pBuf = pSrc[lane]; + } + } + +#if USE_SIMD16_FRONTEND + inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component, simd16scalar vSrc) { - simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); + simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component); uint32_t* pOffsets = (uint32_t*)&vOffsets; float* pSrc = (float*)&vSrc; - uint32_t mask = _simd_movemask_ps(vMask); + uint32_t mask = _simd16_movemask_ps(vMask); DWORD lane; while (_BitScanForward(&lane, mask)) { @@ -635,6 +1191,7 @@ private: } } +#endif template inline void intersect( const simdscalar& vActiveMask, // active lanes to operate on @@ -716,6 +1273,89 @@ private: } } +#if USE_SIMD16_FRONTEND + template + inline void intersect( + const simd16scalar& vActiveMask,// active lanes to operate on + const simd16scalari& s, // index to first edge vertex v0 in pInPts. + const simd16scalari& p, // index to second edge vertex v1 in pInPts. + const simd16vector& v1, // vertex 0 position + const simd16vector& v2, // vertex 1 position + simd16scalari& outIndex, // output index. + const float *pInVerts, // array of all the input positions. + uint32_t numInAttribs, // number of attributes per vertex. + float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. + { + // compute interpolation factor + simd16scalar t; + switch (ClippingPlane) + { + case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[0]), _simd16_add_ps(v2[3], v2[0])); break; + case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[0]), _simd16_sub_ps(v2[3], v2[0])); break; + case FRUSTUM_TOP: t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[1]), _simd16_add_ps(v2[3], v2[1])); break; + case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[1]), _simd16_sub_ps(v2[3], v2[1])); break; + case FRUSTUM_NEAR: + // DX Znear plane is 0, GL is -w + if (this->state.rastState.clipHalfZ) + { + t = ComputeInterpFactor(v1[2], v2[2]); + } + else + { + t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[2]), _simd16_add_ps(v2[3], v2[2])); + } + break; + case FRUSTUM_FAR: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[2]), _simd16_sub_ps(v2[3], v2[2])); break; + default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane); + }; + + // interpolate position and store + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vOutPos = _simd16_fmadd_ps(_simd16_sub_ps(v2[c], v1[c]), t, v1[c]); + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } + + // interpolate clip distance if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } + } + +#endif template inline simdscalar inside(const simdvector& v) { @@ -733,6 +1373,25 @@ private: } } +#if USE_SIMD16_FRONTEND + template + inline simd16scalar inside(const simd16vector& v) + { + switch (ClippingPlane) + { + case FRUSTUM_LEFT: return _simd16_cmpge_ps(v[0], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f))); + case FRUSTUM_RIGHT: return _simd16_cmple_ps(v[0], v[3]); + case FRUSTUM_TOP: return _simd16_cmpge_ps(v[1], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f))); + case FRUSTUM_BOTTOM: return _simd16_cmple_ps(v[1], v[3]); + case FRUSTUM_NEAR: return _simd16_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd16_setzero_ps() : _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f))); + case FRUSTUM_FAR: return _simd16_cmple_ps(v[2], v[3]); + default: + SWR_INVALID("invalid clipping plane: %d", ClippingPlane); + return _simd16_setzero_ps(); + } + } + +#endif template simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) { @@ -826,6 +1485,101 @@ private: return vOutIndex; } +#if USE_SIMD16_FRONTEND + template + simd16scalari ClipTriToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) + { + simd16scalari vCurIndex = _simd16_setzero_si(); + simd16scalari vOutIndex = _simd16_setzero_si(); + simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts)); + + while (!_simd16_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty + { + simd16scalari s = vCurIndex; + simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1)); + simd16scalari underFlowMask = _simd16_cmpgt_epi32(vNumInPts, p); + p = _simd16_castps_si(_simd16_blendv_ps(_simd16_setzero_ps(), _simd16_castsi_ps(p), _simd16_castsi_ps(underFlowMask))); + + // gather position + simd16vector vInPos0, vInPos1; + for (uint32_t c = 0; c < 4; ++c) + { + vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); + vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); + } + + // compute inside mask + simd16scalar s_in = inside(vInPos0); + simd16scalar p_in = inside(vInPos1); + + // compute intersection mask (s_in != p_in) + simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in); + intersectMask = _simd16_and_ps(intersectMask, vActiveMask); + + // store s if inside + s_in = _simd16_and_ps(s_in, vActiveMask); + if (!_simd16_testz_ps(s_in, s_in)) + { + // store position + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); + } + + // store attribs + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + // store clip distance if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in); + } + + // compute and store intersection + if (!_simd16_testz_ps(intersectMask, intersectMask)) + { + intersect(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); + + // increment outIndex for active lanes + vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask); + } + + // increment loop index and update active mask + vCurIndex = _simd16_add_epi32(vCurIndex, _simd16_set1_epi32(1)); + vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts)); + } + + return vOutIndex; + } + +#endif template simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) { @@ -915,6 +1669,96 @@ private: return vOutIndex; } +#if USE_SIMD16_FRONTEND + template + simd16scalari ClipLineToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) + { + simd16scalari vCurIndex = _simd16_setzero_si(); + simd16scalari vOutIndex = _simd16_setzero_si(); + simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts)); + + if (!_simd16_testz_ps(vActiveMask, vActiveMask)) + { + simd16scalari s = vCurIndex; + simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1)); + + // gather position + simd16vector vInPos0, vInPos1; + for (uint32_t c = 0; c < 4; ++c) + { + vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); + vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); + } + + // compute inside mask + simd16scalar s_in = inside(vInPos0); + simd16scalar p_in = inside(vInPos1); + + // compute intersection mask (s_in != p_in) + simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in); + intersectMask = _simd16_and_ps(intersectMask, vActiveMask); + + // store s if inside + s_in = _simd16_and_ps(s_in, vActiveMask); + if (!_simd16_testz_ps(s_in, s_in)) + { + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in); + } + + // compute and store intersection + if (!_simd16_testz_ps(intersectMask, intersectMask)) + { + intersect(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); + + // increment outIndex for active lanes + vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask); + } + + // store p if inside + p_in = _simd16_and_ps(p_in, vActiveMask); + if (!_simd16_testz_ps(p_in, p_in)) + { + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); + ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), p_in); + } + } + + return vOutIndex; + } +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Vertical clipper. Clips SIMD primitives at a time /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer @@ -958,10 +1802,53 @@ private: return vNumOutPts; } +#if USE_SIMD16_FRONTEND + simd16scalari ClipPrims(float* pVertices, const simd16scalar& vPrimMask, const simd16scalar& vClipMask, int numAttribs) + { + // temp storage + float* pTempVerts = (float*)&tlsTempVertices_simd16[0]; + + // zero out num input verts for non-active lanes + simd16scalari vNumInPts = _simd16_set1_epi32(NumVertsPerPrim); + vNumInPts = _simd16_blendv_epi32(_simd16_setzero_si(), vNumInPts, vClipMask); + + // clip prims to frustum + simd16scalari vNumOutPts; + if (NumVertsPerPrim == 3) + { + vNumOutPts = ClipTriToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipTriToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipTriToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); + } + else + { + SWR_ASSERT(NumVertsPerPrim == 2); + vNumOutPts = ClipLineToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipLineToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipLineToPlane(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane(pTempVerts, vNumOutPts, numAttribs, pVertices); + } + + // restore num verts for non-clipped, active lanes + simd16scalar vNonClippedMask = _simd16_andnot_ps(vClipMask, vPrimMask); + vNumOutPts = _simd16_blendv_epi32(vNumOutPts, _simd16_set1_epi32(NumVertsPerPrim), vNonClippedMask); + + return vNumOutPts; + } + +#endif const uint32_t workerId{ 0 }; DRAW_CONTEXT* pDC{ nullptr }; const API_STATE& state; simdscalar clipCodes[NumVertsPerPrim]; +#if USE_SIMD16_FRONTEND + simd16scalar clipCodes_simd16[NumVertsPerPrim]; +#endif }; diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index e880ead71dc..938bc9855d1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -208,14 +208,34 @@ void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari v { // refer to calcDeterminantInt comment for calculation explanation // A1*B2 - simd16scalari vA1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 - simd16scalari vA1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 + +#if 1 + // TODO: get the native SIMD16 version working.. + + simdscalari vA_lo[3]; + simdscalari vA_hi[3]; + simdscalari vB_lo[3]; + simdscalari vB_hi[3]; + + for (uint32_t i = 0; i < 3; i += 1) + { + vA_lo[i] = _simd16_extract_si(vA[i], 0); + vA_hi[i] = _simd16_extract_si(vA[i], 1); + vB_lo[i] = _simd16_extract_si(vB[i], 0); + vB_hi[i] = _simd16_extract_si(vB[i], 1); + } + + calcDeterminantIntVertical(vA_lo, vB_lo, reinterpret_cast(&pvDet[0])); + calcDeterminantIntVertical(vA_hi, vB_hi, reinterpret_cast(&pvDet[1])); +#else + simd16scalari vA1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 8 8 9 9 C C D D + simd16scalari vA1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 A A B B E E F F simd16scalari vB2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]); simd16scalari vB2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]); - simd16scalari vA1B2Lo = _simd16_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 - simd16scalari vA1B2Hi = _simd16_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 + simd16scalari vA1B2Lo = _simd16_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 8 9 C D + simd16scalari vA1B2Hi = _simd16_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 A B E F // B1*A2 simd16scalari vA2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]); @@ -237,6 +257,7 @@ void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari v pvDet[0] = vResultLo; pvDet[1] = vResultHi; +#endif } #endif -- 2.30.2