From: George Kyriazis Date: Wed, 7 Feb 2018 22:51:41 +0000 (-0600) Subject: swr/rast: Make SIMDLib templated types easier to use X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=37ebf86add77ebd5a3640904a09fe990b7a4e5c7;p=mesa.git swr/rast: Make SIMDLib templated types easier to use "typename SIMD_T::TypeName" --> "TypeName" Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp index 500cf8a87e3..4114645d92e 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp @@ -571,3 +571,12 @@ struct SIMDBase : Traits::IsaImpl using SIMD128 = SIMDBase; using SIMD256 = SIMDBase; using SIMD512 = SIMDBase; + +template using CompareType = typename SIMD_T::CompareType; +template using ScaleFactor = typename SIMD_T::ScaleFactor; +template using RoundMode = typename SIMD_T::RoundMode; +template using Float = typename SIMD_T::Float; +template using Double = typename SIMD_T::Double; +template using Integer = typename SIMD_T::Integer; +template using Vec4 = typename SIMD_T::Vec4; +template using Mask = typename SIMD_T::Mask; diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 8447bc4dc29..3b093cefc04 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -41,23 +41,23 @@ void BinPostSetupLinesImpl( DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, - typename SIMD_T::Vec4 prim[], - typename SIMD_T::Float recipW[], + Vec4 prim[], + Float recipW[], uint32_t primMask, - typename SIMD_T::Integer const &primID, - typename SIMD_T::Integer const &viewportIdx, - typename SIMD_T::Integer const &rtIdx); + Integer const &primID, + Integer const &viewportIdx, + Integer const &rtIdx); template void BinPostSetupPointsImpl( DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, - typename SIMD_T::Vec4 prim[], + Vec4 prim[], uint32_t primMask, - typename SIMD_T::Integer const &primID, - typename SIMD_T::Integer const &viewportIdx, - typename SIMD_T::Integer const &rtIdx); + Integer const &primID, + Integer const &viewportIdx, + Integer const &rtIdx); ////////////////////////////////////////////////////////////////////////// /// @brief Processes attributes for the backend based on linkage mask and @@ -327,34 +327,34 @@ struct EarlyRastHelper template uint32_t SIMDCALL EarlyRasterizer( SIMDBBOX_T &er_bbox, - typename SIMD_T::Integer (&vAi)[3], - typename SIMD_T::Integer (&vBi)[3], - typename SIMD_T::Integer (&vXi)[3], - typename SIMD_T::Integer (&vYi)[3], + Integer (&vAi)[3], + Integer (&vBi)[3], + Integer (&vXi)[3], + Integer (&vYi)[3], uint32_t cwTrisMask, uint32_t triMask, uint32_t oneTileMask) { // step to pixel center of top-left pixel of the triangle bbox - typename SIMD_T::Integer vTopLeftX = SIMD_T::template slli_epi32(er_bbox.xmin); + Integer vTopLeftX = SIMD_T::template slli_epi32(er_bbox.xmin); vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2)); - typename SIMD_T::Integer vTopLeftY = SIMD_T::template slli_epi32(er_bbox.ymin); + Integer vTopLeftY = SIMD_T::template slli_epi32(er_bbox.ymin); vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2)); // negate A and B for CW tris - typename SIMD_T::Integer vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1)); - typename SIMD_T::Integer vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1)); - typename SIMD_T::Integer vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1)); - typename SIMD_T::Integer vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1)); - typename SIMD_T::Integer vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1)); - typename SIMD_T::Integer vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1)); + Integer vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1)); + Integer vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1)); + Integer vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1)); + Integer vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1)); + Integer vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1)); + Integer vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1)); RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0); - typename SIMD_T::Integer vShiftCntrl = EarlyRastHelper ::InitShiftCntrl(); - typename SIMD_T::Integer vCwTris = SIMD_T::set1_epi32(cwTrisMask); - typename SIMD_T::Integer vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl); + Integer vShiftCntrl = EarlyRastHelper ::InitShiftCntrl(); + Integer vCwTris = SIMD_T::set1_epi32(cwTrisMask); + Integer vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl); vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask))); vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask))); @@ -364,34 +364,34 @@ uint32_t SIMDCALL EarlyRasterizer( vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask))); // evaluate edge equations at top-left pixel - typename SIMD_T::Integer vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]); - typename SIMD_T::Integer vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]); - typename SIMD_T::Integer vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]); + Integer vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]); + Integer vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]); + Integer vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]); - typename SIMD_T::Integer vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]); - typename SIMD_T::Integer vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]); - typename SIMD_T::Integer vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]); + Integer vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]); + Integer vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]); + Integer vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]); - typename SIMD_T::Integer vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0); - typename SIMD_T::Integer vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1); - typename SIMD_T::Integer vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2); + Integer vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0); + Integer vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1); + Integer vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2); - typename SIMD_T::Integer vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0); - typename SIMD_T::Integer vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1); - typename SIMD_T::Integer vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2); + Integer vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0); + Integer vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1); + Integer vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2); - typename SIMD_T::Integer vEdge0 = SIMD_T::add_epi32(vAX0, vBY0); - typename SIMD_T::Integer vEdge1 = SIMD_T::add_epi32(vAX1, vBY1); - typename SIMD_T::Integer vEdge2 = SIMD_T::add_epi32(vAX2, vBY2); + Integer vEdge0 = SIMD_T::add_epi32(vAX0, vBY0); + Integer vEdge1 = SIMD_T::add_epi32(vAX1, vBY1); + Integer vEdge2 = SIMD_T::add_epi32(vAX2, vBY2); vEdge0 = SIMD_T::template srai_epi32(vEdge0); vEdge1 = SIMD_T::template srai_epi32(vEdge1); vEdge2 = SIMD_T::template srai_epi32(vEdge2); // top left rule - typename SIMD_T::Integer vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1)); - typename SIMD_T::Integer vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1)); - typename SIMD_T::Integer vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1)); + Integer vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1)); + Integer vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1)); + Integer vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1)); // vA < 0 vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0]))); @@ -399,9 +399,9 @@ uint32_t SIMDCALL EarlyRasterizer( vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2]))); // vA == 0 && vB < 0 - typename SIMD_T::Integer vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si()); - typename SIMD_T::Integer vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si()); - typename SIMD_T::Integer vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si()); + Integer vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si()); + Integer vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si()); + Integer vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si()); vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]); vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]); @@ -415,28 +415,28 @@ uint32_t SIMDCALL EarlyRasterizer( #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4 // Go down // coverage pixel 0 - typename SIMD_T::Integer vMask0 = SIMD_T::and_si(vEdge0, vEdge1); + Integer vMask0 = SIMD_T::and_si(vEdge0, vEdge1); vMask0 = SIMD_T::and_si(vMask0, vEdge2); // coverage pixel 1 - typename SIMD_T::Integer vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]); - typename SIMD_T::Integer vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]); - typename SIMD_T::Integer vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]); - typename SIMD_T::Integer vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N); + Integer vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]); + Integer vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]); + Integer vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]); + Integer vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N); vMask1 = SIMD_T::and_si(vMask1, vEdge2N); // coverage pixel 2 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - typename SIMD_T::Integer vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N); + Integer vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N); vMask2 = SIMD_T::and_si(vMask2, vEdge2N); // coverage pixel 3 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]); - typename SIMD_T::Integer vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N); + Integer vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N); vMask3 = SIMD_T::and_si(vMask3, vEdge2N); // One step to the right and then up @@ -445,31 +445,31 @@ uint32_t SIMDCALL EarlyRasterizer( vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]); vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]); vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]); - typename SIMD_T::Integer vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N); + Integer vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N); vMask4 = SIMD_T::and_si(vMask4, vEdge2N); // coverage pixel 5 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - typename SIMD_T::Integer vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N); + Integer vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N); vMask5 = SIMD_T::and_si(vMask5, vEdge2N); // coverage pixel 6 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - typename SIMD_T::Integer vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N); + Integer vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N); vMask6 = SIMD_T::and_si(vMask6, vEdge2N); // coverage pixel 7 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]); vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]); vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]); - typename SIMD_T::Integer vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N); + Integer vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N); vMask7 = SIMD_T::and_si(vMask7, vEdge2N); - typename SIMD_T::Integer vLit1 = SIMD_T::or_si(vMask0, vMask1); + Integer vLit1 = SIMD_T::or_si(vMask0, vMask1); vLit1 = SIMD_T::or_si(vLit1, vMask2); vLit1 = SIMD_T::or_si(vLit1, vMask3); vLit1 = SIMD_T::or_si(vLit1, vMask4); @@ -537,7 +537,7 @@ uint32_t SIMDCALL EarlyRasterizer( vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N); vMask7 = SIMD_T::and_si(vMask7, vEdge2N); - typename SIMD_T::Integer vLit2 = SIMD_T::or_si(vMask0, vMask1); + Integer vLit2 = SIMD_T::or_si(vMask0, vMask1); vLit2 = SIMD_T::or_si(vLit2, vMask2); vLit2 = SIMD_T::or_si(vLit2, vMask3); vLit2 = SIMD_T::or_si(vLit2, vMask4); @@ -545,24 +545,24 @@ uint32_t SIMDCALL EarlyRasterizer( vLit2 = SIMD_T::or_si(vLit2, vMask6); vLit2 = SIMD_T::or_si(vLit2, vMask7); - typename SIMD_T::Integer vLit = SIMD_T::or_si(vLit1, vLit2); + Integer vLit = SIMD_T::or_si(vLit1, vLit2); #else // Generic algorithm sweeping in row by row order - typename SIMD_T::Integer vRowMask[ER_SIMD_TILE_Y_DIM]; + Integer vRowMask[ER_SIMD_TILE_Y_DIM]; - typename SIMD_T::Integer vEdge0N = vEdge0; - typename SIMD_T::Integer vEdge1N = vEdge1; - typename SIMD_T::Integer vEdge2N = vEdge2; + Integer vEdge0N = vEdge0; + Integer vEdge1N = vEdge1; + Integer vEdge2N = vEdge2; for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++) { // Store edge values at the beginning of the row - typename SIMD_T::Integer vRowEdge0 = vEdge0N; - typename SIMD_T::Integer vRowEdge1 = vEdge1N; - typename SIMD_T::Integer vRowEdge2 = vEdge2N; + Integer vRowEdge0 = vEdge0N; + Integer vRowEdge1 = vEdge1N; + Integer vRowEdge2 = vEdge2N; - typename SIMD_T::Integer vColMask[ER_SIMD_TILE_X_DIM]; + Integer vColMask[ER_SIMD_TILE_X_DIM]; for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++) { @@ -589,7 +589,7 @@ uint32_t SIMDCALL EarlyRasterizer( } // compress all masks - typename SIMD_T::Integer vLit = vRowMask[0]; + Integer vLit = vRowMask[0]; for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++) { vLit = SIMD_T::or_si(vLit, vRowMask[row]); @@ -627,11 +627,11 @@ void SIMDCALL BinTrianglesImpl( DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, - typename SIMD_T::Vec4 tri[3], + Vec4 tri[3], uint32_t triMask, - typename SIMD_T::Integer const &primID, - typename SIMD_T::Integer const &viewportIdx, - typename SIMD_T::Integer const &rtIdx) + Integer const &primID, + Integer const &viewportIdx, + Integer const &rtIdx) { const uint32_t *aRTAI = reinterpret_cast(&rtIdx); @@ -643,9 +643,9 @@ void SIMDCALL BinTrianglesImpl( MacroTileMgr *pTileMgr = pDC->pTileMgr; - typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f); - typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f); - typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f); + Float vRecipW0 = SIMD_T::set1_ps(1.0f); + Float vRecipW1 = SIMD_T::set1_ps(1.0f); + Float vRecipW2 = SIMD_T::set1_ps(1.0f); if (feState.vpTransformDisable) { @@ -685,7 +685,7 @@ void SIMDCALL BinTrianglesImpl( } // Adjust for pixel center location - typename SIMD_T::Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); + Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); tri[0].x = SIMD_T::add_ps(tri[0].x, offset); tri[0].y = SIMD_T::add_ps(tri[0].y, offset); @@ -697,15 +697,15 @@ void SIMDCALL BinTrianglesImpl( tri[2].y = SIMD_T::add_ps(tri[2].y, offset); // Set vXi, vYi to required fixed point precision - typename SIMD_T::Integer vXi[3], vYi[3]; + Integer vXi[3], vYi[3]; FPToFixedPoint(tri, vXi, vYi); // triangle setup - typename SIMD_T::Integer vAi[3], vBi[3]; + Integer vAi[3], vBi[3]; triangleSetupABIntVertical(vXi, vYi, vAi, vBi); // determinant - typename SIMD_T::Integer vDet[2]; + Integer vDet[2]; calcDeterminantIntVertical(vAi, vBi, vDet); // cull zero area @@ -774,14 +774,14 @@ void SIMDCALL BinTrianglesImpl( if (cullZeroAreaMask > 0) { // e0 = v1-v0 - const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]); - const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]); + const Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]); + const Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]); uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask))); // e1 = v2-v1 - const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]); - const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]); + const Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]); + const Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]); uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask))); @@ -836,19 +836,19 @@ void SIMDCALL BinTrianglesImpl( int cullCenterMask; { - typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127)); + Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127)); xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255)); - typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128)); + Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128)); xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255)); - typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax); + Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax); - typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127)); + Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127)); ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255)); - typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128)); + Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128)); ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255)); - typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax); + Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax); vMaskV = SIMD_T::or_si(vMaskH, vMaskV); cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV)); @@ -866,7 +866,7 @@ void SIMDCALL BinTrianglesImpl( // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. { - typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax; + Integer scisXmin, scisYmin, scisXmax, scisYmax; if (pa.viewportArrayActive) { @@ -895,18 +895,18 @@ void SIMDCALL BinTrianglesImpl( // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has // some area. Bump the xmax/ymax edges out - typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax); + Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax); bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom); - typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax); + Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax); bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight); } // Cull tris completely outside scissor { - typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); - typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); + Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); + Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); triMask = triMask & ~maskOutsideScissor; } @@ -924,8 +924,8 @@ void SIMDCALL BinTrianglesImpl( er_bbox.ymin = SIMD_T::template srai_epi32(bbox.ymin); er_bbox.ymax = SIMD_T::template srai_epi32(bbox.ymax); - typename SIMD_T::Integer vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax); - typename SIMD_T::Integer vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax); + Integer vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax); + Integer vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax); // Take only triangles that fit into ER tile uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY))); @@ -958,8 +958,8 @@ endBinTriangles: { // Simple non-conformant wireframe mode, useful for debugging // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD - typename SIMD_T::Vec4 line[2]; - typename SIMD_T::Float recipW[2]; + Vec4 line[2]; + Float recipW[2]; line[0] = tri[0]; line[1] = tri[1]; @@ -1004,10 +1004,10 @@ endBinTriangles: OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax); + SIMD_T::store_si(reinterpret_cast *>(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast *>(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast *>(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast *>(aMTBottom), bbox.ymax); // transpose verts needed for backend /// @todo modify BE to take non-transformed verts @@ -1173,15 +1173,15 @@ void BinPostSetupPointsImpl( DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, - typename SIMD_T::Vec4 prim[], + Vec4 prim[], uint32_t primMask, - typename SIMD_T::Integer const &primID, - typename SIMD_T::Integer const &viewportIdx, - typename SIMD_T::Integer const &rtIdx) + Integer const &primID, + Integer const &viewportIdx, + Integer const &rtIdx) { RDTSC_BEGIN(FEBinPoints, pDC->drawId); - typename SIMD_T::Vec4 &primVerts = prim[0]; + Vec4 &primVerts = prim[0]; const API_STATE& state = GetApiState(pDC); const SWR_RASTSTATE& rastState = state.rastState; @@ -1192,7 +1192,7 @@ void BinPostSetupPointsImpl( state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); // convert to fixed point - typename SIMD_T::Integer vXi, vYi; + Integer vXi, vYi; vXi = fpToFixedPointVertical(primVerts.x); vYi = fpToFixedPointVertical(primVerts.y); @@ -1208,36 +1208,36 @@ void BinPostSetupPointsImpl( primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi)); // compute macro tile coordinates - typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32(vXi); - typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32(vYi); + Integer macroX = SIMD_T::template srai_epi32(vXi); + Integer macroY = SIMD_T::template srai_epi32(vYi); OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast(aMacroX), macroX); - SIMD_T::store_si(reinterpret_cast(aMacroY), macroY); + SIMD_T::store_si(reinterpret_cast *>(aMacroX), macroX); + SIMD_T::store_si(reinterpret_cast *>(aMacroY), macroY); // compute raster tile coordinates - typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32(vXi); - typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32(vYi); + Integer rasterX = SIMD_T::template srai_epi32(vXi); + Integer rasterY = SIMD_T::template srai_epi32(vYi); // compute raster tile relative x,y for coverage mask - typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32(rasterX); - typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32(rasterY); + Integer tileAlignedX = SIMD_T::template slli_epi32(rasterX); + Integer tileAlignedY = SIMD_T::template slli_epi32(rasterY); - typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32(vXi), tileAlignedX); - typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32(vYi), tileAlignedY); + Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32(vXi), tileAlignedX); + Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32(vYi), tileAlignedY); OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH]; OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast(aTileRelativeX), tileRelativeX); - SIMD_T::store_si(reinterpret_cast(aTileRelativeY), tileRelativeY); + SIMD_T::store_si(reinterpret_cast *>(aTileRelativeX), tileRelativeX); + SIMD_T::store_si(reinterpret_cast *>(aTileRelativeY), tileRelativeY); OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH]; OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast(aTileAlignedX), tileAlignedX); - SIMD_T::store_si(reinterpret_cast(aTileAlignedY), tileAlignedY); + SIMD_T::store_si(reinterpret_cast *>(aTileAlignedX), tileAlignedX); + SIMD_T::store_si(reinterpret_cast *>(aTileAlignedY), tileAlignedY); OSALIGNSIMD16(float) aZ[SIMD_WIDTH]; SIMD_T::store_ps(reinterpret_cast(aZ), primVerts.z); @@ -1307,11 +1307,11 @@ void BinPostSetupPointsImpl( else { // non simple points need to be potentially binned to multiple macro tiles - typename SIMD_T::Float vPointSize; + Float vPointSize; if (rastState.pointParam) { - typename SIMD_T::Vec4 size[3]; + Vec4 size[3]; pa.Assemble(VERTEX_SGV_SLOT, size); vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP]; } @@ -1326,8 +1326,8 @@ void BinPostSetupPointsImpl( bbox.xmin = bbox.xmax = vXi; bbox.ymin = bbox.ymax = vYi; - typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f)); - typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f)); + Integer vHalfWidthi = fpToFixedPointVertical(vHalfWidth); bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi); bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi); @@ -1338,7 +1338,7 @@ void BinPostSetupPointsImpl( // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. { - typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax; + Integer scisXmin, scisYmin, scisXmax, scisYmax; if (pa.viewportArrayActive) { @@ -1359,9 +1359,9 @@ void BinPostSetupPointsImpl( } // Cull bloated points completely outside scissor - typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); - typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); + Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); + Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); primMask = primMask & ~maskOutsideScissor; @@ -1373,10 +1373,10 @@ void BinPostSetupPointsImpl( OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax); + SIMD_T::store_si(reinterpret_cast *>(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast *>(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast *>(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast *>(aMTBottom), bbox.ymax); // store render target array index const uint32_t *aRTAI = reinterpret_cast(&rtIdx); @@ -1477,11 +1477,11 @@ void BinPointsImpl( DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, - typename SIMD_T::Vec4 prim[3], + Vec4 prim[3], uint32_t primMask, - typename SIMD_T::Integer const &primID, - typename SIMD_T::Integer const &viewportIdx, - typename SIMD_T::Integer const &rtIdx) + Integer const &primID, + Integer const &viewportIdx, + Integer const &rtIdx) { const API_STATE& state = GetApiState(pDC); const SWR_FRONTEND_STATE& feState = state.frontendState; @@ -1490,7 +1490,7 @@ void BinPointsImpl( if (!feState.vpTransformDisable) { // perspective divide - typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w); + Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w); prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0); prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0); @@ -1507,7 +1507,7 @@ void BinPointsImpl( } } - typename SIMD_T::Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); + Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); prim[0].x = SIMD_T::add_ps(prim[0].x, offset); prim[0].y = SIMD_T::add_ps(prim[0].y, offset); @@ -1580,12 +1580,12 @@ void BinPostSetupLinesImpl( DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, - typename SIMD_T::Vec4 prim[], - typename SIMD_T::Float recipW[], + Vec4 prim[], + Float recipW[], uint32_t primMask, - typename SIMD_T::Integer const &primID, - typename SIMD_T::Integer const &viewportIdx, - typename SIMD_T::Integer const &rtIdx) + Integer const &primID, + Integer const &viewportIdx, + Integer const &rtIdx) { const uint32_t *aRTAI = reinterpret_cast(&rtIdx); @@ -1598,11 +1598,11 @@ void BinPostSetupLinesImpl( PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - typename SIMD_T::Float &vRecipW0 = recipW[0]; - typename SIMD_T::Float &vRecipW1 = recipW[1]; + Float &vRecipW0 = recipW[0]; + Float &vRecipW1 = recipW[1]; // convert to fixed point - typename SIMD_T::Integer vXi[2], vYi[2]; + Integer vXi[2], vYi[2]; vXi[0] = fpToFixedPointVertical(prim[0].x); vYi[0] = fpToFixedPointVertical(prim[0].y); @@ -1610,13 +1610,13 @@ void BinPostSetupLinesImpl( vYi[1] = fpToFixedPointVertical(prim[1].y); // compute x-major vs y-major mask - typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1])); - typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1])); - typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength)); + Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1])); + Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1])); + Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength)); uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask); // cull zero-length lines - typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si()); + Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si()); vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si())); primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask)); @@ -1632,8 +1632,8 @@ void BinPostSetupLinesImpl( bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]); // bloat bbox by line width along minor axis - typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f); - typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f); + Integer vHalfWidthi = fpToFixedPointVertical(vHalfWidth); SIMDBBOX_T bloatBox; @@ -1649,7 +1649,7 @@ void BinPostSetupLinesImpl( // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. { - typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax; + Integer scisXmin, scisYmin, scisXmax, scisYmax; if (pa.viewportArrayActive) { @@ -1671,9 +1671,9 @@ void BinPostSetupLinesImpl( // Cull prims completely outside scissor { - typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); - typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); - typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); + Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); + Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); primMask = primMask & ~maskOutsideScissor; } @@ -1698,10 +1698,10 @@ void BinPostSetupLinesImpl( OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - SIMD_T::store_si(reinterpret_cast(aMTLeft), bbox.xmin); - SIMD_T::store_si(reinterpret_cast(aMTRight), bbox.xmax); - SIMD_T::store_si(reinterpret_cast(aMTTop), bbox.ymin); - SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax); + SIMD_T::store_si(reinterpret_cast *>(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast *>(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast *>(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast *>(aMTBottom), bbox.ymax); TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps()); TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps()); @@ -1786,17 +1786,17 @@ void SIMDCALL BinLinesImpl( DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, - typename SIMD_T::Vec4 prim[3], + Vec4 prim[3], uint32_t primMask, - typename SIMD_T::Integer const &primID, - typename SIMD_T::Integer const &viewportIdx, - typename SIMD_T::Integer const & rtIdx) + Integer const &primID, + Integer const &viewportIdx, + Integer const & rtIdx) { const API_STATE& state = GetApiState(pDC); const SWR_RASTSTATE& rastState = state.rastState; const SWR_FRONTEND_STATE& feState = state.frontendState; - typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) }; + Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) }; if (!feState.vpTransformDisable) { @@ -1825,7 +1825,7 @@ void SIMDCALL BinLinesImpl( } // adjust for pixel center location - typename SIMD_T::Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); + Float offset = SwrPixelOffsets::GetOffset(rastState.pixelLocation); prim[0].x = SIMD_T::add_ps(prim[0].x, offset); prim[0].y = SIMD_T::add_ps(prim[0].y, offset); diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h index 8d252350723..443dac57fef 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.h +++ b/src/gallium/drivers/swr/rasterizer/core/binner.h @@ -38,7 +38,7 @@ template struct SwrPixelOffsets { public: - INLINE static typename SIMD_T::Float GetOffset(uint32_t loc) + INLINE static Float GetOffset(uint32_t loc) { SWR_ASSERT(loc <= 1); @@ -50,7 +50,7 @@ public: /// @brief Convert the X,Y coords of a triangle to the requested Fixed /// Point precision from FP32. template > -INLINE typename SIMD_T::Integer fpToFixedPointVertical(const typename SIMD_T::Float &vIn) +INLINE Integer fpToFixedPointVertical(const Float &vIn) { return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value))); } @@ -62,7 +62,7 @@ INLINE typename SIMD_T::Integer fpToFixedPointVertical(const typename SIMD_T::Fl /// @param vXi: fixed point X coords of tri verts /// @param vYi: fixed point Y coords of tri verts template -INLINE static void FPToFixedPoint(const typename SIMD_T::Vec4 *const tri, typename SIMD_T::Integer(&vXi)[3], typename SIMD_T::Integer(&vYi)[3]) +INLINE static void FPToFixedPoint(const Vec4 *const tri, Integer(&vXi)[3], Integer(&vYi)[3]) { vXi[0] = fpToFixedPointVertical(tri[0].x); vYi[0] = fpToFixedPointVertical(tri[0].y); @@ -81,24 +81,24 @@ INLINE static void FPToFixedPoint(const typename SIMD_T::Vec4 *const tri, typena /// *Note*: expects vX, vY to be in the correct precision for the type /// of rasterization. This avoids unnecessary FP->fixed conversions. template -INLINE void calcBoundingBoxIntVertical(const typename SIMD_T::Integer(&vX)[3], const typename SIMD_T::Integer(&vY)[3], SIMDBBOX_T &bbox) +INLINE void calcBoundingBoxIntVertical(const Integer(&vX)[3], const Integer(&vY)[3], SIMDBBOX_T &bbox) { - typename SIMD_T::Integer vMinX = vX[0]; + Integer vMinX = vX[0]; vMinX = SIMD_T::min_epi32(vMinX, vX[1]); vMinX = SIMD_T::min_epi32(vMinX, vX[2]); - typename SIMD_T::Integer vMaxX = vX[0]; + Integer vMaxX = vX[0]; vMaxX = SIMD_T::max_epi32(vMaxX, vX[1]); vMaxX = SIMD_T::max_epi32(vMaxX, vX[2]); - typename SIMD_T::Integer vMinY = vY[0]; + Integer vMinY = vY[0]; vMinY = SIMD_T::min_epi32(vMinY, vY[1]); vMinY = SIMD_T::min_epi32(vMinY, vY[2]); - typename SIMD_T::Integer vMaxY = vY[0]; + Integer vMaxY = vY[0]; vMaxY = SIMD_T::max_epi32(vMaxY, vY[1]); vMaxY = SIMD_T::max_epi32(vMaxY, vY[2]); @@ -108,7 +108,7 @@ INLINE void calcBoundingBoxIntVertical(const typename SIMD_T::Integer(&vX)[3], c /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer. - const typename SIMD_T::Integer value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value); + const Integer value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value); vMinX = SIMD_T::sub_epi32(vMinX, value); vMaxX = SIMD_T::add_epi32(vMaxX, value); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index ddee3b1a940..8d2590a4981 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -62,15 +62,15 @@ enum SWR_CLIPCODES #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) template -void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes) +void ComputeClipCodes(const API_STATE &state, const Vec4 &vertex, Float &clipCodes, Integer const &viewportIndexes) { clipCodes = SIMD_T::setzero_ps(); // -w - typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f)); + Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f)); // FRUSTUM_LEFT - typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW); + Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW); clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT))); // FRUSTUM_TOP @@ -109,22 +109,22 @@ void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &verte clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW)))); // GUARDBAND_LEFT - typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps(&state.gbState.left[0], viewportIndexes)); + Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps(4)>(&state.gbState.left[0], viewportIndexes)); vRes = SIMD_T::cmplt_ps(vertex.x, gbMult); clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT)))); // GUARDBAND_TOP - gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps(&state.gbState.top[0], viewportIndexes)); + gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps(4)>(&state.gbState.top[0], viewportIndexes)); vRes = SIMD_T::cmplt_ps(vertex.y, gbMult); clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP)))); // GUARDBAND_RIGHT - gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps(&state.gbState.right[0], viewportIndexes)); + gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps(4)>(&state.gbState.right[0], viewportIndexes)); vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult); clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT)))); // GUARDBAND_BOTTOM - gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps(&state.gbState.bottom[0], viewportIndexes)); + gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps(4)>(&state.gbState.bottom[0], viewportIndexes)); vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult); clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM)))); } @@ -311,7 +311,7 @@ public: static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); } - void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes) + void ComputeClipCodes(Vec4 vertex[], const Integer &viewportIndexes) { for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { @@ -319,9 +319,9 @@ public: } } - typename SIMD_T::Float ComputeClipCodeIntersection() + Float ComputeClipCodeIntersection() { - typename SIMD_T::Float result = clipCodes[0]; + Float result = clipCodes[0]; for (uint32_t i = 1; i < NumVertsPerPrim; ++i) { @@ -331,9 +331,9 @@ public: return result; } - typename SIMD_T::Float ComputeClipCodeUnion() + Float ComputeClipCodeUnion() { - typename SIMD_T::Float result = clipCodes[0]; + Float result = clipCodes[0]; for (uint32_t i = 1; i < NumVertsPerPrim; ++i) { @@ -345,7 +345,7 @@ public: int ComputeClipMask() { - typename SIMD_T::Float clipUnion = ComputeClipCodeUnion(); + Float clipUnion = ComputeClipCodeUnion(); clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK))); @@ -353,31 +353,31 @@ public: } // clipper is responsible for culling any prims with NAN coordinates - int ComputeNaNMask(typename SIMD_T::Vec4 prim[]) + int ComputeNaNMask(Vec4 prim[]) { - typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps(); + Float vNanMask = SIMD_T::setzero_ps(); for (uint32_t e = 0; e < NumVertsPerPrim; ++e) { - typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps(prim[e].v[0], prim[e].v[1]); + Float vNan01 = SIMD_T::template cmp_ps(prim[e].v[0], prim[e].v[1]); vNanMask = SIMD_T::or_ps(vNanMask, vNan01); - typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps(prim[e].v[2], prim[e].v[3]); + Float vNan23 = SIMD_T::template cmp_ps(prim[e].v[2], prim[e].v[3]); vNanMask = SIMD_T::or_ps(vNanMask, vNan23); } return SIMD_T::movemask_ps(vNanMask); } - int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[]) + int ComputeUserClipCullMask(PA_STATE &pa, Vec4 prim[]) { uint8_t cullMask = state.backendState.cullDistanceMask; uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset; - typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps(); + Float vClipCullMask = SIMD_T::setzero_ps(); - typename SIMD_T::Vec4 vClipCullDistLo[3]; - typename SIMD_T::Vec4 vClipCullDistHi[3]; + Vec4 vClipCullDistLo[3]; + Vec4 vClipCullDistHi[3]; pa.Assemble(vertexClipCullOffset, vClipCullDistLo); pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi); @@ -389,10 +389,10 @@ public: uint32_t slot = index >> 2; uint32_t component = index & 0x3; - typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f); + Float vCullMaskElem = SIMD_T::set1_ps(-1.0f); for (uint32_t e = 0; e < NumVertsPerPrim; ++e) { - typename SIMD_T::Float vCullComp; + Float vCullComp; if (slot == 0) { vCullComp = vClipCullDistLo[e][component]; @@ -403,7 +403,7 @@ public: } // cull if cull distance < 0 || NAN - typename SIMD_T::Float vCull = SIMD_T::template cmp_ps(SIMD_T::setzero_ps(), vCullComp); + Float vCull = SIMD_T::template cmp_ps(SIMD_T::setzero_ps(), vCullComp); vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull); } vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem); @@ -417,10 +417,10 @@ public: uint32_t slot = index >> 2; uint32_t component = index & 0x3; - typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f); + Float vCullMaskElem = SIMD_T::set1_ps(-1.0f); for (uint32_t e = 0; e < NumVertsPerPrim; ++e) { - typename SIMD_T::Float vClipComp; + Float vClipComp; if (slot == 0) { vClipComp = vClipCullDistLo[e][component]; @@ -430,8 +430,8 @@ public: vClipComp = vClipCullDistHi[e][component]; } - typename SIMD_T::Float vClip = SIMD_T::template cmp_ps(vClipComp, vClipComp); - typename SIMD_T::Float vCull = SIMD_T::template cmp_ps(SIMD_T::setzero_ps(), vClipComp); + Float vClip = SIMD_T::template cmp_ps(vClipComp, vClipComp); + Float vCull = SIMD_T::template cmp_ps(SIMD_T::setzero_ps(), vClipComp); vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull); vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip); } @@ -441,8 +441,8 @@ public: return SIMD_T::movemask_ps(vClipCullMask); } - void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa, - const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx) + void ClipSimd(const Vec4 prim[], const Float &vPrimMask, const Float &vClipMask, PA_STATE &pa, + const Integer &vPrimId, const Integer &vViewportIdx, const Integer &vRtIdx) { // input/output vertex store for clipper SIMDVERTEX_T vertices[7]; // maximum 7 verts generated per triangle @@ -456,7 +456,7 @@ public: ///@todo: line topology for wireframe? // assemble pos - typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim]; + Vec4 tmpVector[NumVertsPerPrim]; for (uint32_t i = 0; i < NumVertsPerPrim; ++i) { vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i]; @@ -515,7 +515,7 @@ public: uint32_t numAttribs = maxSlot + 1; - typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); + Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); BinnerChooser binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast); @@ -602,9 +602,9 @@ public: #endif for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); + SIMD256::Float temp = SIMD256::template mask_i32gather_ps(1)>(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper::insert_lo_ps(temp); - pBase += sizeof(typename SIMD_T::Float); + pBase += sizeof(Float); } // transpose attribs @@ -616,9 +616,9 @@ public: for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); + SIMD256::Float temp = SIMD256::template mask_i32gather_ps(1)>(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); transposedPrims[0].attrib[attribSlot][c] = SimdHelper::insert_lo_ps(temp); - pBase += sizeof(typename SIMD_T::Float); + pBase += sizeof(Float); } } @@ -630,9 +630,9 @@ public: for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); + SIMD256::Float temp = SIMD256::template mask_i32gather_ps(1)>(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper::insert_lo_ps(temp); - pBase += sizeof(typename SIMD_T::Float); + pBase += sizeof(Float); } } @@ -642,9 +642,9 @@ public: for (uint32_t c = 0; c < 4; ++c) { - SIMD256::Float temp = SIMD256::template mask_i32gather_ps(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); + SIMD256::Float temp = SIMD256::template mask_i32gather_ps(1)>(SIMD256::setzero_ps(), reinterpret_cast(pBase), vOffsets, vMask); transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper::insert_lo_ps(temp); - pBase += sizeof(typename SIMD_T::Float); + pBase += sizeof(Float); } } @@ -656,16 +656,16 @@ public: const uint32_t primMask = primMaskMap[numEmittedPrims]; - const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]); - const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]); - const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]); + const Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]); + const Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]); + const Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]); while (clipPA.GetNextStreamOutput()) { do { - typename SIMD_T::Vec4 attrib[NumVertsPerPrim]; + Vec4 attrib[NumVertsPerPrim]; bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib); @@ -686,8 +686,8 @@ public: UPDATE_STAT_FE(CPrimitives, numClippedPrims); } - void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask, - typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx) + void ExecuteStage(PA_STATE &pa, Vec4 prim[], uint32_t primMask, + Integer const &primId, Integer const &viewportIdx, Integer const &rtIdx) { SWR_ASSERT(pa.pDC != nullptr); @@ -709,7 +709,7 @@ public: } // cull prims outside view frustum - typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection(); + Float clipIntersection = ComputeClipCodeIntersection(); int validMask = primMask & SimdHelper::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps()); // skip clipping for points @@ -740,16 +740,16 @@ public: } private: - typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1) + Float ComputeInterpFactor(Float const &boundaryCoord0, Float const &boundaryCoord1) { return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1)); } - typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component) + Integer ComputeOffsets(uint32_t attrib, Integer const &vIndices, uint32_t component) { const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T); - const uint32_t componentStride = sizeof(typename SIMD_T::Float); - const uint32_t attribStride = sizeof(typename SIMD_T::Vec4); + const uint32_t componentStride = sizeof(Float); + const uint32_t attribStride = sizeof(Vec4); static const OSALIGNSIMD16(uint32_t) elemOffset[16] = { @@ -771,12 +771,12 @@ private: 15 * sizeof(float), }; - static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets."); + static_assert(sizeof(Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets."); - typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast(elemOffset)); + Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast *>(elemOffset)); // step to the simdvertex - typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride)); + Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride)); // step to the attribute and component vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component)); @@ -787,17 +787,17 @@ private: return vOffsets; } - typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component) + Float GatherComponent(const float* pBuffer, uint32_t attrib, Float const &vMask, Integer const &vIndices, uint32_t component) { - typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component); - typename SIMD_T::Float vSrc = SIMD_T::setzero_ps(); + Integer vOffsets = ComputeOffsets(attrib, vIndices, component); + Float vSrc = SIMD_T::setzero_ps(); - return SIMD_T::template mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask); + return SIMD_T::template mask_i32gather_ps(1)>(vSrc, pBuffer, vOffsets, vMask); } - void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc) + void ScatterComponent(const float* pBuffer, uint32_t attrib, Float const &vMask, Integer const &vIndices, uint32_t component, Float const &vSrc) { - typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component); + Integer vOffsets = ComputeOffsets(attrib, vIndices, component); const uint32_t *pOffsets = reinterpret_cast(&vOffsets); const float *pSrc = reinterpret_cast(&vSrc); @@ -813,12 +813,12 @@ private: template void intersect( - const typename SIMD_T::Float &vActiveMask, // active lanes to operate on - const typename SIMD_T::Integer &s, // index to first edge vertex v0 in pInPts. - const typename SIMD_T::Integer &p, // index to second edge vertex v1 in pInPts. - const typename SIMD_T::Vec4 &v1, // vertex 0 position - const typename SIMD_T::Vec4 &v2, // vertex 1 position - typename SIMD_T::Integer &outIndex, // output index. + const Float &vActiveMask, // active lanes to operate on + const Integer &s, // index to first edge vertex v0 in pInPts. + const Integer &p, // index to second edge vertex v1 in pInPts. + const Vec4 &v1, // vertex 0 position + const Vec4 &v2, // vertex 1 position + Integer &outIndex, // output index. const float *pInVerts, // array of all the input positions. uint32_t numInAttribs, // number of attributes per vertex. float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. @@ -827,7 +827,7 @@ private: uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset; // compute interpolation factor - typename SIMD_T::Float t; + Float t; switch (ClippingPlane) { case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break; @@ -852,7 +852,7 @@ private: // interpolate position and store for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]); + Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]); ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); } @@ -862,9 +862,9 @@ private: uint32_t attribSlot = vertexAttribOffset + a; for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); - typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } @@ -875,9 +875,9 @@ private: uint32_t attribSlot = vertexClipCullOffset; for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); - typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } @@ -887,16 +887,16 @@ private: uint32_t attribSlot = vertexClipCullOffset + 1; for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); - typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); - typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0); ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } } template - typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v) + Float inside(const Vec4 &v) { switch (ClippingPlane) { @@ -913,23 +913,23 @@ private: } template - typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts) + Integer ClipTriToPlane(const float *pInVerts, const Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts) { uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; - typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si(); - typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si(); - typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); + Integer vCurIndex = SIMD_T::setzero_si(); + Integer vOutIndex = SIMD_T::setzero_si(); + Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty { - typename SIMD_T::Integer s = vCurIndex; - typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); - typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p); + Integer s = vCurIndex; + Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); + Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p); p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask))); // gather position - typename SIMD_T::Vec4 vInPos0, vInPos1; + Vec4 vInPos0, vInPos1; for (uint32_t c = 0; c < 4; ++c) { vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); @@ -937,11 +937,11 @@ private: } // compute inside mask - typename SIMD_T::Float s_in = inside(vInPos0); - typename SIMD_T::Float p_in = inside(vInPos1); + Float s_in = inside(vInPos0); + Float p_in = inside(vInPos1); // compute intersection mask (s_in != p_in) - typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in); + Float intersectMask = SIMD_T::xor_ps(s_in, p_in); intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); // store s if inside @@ -960,7 +960,7 @@ private: uint32_t attribSlot = vertexAttribOffset + a; for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); } } @@ -972,7 +972,7 @@ private: uint32_t attribSlot = vertexClipCullSlot; for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); } } @@ -982,7 +982,7 @@ private: uint32_t attribSlot = vertexClipCullSlot + 1; for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); } } @@ -1009,21 +1009,21 @@ private: } template - typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts) + Integer ClipLineToPlane(const float *pInVerts, const Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts) { uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset; - typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si(); - typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si(); - typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); + Integer vCurIndex = SIMD_T::setzero_si(); + Integer vOutIndex = SIMD_T::setzero_si(); + Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts)); if (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) { - typename SIMD_T::Integer s = vCurIndex; - typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); + Integer s = vCurIndex; + Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1)); // gather position - typename SIMD_T::Vec4 vInPos0, vInPos1; + Vec4 vInPos0, vInPos1; for (uint32_t c = 0; c < 4; ++c) { vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); @@ -1031,11 +1031,11 @@ private: } // compute inside mask - typename SIMD_T::Float s_in = inside(vInPos0); - typename SIMD_T::Float p_in = inside(vInPos1); + Float s_in = inside(vInPos0); + Float p_in = inside(vInPos1); // compute intersection mask (s_in != p_in) - typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in); + Float intersectMask = SIMD_T::xor_ps(s_in, p_in); intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask); // store s if inside @@ -1053,7 +1053,7 @@ private: uint32_t attribSlot = vertexAttribOffset + a; for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); } } @@ -1086,7 +1086,7 @@ private: uint32_t attribSlot = vertexAttribOffset + a; for (uint32_t c = 0; c < 4; ++c) { - typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); + Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); } } @@ -1099,17 +1099,17 @@ private: return vOutIndex; } - typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs) + Integer ClipPrims(float *pVertices, const Float &vPrimMask, const Float &vClipMask, int numAttribs) { // temp storage float *pTempVerts = reinterpret_cast(ClipHelper::GetTempVertices()); // zero out num input verts for non-active lanes - typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim); + Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim); vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask); // clip prims to frustum - typename SIMD_T::Integer vNumOutPts; + Integer vNumOutPts; if (NumVertsPerPrim == 3) { vNumOutPts = ClipTriToPlane(pVertices, vNumInPts, numAttribs, pTempVerts); @@ -1131,7 +1131,7 @@ private: } // restore num verts for non-clipped, active lanes - typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask); + Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask); vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask); return vNumOutPts; @@ -1140,7 +1140,7 @@ private: const uint32_t workerId{ 0 }; DRAW_CONTEXT *pDC{ nullptr }; const API_STATE &state; - typename SIMD_T::Float clipCodes[NumVertsPerPrim]; + Float clipCodes[NumVertsPerPrim]; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 13c9f3670f7..1c4b522e45e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -733,7 +733,7 @@ template void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs) { uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; - uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4; + uint32_t dstVertexStride = numAttribs * sizeof(Float) * 4; OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; @@ -741,7 +741,7 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t { gatherOffsets[i] = srcVertexStride * i; } - auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]); + auto vGatherOffsets = SIMD_T::load_si((Integer*)&gatherOffsets[0]); uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; uint32_t remainingVerts = numVerts; @@ -759,18 +759,18 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t for (uint32_t a = 0; a < numAttribs; ++a) { - auto attribGatherX = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); - auto attribGatherY = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask); - auto attribGatherZ = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask); - auto attribGatherW = SIMD_T::template mask_i32gather_ps(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask); + auto attribGatherX = SIMD_T::template mask_i32gather_ps(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); + auto attribGatherY = SIMD_T::template mask_i32gather_ps(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask); + auto attribGatherZ = SIMD_T::template mask_i32gather_ps(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask); + auto attribGatherW = SIMD_T::template mask_i32gather_ps(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask); SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); - SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY); - SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ); - SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW); + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float)), viMask, attribGatherY); + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float) * 2), viMask, attribGatherZ); + SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float) * 3), viMask, attribGatherW); pSrcBase += sizeof(float) * 4; - pDstBase += sizeof(typename SIMD_T::Float) * 4; + pDstBase += sizeof(Float) * 4; } remainingVerts -= SimdWidth; } @@ -1101,7 +1101,7 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, // Allocate storage for transposed GS output uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH; - uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4); + uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(Vec4); pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32); // Allocate storage to hold temporary stream->cut buffer, if necessary