From ada27b503eab3c53d9ec1bca2cef48c5353e81f9 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Thu, 13 Oct 2016 12:30:34 -0500 Subject: [PATCH] swr: [rasterizer core] move binner functionality to separate file Signed-off-by: Tim Rowley --- src/gallium/drivers/swr/Makefile.sources | 1 + .../drivers/swr/rasterizer/core/binner.cpp | 1442 +++++++++++++++++ .../drivers/swr/rasterizer/core/frontend.cpp | 1393 +--------------- 3 files changed, 1444 insertions(+), 1392 deletions(-) create mode 100644 src/gallium/drivers/swr/rasterizer/core/binner.cpp diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources index 0ade8467178..d81d458d3e8 100644 --- a/src/gallium/drivers/swr/Makefile.sources +++ b/src/gallium/drivers/swr/Makefile.sources @@ -70,6 +70,7 @@ CORE_CXX_SOURCES := \ rasterizer/core/arena.h \ rasterizer/core/backend.cpp \ rasterizer/core/backend.h \ + rasterizer/core/binner.cpp \ rasterizer/core/blend.h \ rasterizer/core/clip.cpp \ rasterizer/core/clip.h \ diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp new file mode 100644 index 00000000000..75ddce95ad7 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -0,0 +1,1442 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file binner.cpp +* +* @brief Implementation for the macrotile binner +* +******************************************************************************/ + +#include "context.h" +#include "frontend.h" +#include "conservativeRast.h" +#include "pa.h" +#include "rasterizer.h" +#include "rdtsc_core.h" +#include "tilemgr.h" + +////////////////////////////////////////////////////////////////////////// +/// @brief Offsets added to post-viewport vertex positions based on +/// raster state. +static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] = +{ + _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER + _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert the X,Y coords of a triangle to the requested Fixed +/// Point precision from FP32. +template > +INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn) +{ + simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value)); + return _simd_cvtps_epi32(vFixed); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Helper function to set the X,Y coords of a triangle to the +/// requested Fixed Point precision from FP32. +/// @param tri: simdvector[3] of FP triangle verts +/// @param vXi: fixed point X coords of tri verts +/// @param vYi: fixed point Y coords of tri verts +INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3]) +{ + vXi[0] = fpToFixedPointVertical(tri[0].x); + vYi[0] = fpToFixedPointVertical(tri[0].y); + vXi[1] = fpToFixedPointVertical(tri[1].x); + vYi[1] = fpToFixedPointVertical(tri[1].y); + vXi[2] = fpToFixedPointVertical(tri[2].x); + vYi[2] = fpToFixedPointVertical(tri[2].y); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Calculate bounding box for current triangle +/// @tparam CT: ConservativeRastFETraits type +/// @param vX: fixed point X position for triangle verts +/// @param vY: fixed point Y position for triangle verts +/// @param bbox: fixed point bbox +/// *Note*: expects vX, vY to be in the correct precision for the type +/// of rasterization. This avoids unnecessary FP->fixed conversions. +template +INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox) +{ + simdscalari vMinX = vX[0]; + vMinX = _simd_min_epi32(vMinX, vX[1]); + vMinX = _simd_min_epi32(vMinX, vX[2]); + + simdscalari vMaxX = vX[0]; + vMaxX = _simd_max_epi32(vMaxX, vX[1]); + vMaxX = _simd_max_epi32(vMaxX, vX[2]); + + simdscalari vMinY = vY[0]; + vMinY = _simd_min_epi32(vMinY, vY[1]); + vMinY = _simd_min_epi32(vMinY, vY[2]); + + simdscalari vMaxY = vY[0]; + vMaxY = _simd_max_epi32(vMaxY, vY[1]); + vMaxY = _simd_max_epi32(vMaxY, vY[2]); + + bbox.xmin = vMinX; + bbox.xmax = vMaxX; + bbox.ymin = vMinY; + bbox.ymax = vMaxY; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical +/// Offsets BBox for conservative rast +template <> +INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox) +{ + // FE conservative rast traits + typedef FEConservativeRastT CT; + + simdscalari vMinX = vX[0]; + vMinX = _simd_min_epi32(vMinX, vX[1]); + vMinX = _simd_min_epi32(vMinX, vX[2]); + + simdscalari vMaxX = vX[0]; + vMaxX = _simd_max_epi32(vMaxX, vX[1]); + vMaxX = _simd_max_epi32(vMaxX, vX[2]); + + simdscalari vMinY = vY[0]; + vMinY = _simd_min_epi32(vMinY, vY[1]); + vMinY = _simd_min_epi32(vMinY, vY[2]); + + simdscalari vMaxY = vY[0]; + vMaxY = _simd_max_epi32(vMaxY, vY[1]); + vMaxY = _simd_max_epi32(vMaxY, vY[2]); + + /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization + /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer. + bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); + bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); + bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); + bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Processes attributes for the backend based on linkage mask and +/// linkage map. Essentially just doing an SOA->AOS conversion and pack. +/// @param pDC - Draw context +/// @param pa - Primitive Assembly state +/// @param linkageMask - Specifies which VS outputs are routed to PS. +/// @param pLinkageMap - maps VS attribute slot to PS slot +/// @param triIndex - Triangle to process attributes for +/// @param pBuffer - Output result +template +INLINE void ProcessAttributes( + DRAW_CONTEXT *pDC, + PA_STATE&pa, + uint32_t triIndex, + uint32_t primId, + float *pBuffer) +{ + static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT"); + const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; + // Conservative Rasterization requires degenerate tris to have constant attribute interpolation + LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask; + const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; + const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology; + + static const float constTable[3][4] = { + { 0.0f, 0.0f, 0.0f, 0.0f }, + { 0.0f, 0.0f, 0.0f, 1.0f }, + { 1.0f, 1.0f, 1.0f, 1.0f } + }; + + for (uint32_t i = 0; i < backendState.numAttributes; ++i) + { + uint32_t inputSlot; + if (IsSwizzledT::value) + { + SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i]; + inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib; + + } + else + { + inputSlot = VERTEX_ATTRIB_START_SLOT + i; + } + + __m128 attrib[3]; // triangle attribs (always 4 wide) + float* pAttribStart = pBuffer; + + if (HasConstantInterpT::value || IsDegenerate::value) + { + if (_bittest(&constantInterpMask, i)) + { + uint32_t vid; + uint32_t adjustedTriIndex; + static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 }; + static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } }; + static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } }; + static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } }; + static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } }; + + switch (topo) { + case TOP_QUAD_LIST: + adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex]; + vid = quadProvokingVertex[triIndex & 1][provokingVertex]; + break; + case TOP_QUAD_STRIP: + adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex]; + vid = qstripProvokingVertex[triIndex & 1][provokingVertex]; + break; + case TOP_TRIANGLE_STRIP: + adjustedTriIndex = triIndex; + vid = (triIndex & 1) + ? tristripProvokingVertex[provokingVertex] + : provokingVertex; + break; + default: + adjustedTriIndex = triIndex; + vid = provokingVertex; + break; + } + + pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib); + + for (uint32_t i = 0; i < NumVertsT::value; ++i) + { + _mm_store_ps(pBuffer, attrib[vid]); + pBuffer += 4; + } + } + else + { + pa.AssembleSingle(inputSlot, triIndex, attrib); + + for (uint32_t i = 0; i < NumVertsT::value; ++i) + { + _mm_store_ps(pBuffer, attrib[i]); + pBuffer += 4; + } + } + } + else + { + pa.AssembleSingle(inputSlot, triIndex, attrib); + + for (uint32_t i = 0; i < NumVertsT::value; ++i) + { + _mm_store_ps(pBuffer, attrib[i]); + pBuffer += 4; + } + } + + // pad out the attrib buffer to 3 verts to ensure the triangle + // interpolation code in the pixel shader works correctly for the + // 3 topologies - point, line, tri. This effectively zeros out the + // effect of the missing vertices in the triangle interpolation. + for (uint32_t v = NumVertsT::value; v < 3; ++v) + { + _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]); + pBuffer += 4; + } + + // check for constant source overrides + if (IsSwizzledT::value) + { + uint32_t mask = backendState.swizzleMap[i].componentOverrideMask; + if (mask) + { + DWORD comp; + while (_BitScanForward(&comp, mask)) + { + mask &= ~(1 << comp); + + float constantValue = 0.0f; + switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource) + { + case SWR_CONSTANT_SOURCE_CONST_0000: + case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT: + case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT: + constantValue = constTable[backendState.swizzleMap[i].constantSource][comp]; + break; + case SWR_CONSTANT_SOURCE_PRIM_ID: + constantValue = *(float*)&primId; + break; + } + + // apply constant value to all 3 vertices + for (uint32_t v = 0; v < 3; ++v) + { + pAttribStart[comp + v * 4] = constantValue; + } + } + } + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Gather scissor rect data based on per-prim viewport indices. +/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point. +/// @param pViewportIndex - array of per-primitive vewport indexes. +/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data. +/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data. +/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data. +/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data. +// +/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. +template +struct GatherScissors +{ + static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, + simdscalari &scisXmin, simdscalari &scisYmin, + simdscalari &scisXmax, simdscalari &scisYmax) + { + SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather"); + } +}; + +template<> +struct GatherScissors<8> +{ + static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, + simdscalari &scisXmin, simdscalari &scisYmin, + simdscalari &scisXmax, simdscalari &scisYmax) + { + scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, + pScissorsInFixedPoint[pViewportIndex[1]].xmin, + pScissorsInFixedPoint[pViewportIndex[2]].xmin, + pScissorsInFixedPoint[pViewportIndex[3]].xmin, + pScissorsInFixedPoint[pViewportIndex[4]].xmin, + pScissorsInFixedPoint[pViewportIndex[5]].xmin, + pScissorsInFixedPoint[pViewportIndex[6]].xmin, + pScissorsInFixedPoint[pViewportIndex[7]].xmin); + scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, + pScissorsInFixedPoint[pViewportIndex[1]].ymin, + pScissorsInFixedPoint[pViewportIndex[2]].ymin, + pScissorsInFixedPoint[pViewportIndex[3]].ymin, + pScissorsInFixedPoint[pViewportIndex[4]].ymin, + pScissorsInFixedPoint[pViewportIndex[5]].ymin, + pScissorsInFixedPoint[pViewportIndex[6]].ymin, + pScissorsInFixedPoint[pViewportIndex[7]].ymin); + scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, + pScissorsInFixedPoint[pViewportIndex[1]].xmax, + pScissorsInFixedPoint[pViewportIndex[2]].xmax, + pScissorsInFixedPoint[pViewportIndex[3]].xmax, + pScissorsInFixedPoint[pViewportIndex[4]].xmax, + pScissorsInFixedPoint[pViewportIndex[5]].xmax, + pScissorsInFixedPoint[pViewportIndex[6]].xmax, + pScissorsInFixedPoint[pViewportIndex[7]].xmax); + scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, + pScissorsInFixedPoint[pViewportIndex[1]].ymax, + pScissorsInFixedPoint[pViewportIndex[2]].ymax, + pScissorsInFixedPoint[pViewportIndex[3]].ymax, + pScissorsInFixedPoint[pViewportIndex[4]].ymax, + pScissorsInFixedPoint[pViewportIndex[5]].ymax, + pScissorsInFixedPoint[pViewportIndex[6]].ymax, + pScissorsInFixedPoint[pViewportIndex[7]].ymax); + } +}; + +typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); + +struct ProcessAttributesChooser +{ + typedef PFN_PROCESS_ATTRIBUTES FuncType; + + template + static FuncType GetFunc() + { + return ProcessAttributes; + } +}; + +PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false) +{ + return TemplateArgUnroller::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Processes enabled user clip distances. Loads the active clip +/// distances from the PA, sets up barycentric equations, and +/// stores the results to the output buffer +/// @param pa - Primitive Assembly state +/// @param primIndex - primitive index to process +/// @param clipDistMask - mask of enabled clip distances +/// @param pUserClipBuffer - buffer to store results +template +void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer) +{ + DWORD clipDist; + while (_BitScanForward(&clipDist, clipDistMask)) + { + clipDistMask &= ~(1 << clipDist); + uint32_t clipSlot = clipDist >> 2; + uint32_t clipComp = clipDist & 0x3; + uint32_t clipAttribSlot = clipSlot == 0 ? + VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT; + + __m128 primClipDist[3]; + pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); + + float vertClipDist[NumVerts]; + for (uint32_t e = 0; e < NumVerts; ++e) + { + OSALIGNSIMD(float) aVertClipDist[4]; + _mm_store_ps(aVertClipDist, primClipDist[e]); + vertClipDist[e] = aVertClipDist[clipComp]; + }; + + // setup plane equations for barycentric interpolation in the backend + float baryCoeff[NumVerts]; + for (uint32_t e = 0; e < NumVerts - 1; ++e) + { + baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1]; + } + baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1]; + + for (uint32_t e = 0; e < NumVerts; ++e) + { + *(pUserClipBuffer++) = baryCoeff[e]; + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping +/// culling, viewport transform, etc. +/// @param pDC - pointer to draw context. +/// @param pa - The primitive assembly object. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param tri - Contains triangle position data for SIMDs worth of triangles. +/// @param primID - Primitive ID for each triangle. +/// @param viewportIdx - viewport array index for each triangle. +/// @tparam CT - ConservativeRastFETraits +template +void BinTriangles( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector tri[3], + uint32_t triMask, + simdscalari primID, + simdscalari viewportIdx) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(FEBinTriangles, pDC->drawId); + + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + MacroTileMgr *pTileMgr = pDC->pTileMgr; + + // Simple non-conformant wireframe mode, useful for debugging + if (rastState.fillMode == SWR_FILLMODE_WIREFRAME) + { + // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD + simdvector line[2]; + line[0] = tri[0]; + line[1] = tri[1]; + BinLines(pDC, pa, workerId, line, triMask, primID, viewportIdx); + + line[0] = tri[1]; + line[1] = tri[2]; + BinLines(pDC, pa, workerId, line, triMask, primID, viewportIdx); + + line[0] = tri[2]; + line[1] = tri[0]; + BinLines(pDC, pa, workerId, line, triMask, primID, viewportIdx); + + AR_END(FEBinTriangles, 1); + return; + } + + simdscalar vRecipW0 = _simd_set1_ps(1.0f); + simdscalar vRecipW1 = _simd_set1_ps(1.0f); + simdscalar vRecipW2 = _simd_set1_ps(1.0f); + + if (feState.vpTransformDisable) + { + // RHW is passed in directly when VP transform is disabled + vRecipW0 = tri[0].v[3]; + vRecipW1 = tri[1].v[3]; + vRecipW2 = tri[2].v[3]; + } + else + { + // Perspective divide + vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); + vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); + vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); + + tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); + tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); + tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); + + tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); + tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); + tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); + + tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); + tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); + tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); + + // Viewport transform to screen space coords + if (state.gsState.emitsViewportArrayIndex) + { + viewportTransform<3>(tri, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<3>(tri, state.vpMatrices); + } + } + + // Adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + tri[0].x = _simd_add_ps(tri[0].x, offset); + tri[0].y = _simd_add_ps(tri[0].y, offset); + + tri[1].x = _simd_add_ps(tri[1].x, offset); + tri[1].y = _simd_add_ps(tri[1].y, offset); + + tri[2].x = _simd_add_ps(tri[2].x, offset); + tri[2].y = _simd_add_ps(tri[2].y, offset); + + simdscalari vXi[3], vYi[3]; + // Set vXi, vYi to required fixed point precision + FPToFixedPoint(tri, vXi, vYi); + + // triangle setup + simdscalari vAi[3], vBi[3]; + triangleSetupABIntVertical(vXi, vYi, vAi, vBi); + + // determinant + simdscalari vDet[2]; + calcDeterminantIntVertical(vAi, vBi, vDet); + + // cull zero area + int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); + int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); + + int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2)); + + uint32_t origTriMask = triMask; + // don't cull degenerate triangles if we're conservatively rasterizing + if (!CT::IsConservativeT::value) + { + triMask &= ~cullZeroAreaMask; + } + + // determine front winding tris + // CW +det + // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast + maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); + maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); + int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2)); + + uint32_t frontWindingTris; + if (rastState.frontWinding == SWR_FRONTWINDING_CW) + { + frontWindingTris = cwTriMask; + } + else + { + frontWindingTris = ~cwTriMask; + } + + // cull + uint32_t cullTris; + switch ((SWR_CULLMODE)rastState.cullMode) + { + case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; + case SWR_CULLMODE_NONE: cullTris = 0x0; break; + case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; + // 0 area triangles are marked as backfacing, which is required behavior for conservative rast + case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; + default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; + } + + triMask &= ~cullTris; + + if (origTriMask ^ triMask) + { + RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); + } + + /// Note: these variable initializations must stay above any 'goto endBenTriangles' + // compute per tri backface + uint32_t frontFaceMask = frontWindingTris; + uint32_t *pPrimID = (uint32_t *)&primID; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + DWORD triIndex = 0; + // for center sample pattern, all samples are at pixel center; calculate coverage + // once at center and broadcast the results in the backend + const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X; + uint32_t edgeEnable; + PFN_WORK_FUNC pfnWork; + if (CT::IsConservativeT::value) + { + // determine which edges of the degenerate tri, if any, are valid to rasterize. + // used to call the appropriate templated rasterizer function + if (cullZeroAreaMask > 0) + { + // e0 = v1-v0 + simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]); + simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]); + uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask))); + + // e1 = v2-v1 + simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]); + simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]); + uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask))); + + // e2 = v0-v2 + // if v0 == v1 & v1 == v2, v0 == v2 + uint32_t e2Mask = e0Mask & e1Mask; + SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512"); + + // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2 + // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001 + e0Mask = pdep_u32(e0Mask, 0x00249249); + // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010 + e1Mask = pdep_u32(e1Mask, 0x00492492); + // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100 + e2Mask = pdep_u32(e2Mask, 0x00924924); + + edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask))); + } + else + { + edgeEnable = 0x00FFFFFF; + } + } + else + { + // degenerate triangles won't be sent to rasterizer; just enable all edges + pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0), + (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID, + (state.scissorsTileAligned == false)); + } + + if (!triMask) + { + goto endBinTriangles; + } + + // Calc bounding box of triangles + simdBBox bbox; + calcBoundingBoxIntVertical(tri, vXi, vYi, bbox); + + // determine if triangle falls between pixel centers and discard + // only discard for non-MSAA case and when conservative rast is disabled + // (xmin + 127) & ~255 + // (xmax + 128) & ~255 + if (rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value)) + { + origTriMask = triMask; + + int cullCenterMask; + { + simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127)); + xmin = _simd_and_si(xmin, _simd_set1_epi32(~255)); + simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128)); + xmax = _simd_and_si(xmax, _simd_set1_epi32(~255)); + + simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax); + + simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127)); + ymin = _simd_and_si(ymin, _simd_set1_epi32(~255)); + simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128)); + ymax = _simd_and_si(ymax, _simd_set1_epi32(~255)); + + simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax); + vMaskV = _simd_or_si(vMaskH, vMaskV); + cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); + } + + triMask &= ~cullCenterMask; + + if (origTriMask ^ triMask) + { + RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); + } + } + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. + // Gather the AOS effective scissor rects based on the per-prim VP index. + /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + + if (CT::IsConservativeT::value) + { + // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has + // some area. Bump the xmax/ymax edges out + simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax); + bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom); + simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax); + bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight); + } + + // Cull tris completely outside scissor + { + simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); + simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); + simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + triMask = triMask & ~maskOutsideScissor; + } + + if (!triMask) + { + goto endBinTriangles; + } + + // Convert triangle bbox to macrotile units. + bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); + _simd_store_si((simdscalari*)aMTRight, bbox.xmax); + _simd_store_si((simdscalari*)aMTTop, bbox.ymin); + _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); + + // transpose verts needed for backend + /// @todo modify BE to take non-transformed verts + __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; + vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); + vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); + vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); + vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai[3]; + pa.Assemble(VERTEX_RTAI_SLOT, vRtai); + simdscalari vRtaii; + vRtaii = _simd_castps_si(vRtai[0].x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + // scan remaining valid triangles and bin each separately + while (_BitScanForward(&triIndex, triMask)) + { + uint32_t linkageCount = state.backendState.numAttributes; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + bool isDegenerate; + if (CT::IsConservativeT::value) + { + // only rasterize valid edges if we have a degenerate primitive + int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; + work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0), + (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable, + (state.scissorsTileAligned == false)); + + // Degenerate triangles are required to be constant interpolated + isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false; + } + else + { + isDegenerate = false; + work.pfnWork = pfnWork; + } + + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate); + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); + desc.triFlags.primID = pPrimID[triIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; + desc.triFlags.viewportIndex = pViewportIndex[triIndex]; + + auto pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store active attribs + float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.pAttribs = pAttribs; + desc.numAttribs = linkageCount; + pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs); + + // store triangle vertex data + desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); + + _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); + _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); + _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); + _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); + ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); + } + + for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) + { + for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + triMask &= ~(1 << triIndex); + } + +endBinTriangles: + AR_END(FEBinTriangles, 1); +} + +struct FEBinTrianglesChooser +{ + typedef PFN_PROCESS_PRIMS FuncType; + + template + static FuncType GetFunc() + { + return BinTriangles>; + } +}; + +// Selector for correct templated BinTrinagles function +PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) +{ + return TemplateArgUnroller::GetFunc(IsConservative); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Bin SIMD points to the backend. Only supports point size of 1 +/// @param pDC - pointer to draw context. +/// @param pa - The primitive assembly object. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param tri - Contains point position data for SIMDs worth of points. +/// @param primID - Primitive ID for each point. +void BinPoints( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[3], + uint32_t primMask, + simdscalari primID, + simdscalari viewportIdx) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(FEBinPoints, pDC->drawId); + + simdvector& primVerts = prim[0]; + + const API_STATE& state = GetApiState(pDC); + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + const SWR_RASTSTATE& rastState = state.rastState; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); + + if (!feState.vpTransformDisable) + { + // perspective divide + simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); + primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); + primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); + primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); + + // viewport transform to screen coords + if (state.gsState.emitsViewportArrayIndex) + { + viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<1>(&primVerts, state.vpMatrices); + } + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + primVerts.x = _simd_add_ps(primVerts.x, offset); + primVerts.y = _simd_add_ps(primVerts.y, offset); + + // convert to fixed point + simdscalari vXi, vYi; + vXi = fpToFixedPointVertical(primVerts.x); + vYi = fpToFixedPointVertical(primVerts.y); + + if (CanUseSimplePoints(pDC)) + { + // adjust for ymin-xmin rule + vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); + vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); + + // cull points off the ymin-xmin edge of the viewport + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); + + // compute macro tile coordinates + simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMacroX, macroX); + _simd_store_si((simdscalari*)aMacroY, macroY); + + // compute raster tile coordinates + simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + + // compute raster tile relative x,y for coverage mask + simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); + simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); + + simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); + simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); + + OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); + _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); + + OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); + _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); + + OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; + _simd_store_ps((float*)aZ, primVerts.z); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai; + pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai.x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + uint32_t *pPrimID = (uint32_t *)&primID; + DWORD primIndex = 0; + + const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; + + // scan remaining valid triangles and bin each separately + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = backendState.numAttributes; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + // points are always front facing + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + + work.pfnWork = RasterizeSimplePoint; + + auto pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store attributes + float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); + desc.pAttribs = pAttribs; + desc.numAttribs = linkageCount; + + pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs); + + // store raster tile aligned x, y, perspective correct z + float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; + *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; + *pTriBuffer = aZ[primIndex]; + + uint32_t tX = aTileRelativeX[primIndex]; + uint32_t tY = aTileRelativeY[primIndex]; + + // pack the relative x,y into the coverageMask, the rasterizer will + // generate the true coverage mask from it + work.desc.tri.triFlags.coverageMask = tX | (tY << 4); + + // bin it + MacroTileMgr *pTileMgr = pDC->pTileMgr; +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); + } + primMask &= ~(1 << primIndex); + } + } + else + { + // non simple points need to be potentially binned to multiple macro tiles + simdscalar vPointSize; + if (rastState.pointParam) + { + simdvector size[3]; + pa.Assemble(VERTEX_POINT_SIZE_SLOT, size); + vPointSize = size[0].x; + } + else + { + vPointSize = _simd_set1_ps(rastState.pointSize); + } + + // bloat point to bbox + simdBBox bbox; + bbox.xmin = bbox.xmax = vXi; + bbox.ymin = bbox.ymax = vYi; + + simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); + simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); + bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); + bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); + bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. + // Gather the AOS effective scissor rects based on the per-prim VP index. + /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + + // Cull bloated points completely outside scissor + simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); + simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); + simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + primMask = primMask & ~maskOutsideScissor; + + // Convert bbox to macrotile units. + bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); + _simd_store_si((simdscalari*)aMTRight, bbox.xmax); + _simd_store_si((simdscalari*)aMTTop, bbox.ymin); + _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai[2]; + pa.Assemble(VERTEX_RTAI_SLOT, vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai[0].x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; + _simd_store_ps((float*)aPointSize, vPointSize); + + uint32_t *pPrimID = (uint32_t *)&primID; + + OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; + + _simd_store_ps((float*)aPrimVertsX, primVerts.x); + _simd_store_ps((float*)aPrimVertsY, primVerts.y); + _simd_store_ps((float*)aPrimVertsZ, primVerts.z); + + // scan remaining valid prims and bin each separately + const SWR_BACKEND_STATE& backendState = state.backendState; + DWORD primIndex; + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = backendState.numAttributes; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.pointSize = aPointSize[primIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + + work.pfnWork = RasterizeTriPoint; + + auto pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store active attribs + desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.numAttribs = linkageCount; + pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); + + // store point vertex data + float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *pTriBuffer++ = aPrimVertsX[primIndex]; + *pTriBuffer++ = aPrimVertsY[primIndex]; + *pTriBuffer = aPrimVertsZ[primIndex]; + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); + ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); + } + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) + { + for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + primMask &= ~(1 << primIndex); + } + } + + AR_END(FEBinPoints, 1); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Bin SIMD lines to the backend. +/// @param pDC - pointer to draw context. +/// @param pa - The primitive assembly object. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param tri - Contains line position data for SIMDs worth of points. +/// @param primID - Primitive ID for each line. +/// @param viewportIdx - Viewport Array Index for each line. +void BinLines( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[], + uint32_t primMask, + simdscalari primID, + simdscalari viewportIdx) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(FEBinLines, pDC->drawId); + + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); + + simdscalar vRecipW0 = _simd_set1_ps(1.0f); + simdscalar vRecipW1 = _simd_set1_ps(1.0f); + + if (!feState.vpTransformDisable) + { + // perspective divide + vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); + vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); + + prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0); + prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1); + + prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0); + prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1); + + prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0); + prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); + + // viewport transform to screen coords + if (state.gsState.emitsViewportArrayIndex) + { + viewportTransform<2>(prim, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<2>(prim, state.vpMatrices); + } + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + prim[0].x = _simd_add_ps(prim[0].x, offset); + prim[0].y = _simd_add_ps(prim[0].y, offset); + + prim[1].x = _simd_add_ps(prim[1].x, offset); + prim[1].y = _simd_add_ps(prim[1].y, offset); + + // convert to fixed point + simdscalari vXi[2], vYi[2]; + vXi[0] = fpToFixedPointVertical(prim[0].x); + vYi[0] = fpToFixedPointVertical(prim[0].y); + vXi[1] = fpToFixedPointVertical(prim[1].x); + vYi[1] = fpToFixedPointVertical(prim[1].y); + + // compute x-major vs y-major mask + simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); + simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); + simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); + uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); + + // cull zero-length lines + simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); + vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); + + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); + + uint32_t *pPrimID = (uint32_t *)&primID; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + + simdscalar vUnused = _simd_setzero_ps(); + + // Calc bounding box of lines + simdBBox bbox; + bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]); + bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]); + bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]); + bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]); + + // bloat bbox by line width along minor axis + simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); + simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + simdBBox bloatBox; + bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); + bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); + bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); + bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); + + bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); + bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask); + bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); + bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + + // Cull prims completely outside scissor + { + simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); + simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); + simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + primMask = primMask & ~maskOutsideScissor; + } + + if (!primMask) + { + goto endBinLines; + } + + // Convert triangle bbox to macrotile units. + bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); + _simd_store_si((simdscalari*)aMTRight, bbox.xmax); + _simd_store_si((simdscalari*)aMTTop, bbox.ymin); + _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); + + // transpose verts needed for backend + /// @todo modify BE to take non-transformed verts + __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; + vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); + vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); + vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); + vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai[2]; + pa.Assemble(VERTEX_RTAI_SLOT, vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai[0].x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + // scan remaining valid prims and bin each separately + DWORD primIndex; + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = state.backendState.numAttributes; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + + work.pfnWork = RasterizeLine; + + auto pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store active attribs + desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.numAttribs = linkageCount; + pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); + + // store line vertex data + desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); + _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); + _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); + _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); + _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); + ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); + } + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) + { + for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + primMask &= ~(1 << primIndex); + } + +endBinLines: + + AR_END(FEBinLines, 1); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 2bcb4e3fbf5..bc8a1dae37e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -32,8 +32,6 @@ #include "backend.h" #include "context.h" #include "rdtsc_core.h" -#include "rasterizer.h" -#include "conservativeRast.h" #include "utils.h" #include "threads.h" #include "pa.h" @@ -50,15 +48,6 @@ static INLINE uint32_t GenMask(uint32_t numBits) return ((1U << numBits) - 1); } -////////////////////////////////////////////////////////////////////////// -/// @brief Offsets added to post-viewport vertex positions based on -/// raster state. -static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] = -{ - _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER - _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL -}; - ////////////////////////////////////////////////////////////////////////// /// @brief FE handler for SwrSync. /// @param pContext - pointer to SWR context. @@ -495,70 +484,6 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) return _simd_castps_si(vMask(mask)); } - -////////////////////////////////////////////////////////////////////////// -/// @brief Gather scissor rect data based on per-prim viewport indices. -/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point. -/// @param pViewportIndex - array of per-primitive vewport indexes. -/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data. -/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data. -/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data. -/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data. -// -/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. -template -struct GatherScissors -{ - static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, - simdscalari &scisXmin, simdscalari &scisYmin, - simdscalari &scisXmax, simdscalari &scisYmax) - { - SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather"); - } -}; - -template<> -struct GatherScissors<8> -{ - static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, - simdscalari &scisXmin, simdscalari &scisYmin, - simdscalari &scisXmax, simdscalari &scisYmax) - { - scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, - pScissorsInFixedPoint[pViewportIndex[1]].xmin, - pScissorsInFixedPoint[pViewportIndex[2]].xmin, - pScissorsInFixedPoint[pViewportIndex[3]].xmin, - pScissorsInFixedPoint[pViewportIndex[4]].xmin, - pScissorsInFixedPoint[pViewportIndex[5]].xmin, - pScissorsInFixedPoint[pViewportIndex[6]].xmin, - pScissorsInFixedPoint[pViewportIndex[7]].xmin); - scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, - pScissorsInFixedPoint[pViewportIndex[1]].ymin, - pScissorsInFixedPoint[pViewportIndex[2]].ymin, - pScissorsInFixedPoint[pViewportIndex[3]].ymin, - pScissorsInFixedPoint[pViewportIndex[4]].ymin, - pScissorsInFixedPoint[pViewportIndex[5]].ymin, - pScissorsInFixedPoint[pViewportIndex[6]].ymin, - pScissorsInFixedPoint[pViewportIndex[7]].ymin); - scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, - pScissorsInFixedPoint[pViewportIndex[1]].xmax, - pScissorsInFixedPoint[pViewportIndex[2]].xmax, - pScissorsInFixedPoint[pViewportIndex[3]].xmax, - pScissorsInFixedPoint[pViewportIndex[4]].xmax, - pScissorsInFixedPoint[pViewportIndex[5]].xmax, - pScissorsInFixedPoint[pViewportIndex[6]].xmax, - pScissorsInFixedPoint[pViewportIndex[7]].xmax); - scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, - pScissorsInFixedPoint[pViewportIndex[1]].ymax, - pScissorsInFixedPoint[pViewportIndex[2]].ymax, - pScissorsInFixedPoint[pViewportIndex[3]].ymax, - pScissorsInFixedPoint[pViewportIndex[4]].ymax, - pScissorsInFixedPoint[pViewportIndex[5]].ymax, - pScissorsInFixedPoint[pViewportIndex[6]].ymax, - pScissorsInFixedPoint[pViewportIndex[7]].ymax); - } -}; - ////////////////////////////////////////////////////////////////////////// /// @brief StreamOut - Streams vertex data out to SO buffers. /// Generally, we are only streaming out a SIMDs worth of triangles. @@ -1488,1320 +1413,4 @@ PFN_FE_WORK_FUNC GetProcessDrawFunc( bool HasRasterization) { return TemplateArgUnroller::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Processes attributes for the backend based on linkage mask and -/// linkage map. Essentially just doing an SOA->AOS conversion and pack. -/// @param pDC - Draw context -/// @param pa - Primitive Assembly state -/// @param linkageMask - Specifies which VS outputs are routed to PS. -/// @param pLinkageMap - maps VS attribute slot to PS slot -/// @param triIndex - Triangle to process attributes for -/// @param pBuffer - Output result -template -INLINE void ProcessAttributes( - DRAW_CONTEXT *pDC, - PA_STATE&pa, - uint32_t triIndex, - uint32_t primId, - float *pBuffer) -{ - static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT"); - const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; - // Conservative Rasterization requires degenerate tris to have constant attribute interpolation - LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask; - const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; - const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology; - - static const float constTable[3][4] = { - {0.0f, 0.0f, 0.0f, 0.0f}, - {0.0f, 0.0f, 0.0f, 1.0f}, - {1.0f, 1.0f, 1.0f, 1.0f} - }; - - for (uint32_t i = 0; i < backendState.numAttributes; ++i) - { - uint32_t inputSlot; - if (IsSwizzledT::value) - { - SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i]; - inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib; - - } - else - { - inputSlot = VERTEX_ATTRIB_START_SLOT + i; - } - - __m128 attrib[3]; // triangle attribs (always 4 wide) - float* pAttribStart = pBuffer; - - if (HasConstantInterpT::value || IsDegenerate::value) - { - if (_bittest(&constantInterpMask, i)) - { - uint32_t vid; - uint32_t adjustedTriIndex; - static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 }; - static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} }; - static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} }; - static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} }; - static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} }; - - switch (topo) { - case TOP_QUAD_LIST: - adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex]; - vid = quadProvokingVertex[triIndex & 1][provokingVertex]; - break; - case TOP_QUAD_STRIP: - adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex]; - vid = qstripProvokingVertex[triIndex & 1][provokingVertex]; - break; - case TOP_TRIANGLE_STRIP: - adjustedTriIndex = triIndex; - vid = (triIndex & 1) - ? tristripProvokingVertex[provokingVertex] - : provokingVertex; - break; - default: - adjustedTriIndex = triIndex; - vid = provokingVertex; - break; - } - - pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib); - - for (uint32_t i = 0; i < NumVertsT::value; ++i) - { - _mm_store_ps(pBuffer, attrib[vid]); - pBuffer += 4; - } - } - else - { - pa.AssembleSingle(inputSlot, triIndex, attrib); - - for (uint32_t i = 0; i < NumVertsT::value; ++i) - { - _mm_store_ps(pBuffer, attrib[i]); - pBuffer += 4; - } - } - } - else - { - pa.AssembleSingle(inputSlot, triIndex, attrib); - - for (uint32_t i = 0; i < NumVertsT::value; ++i) - { - _mm_store_ps(pBuffer, attrib[i]); - pBuffer += 4; - } - } - - // pad out the attrib buffer to 3 verts to ensure the triangle - // interpolation code in the pixel shader works correctly for the - // 3 topologies - point, line, tri. This effectively zeros out the - // effect of the missing vertices in the triangle interpolation. - for (uint32_t v = NumVertsT::value; v < 3; ++v) - { - _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]); - pBuffer += 4; - } - - // check for constant source overrides - if (IsSwizzledT::value) - { - uint32_t mask = backendState.swizzleMap[i].componentOverrideMask; - if (mask) - { - DWORD comp; - while (_BitScanForward(&comp, mask)) - { - mask &= ~(1 << comp); - - float constantValue = 0.0f; - switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource) - { - case SWR_CONSTANT_SOURCE_CONST_0000: - case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT: - case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT: - constantValue = constTable[backendState.swizzleMap[i].constantSource][comp]; - break; - case SWR_CONSTANT_SOURCE_PRIM_ID: - constantValue = *(float*)&primId; - break; - } - - // apply constant value to all 3 vertices - for (uint32_t v = 0; v < 3; ++v) - { - pAttribStart[comp + v * 4] = constantValue; - } - } - } - } - } -} - - -typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); - -struct ProcessAttributesChooser -{ - typedef PFN_PROCESS_ATTRIBUTES FuncType; - - template - static FuncType GetFunc() - { - return ProcessAttributes; - } -}; - -PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false) -{ - return TemplateArgUnroller::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Processes enabled user clip distances. Loads the active clip -/// distances from the PA, sets up barycentric equations, and -/// stores the results to the output buffer -/// @param pa - Primitive Assembly state -/// @param primIndex - primitive index to process -/// @param clipDistMask - mask of enabled clip distances -/// @param pUserClipBuffer - buffer to store results -template -void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer) -{ - DWORD clipDist; - while (_BitScanForward(&clipDist, clipDistMask)) - { - clipDistMask &= ~(1 << clipDist); - uint32_t clipSlot = clipDist >> 2; - uint32_t clipComp = clipDist & 0x3; - uint32_t clipAttribSlot = clipSlot == 0 ? - VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT; - - __m128 primClipDist[3]; - pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); - - float vertClipDist[NumVerts]; - for (uint32_t e = 0; e < NumVerts; ++e) - { - OSALIGNSIMD(float) aVertClipDist[4]; - _mm_store_ps(aVertClipDist, primClipDist[e]); - vertClipDist[e] = aVertClipDist[clipComp]; - }; - - // setup plane equations for barycentric interpolation in the backend - float baryCoeff[NumVerts]; - for (uint32_t e = 0; e < NumVerts - 1; ++e) - { - baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1]; - } - baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1]; - - for (uint32_t e = 0; e < NumVerts; ++e) - { - *(pUserClipBuffer++) = baryCoeff[e]; - } - } -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert the X,Y coords of a triangle to the requested Fixed -/// Point precision from FP32. -template > -INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn) -{ - simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value)); - return _simd_cvtps_epi32(vFixed); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Helper function to set the X,Y coords of a triangle to the -/// requested Fixed Point precision from FP32. -/// @param tri: simdvector[3] of FP triangle verts -/// @param vXi: fixed point X coords of tri verts -/// @param vYi: fixed point Y coords of tri verts -INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3]) -{ - vXi[0] = fpToFixedPointVertical(tri[0].x); - vYi[0] = fpToFixedPointVertical(tri[0].y); - vXi[1] = fpToFixedPointVertical(tri[1].x); - vYi[1] = fpToFixedPointVertical(tri[1].y); - vXi[2] = fpToFixedPointVertical(tri[2].x); - vYi[2] = fpToFixedPointVertical(tri[2].y); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Calculate bounding box for current triangle -/// @tparam CT: ConservativeRastFETraits type -/// @param vX: fixed point X position for triangle verts -/// @param vY: fixed point Y position for triangle verts -/// @param bbox: fixed point bbox -/// *Note*: expects vX, vY to be in the correct precision for the type -/// of rasterization. This avoids unnecessary FP->fixed conversions. -template -INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox) -{ - simdscalari vMinX = vX[0]; - vMinX = _simd_min_epi32(vMinX, vX[1]); - vMinX = _simd_min_epi32(vMinX, vX[2]); - - simdscalari vMaxX = vX[0]; - vMaxX = _simd_max_epi32(vMaxX, vX[1]); - vMaxX = _simd_max_epi32(vMaxX, vX[2]); - - simdscalari vMinY = vY[0]; - vMinY = _simd_min_epi32(vMinY, vY[1]); - vMinY = _simd_min_epi32(vMinY, vY[2]); - - simdscalari vMaxY = vY[0]; - vMaxY = _simd_max_epi32(vMaxY, vY[1]); - vMaxY = _simd_max_epi32(vMaxY, vY[2]); - - bbox.xmin = vMinX; - bbox.xmax = vMaxX; - bbox.ymin = vMinY; - bbox.ymax = vMaxY; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical -/// Offsets BBox for conservative rast -template <> -INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox) -{ - // FE conservative rast traits - typedef FEConservativeRastT CT; - - simdscalari vMinX = vX[0]; - vMinX = _simd_min_epi32(vMinX, vX[1]); - vMinX = _simd_min_epi32(vMinX, vX[2]); - - simdscalari vMaxX = vX[0]; - vMaxX = _simd_max_epi32(vMaxX, vX[1]); - vMaxX = _simd_max_epi32(vMaxX, vX[2]); - - simdscalari vMinY = vY[0]; - vMinY = _simd_min_epi32(vMinY, vY[1]); - vMinY = _simd_min_epi32(vMinY, vY[2]); - - simdscalari vMaxY = vY[0]; - vMaxY = _simd_max_epi32(vMaxY, vY[1]); - vMaxY = _simd_max_epi32(vMaxY, vY[2]); - - /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization - /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer. - bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping -/// culling, viewport transform, etc. -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param tri - Contains triangle position data for SIMDs worth of triangles. -/// @param primID - Primitive ID for each triangle. -/// @param viewportIdx - viewport array index for each triangle. -/// @tparam CT - ConservativeRastFETraits -template -void BinTriangles( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector tri[3], - uint32_t triMask, - simdscalari primID, - simdscalari viewportIdx) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(FEBinTriangles, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const SWR_FRONTEND_STATE& feState = state.frontendState; - const SWR_GS_STATE& gsState = state.gsState; - MacroTileMgr *pTileMgr = pDC->pTileMgr; - - - simdscalar vRecipW0 = _simd_set1_ps(1.0f); - simdscalar vRecipW1 = _simd_set1_ps(1.0f); - simdscalar vRecipW2 = _simd_set1_ps(1.0f); - - if (feState.vpTransformDisable) - { - // RHW is passed in directly when VP transform is disabled - vRecipW0 = tri[0].v[3]; - vRecipW1 = tri[1].v[3]; - vRecipW2 = tri[2].v[3]; - } - else - { - // Perspective divide - vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); - vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); - vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); - - tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); - tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); - tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); - - tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); - tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); - tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); - - tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); - tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); - tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); - - // Viewport transform to screen space coords - if (state.gsState.emitsViewportArrayIndex) - { - viewportTransform<3>(tri, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<3>(tri, state.vpMatrices); - } - } - - // Adjust for pixel center location - simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; - tri[0].x = _simd_add_ps(tri[0].x, offset); - tri[0].y = _simd_add_ps(tri[0].y, offset); - - tri[1].x = _simd_add_ps(tri[1].x, offset); - tri[1].y = _simd_add_ps(tri[1].y, offset); - - tri[2].x = _simd_add_ps(tri[2].x, offset); - tri[2].y = _simd_add_ps(tri[2].y, offset); - - simdscalari vXi[3], vYi[3]; - // Set vXi, vYi to required fixed point precision - FPToFixedPoint(tri, vXi, vYi); - - // triangle setup - simdscalari vAi[3], vBi[3]; - triangleSetupABIntVertical(vXi, vYi, vAi, vBi); - - // determinant - simdscalari vDet[2]; - calcDeterminantIntVertical(vAi, vBi, vDet); - - // cull zero area - int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); - int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); - - int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2)); - - uint32_t origTriMask = triMask; - // don't cull degenerate triangles if we're conservatively rasterizing - if(!CT::IsConservativeT::value) - { - triMask &= ~cullZeroAreaMask; - } - - // determine front winding tris - // CW +det - // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast - maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); - maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); - int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) ); - - uint32_t frontWindingTris; - if (rastState.frontWinding == SWR_FRONTWINDING_CW) - { - frontWindingTris = cwTriMask; - } - else - { - frontWindingTris = ~cwTriMask; - } - - // cull - uint32_t cullTris; - switch ((SWR_CULLMODE)rastState.cullMode) - { - case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; - case SWR_CULLMODE_NONE: cullTris = 0x0; break; - case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; - // 0 area triangles are marked as backfacing, which is required behavior for conservative rast - case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; - default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; - } - - triMask &= ~cullTris; - - if (origTriMask ^ triMask) - { - RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); - } - - /// Note: these variable initializations must stay above any 'goto endBenTriangles' - // compute per tri backface - uint32_t frontFaceMask = frontWindingTris; - uint32_t *pPrimID = (uint32_t *)&primID; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - DWORD triIndex = 0; - // for center sample pattern, all samples are at pixel center; calculate coverage - // once at center and broadcast the results in the backend - const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X; - uint32_t edgeEnable; - PFN_WORK_FUNC pfnWork; - if(CT::IsConservativeT::value) - { - // determine which edges of the degenerate tri, if any, are valid to rasterize. - // used to call the appropriate templated rasterizer function - if(cullZeroAreaMask > 0) - { - // e0 = v1-v0 - simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]); - simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]); - uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask))); - - // e1 = v2-v1 - simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]); - simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]); - uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask))); - - // e2 = v0-v2 - // if v0 == v1 & v1 == v2, v0 == v2 - uint32_t e2Mask = e0Mask & e1Mask; - SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512"); - - // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2 - // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001 - e0Mask = pdep_u32(e0Mask, 0x00249249); - // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010 - e1Mask = pdep_u32(e1Mask, 0x00492492); - // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100 - e2Mask = pdep_u32(e2Mask, 0x00924924); - - edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask))); - } - else - { - edgeEnable = 0x00FFFFFF; - } - } - else - { - // degenerate triangles won't be sent to rasterizer; just enable all edges - pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0), - (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID, - (state.scissorsTileAligned == false)); - } - - if (!triMask) - { - goto endBinTriangles; - } - - // Calc bounding box of triangles - simdBBox bbox; - calcBoundingBoxIntVertical(tri, vXi, vYi, bbox); - - // determine if triangle falls between pixel centers and discard - // only discard for non-MSAA case and when conservative rast is disabled - // (xmin + 127) & ~255 - // (xmax + 128) & ~255 - if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value)) - { - origTriMask = triMask; - - int cullCenterMask; - { - simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127)); - xmin = _simd_and_si(xmin, _simd_set1_epi32(~255)); - simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128)); - xmax = _simd_and_si(xmax, _simd_set1_epi32(~255)); - - simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax); - - simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127)); - ymin = _simd_and_si(ymin, _simd_set1_epi32(~255)); - simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128)); - ymax = _simd_and_si(ymax, _simd_set1_epi32(~255)); - - simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax); - vMaskV = _simd_or_si(vMaskH, vMaskV); - cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); - } - - triMask &= ~cullCenterMask; - - if(origTriMask ^ triMask) - { - RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); - } - } - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - // Gather the AOS effective scissor rects based on the per-prim VP index. - /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) - { - GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. - { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); - - if(CT::IsConservativeT::value) - { - // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has - // some area. Bump the xmax/ymax edges out - simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax); - bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom); - simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax); - bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight); - } - - // Cull tris completely outside scissor - { - simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); - simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); - simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); - triMask = triMask & ~maskOutsideScissor; - } - - if (!triMask) - { - goto endBinTriangles; - } - - // Convert triangle bbox to macrotile units. - bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); - _simd_store_si((simdscalari*)aMTRight, bbox.xmax); - _simd_store_si((simdscalari*)aMTTop, bbox.ymin); - _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); - - // transpose verts needed for backend - /// @todo modify BE to take non-transformed verts - __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; - vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); - vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); - vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); - vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); - - // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; - if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) - { - simdvector vRtai[3]; - pa.Assemble(VERTEX_RTAI_SLOT, vRtai); - simdscalari vRtaii; - vRtaii = _simd_castps_si(vRtai[0].x); - _simd_store_si((simdscalari*)aRTAI, vRtaii); - } - else - { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); - } - - // scan remaining valid triangles and bin each separately - while (_BitScanForward(&triIndex, triMask)) - { - uint32_t linkageCount = state.backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - bool isDegenerate; - if(CT::IsConservativeT::value) - { - // only rasterize valid edges if we have a degenerate primitive - int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; - work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0), - (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable, - (state.scissorsTileAligned == false)); - - // Degenerate triangles are required to be constant interpolated - isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false; - } - else - { - isDegenerate = false; - work.pfnWork = pfnWork; - } - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate); - - TRIANGLE_WORK_DESC &desc = work.desc.tri; - - desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); - desc.triFlags.primID = pPrimID[triIndex]; - desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; - desc.triFlags.viewportIndex = pViewportIndex[triIndex]; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store active attribs - float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.pAttribs = pAttribs; - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs); - - // store triangle vertex data - desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - - _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); - _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); - _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); - _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); - - // store user clip distances - if (rastState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); - ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); - } - - for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) - { - for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(x, y, &work); - } - } - } - triMask &= ~(1 << triIndex); - } - -endBinTriangles: - AR_END(FEBinTriangles, 1); -} - -struct FEBinTrianglesChooser -{ - typedef PFN_PROCESS_PRIMS FuncType; - - template - static FuncType GetFunc() - { - return BinTriangles>; - } -}; - -// Selector for correct templated BinTrinagles function -PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) -{ - return TemplateArgUnroller::GetFunc(IsConservative); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Bin SIMD points to the backend. Only supports point size of 1 -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param tri - Contains point position data for SIMDs worth of points. -/// @param primID - Primitive ID for each point. -void BinPoints( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prim[3], - uint32_t primMask, - simdscalari primID, - simdscalari viewportIdx) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(FEBinPoints, pDC->drawId); - - simdvector& primVerts = prim[0]; - - const API_STATE& state = GetApiState(pDC); - const SWR_FRONTEND_STATE& feState = state.frontendState; - const SWR_GS_STATE& gsState = state.gsState; - const SWR_RASTSTATE& rastState = state.rastState; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - - if (!feState.vpTransformDisable) - { - // perspective divide - simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); - primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); - primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); - primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); - - // viewport transform to screen coords - if (state.gsState.emitsViewportArrayIndex) - { - viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<1>(&primVerts, state.vpMatrices); - } - } - - // adjust for pixel center location - simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; - primVerts.x = _simd_add_ps(primVerts.x, offset); - primVerts.y = _simd_add_ps(primVerts.y, offset); - - // convert to fixed point - simdscalari vXi, vYi; - vXi = fpToFixedPointVertical(primVerts.x); - vYi = fpToFixedPointVertical(primVerts.y); - - if (CanUseSimplePoints(pDC)) - { - // adjust for ymin-xmin rule - vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); - vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); - - // cull points off the ymin-xmin edge of the viewport - primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); - primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); - - // compute macro tile coordinates - simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMacroX, macroX); - _simd_store_si((simdscalari*)aMacroY, macroY); - - // compute raster tile coordinates - simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); - - // compute raster tile relative x,y for coverage mask - simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); - simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); - - simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); - simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); - - OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); - _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); - - OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); - _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); - - OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; - _simd_store_ps((float*)aZ, primVerts.z); - - // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; - if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) - { - simdvector vRtai; - pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); - simdscalari vRtaii = _simd_castps_si(vRtai.x); - _simd_store_si((simdscalari*)aRTAI, vRtaii); - } - else - { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); - } - - uint32_t *pPrimID = (uint32_t *)&primID; - DWORD primIndex = 0; - - const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; - - // scan remaining valid triangles and bin each separately - while (_BitScanForward(&primIndex, primMask)) - { - uint32_t linkageCount = backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - TRIANGLE_WORK_DESC &desc = work.desc.tri; - - // points are always front facing - desc.triFlags.frontFacing = 1; - desc.triFlags.primID = pPrimID[primIndex]; - desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; - - work.pfnWork = RasterizeSimplePoint; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store attributes - float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); - desc.pAttribs = pAttribs; - desc.numAttribs = linkageCount; - - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs); - - // store raster tile aligned x, y, perspective correct z - float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); - desc.pTriBuffer = pTriBuffer; - *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; - *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; - *pTriBuffer = aZ[primIndex]; - - uint32_t tX = aTileRelativeX[primIndex]; - uint32_t tY = aTileRelativeY[primIndex]; - - // pack the relative x,y into the coverageMask, the rasterizer will - // generate the true coverage mask from it - work.desc.tri.triFlags.coverageMask = tX | (tY << 4); - - // bin it - MacroTileMgr *pTileMgr = pDC->pTileMgr; -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); - } - primMask &= ~(1 << primIndex); - } - } - else - { - // non simple points need to be potentially binned to multiple macro tiles - simdscalar vPointSize; - if (rastState.pointParam) - { - simdvector size[3]; - pa.Assemble(VERTEX_POINT_SIZE_SLOT, size); - vPointSize = size[0].x; - } - else - { - vPointSize = _simd_set1_ps(rastState.pointSize); - } - - // bloat point to bbox - simdBBox bbox; - bbox.xmin = bbox.xmax = vXi; - bbox.ymin = bbox.ymax = vYi; - - simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); - simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); - bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); - bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); - bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); - bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - // Gather the AOS effective scissor rects based on the per-prim VP index. - /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) - { - GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. - { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); - - // Cull bloated points completely outside scissor - simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); - simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); - simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); - primMask = primMask & ~maskOutsideScissor; - - // Convert bbox to macrotile units. - bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); - _simd_store_si((simdscalari*)aMTRight, bbox.xmax); - _simd_store_si((simdscalari*)aMTTop, bbox.ymin); - _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); - - // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; - if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) - { - simdvector vRtai[2]; - pa.Assemble(VERTEX_RTAI_SLOT, vRtai); - simdscalari vRtaii = _simd_castps_si(vRtai[0].x); - _simd_store_si((simdscalari*)aRTAI, vRtaii); - } - else - { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); - } - - OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; - _simd_store_ps((float*)aPointSize, vPointSize); - - uint32_t *pPrimID = (uint32_t *)&primID; - - OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; - - _simd_store_ps((float*)aPrimVertsX, primVerts.x); - _simd_store_ps((float*)aPrimVertsY, primVerts.y); - _simd_store_ps((float*)aPrimVertsZ, primVerts.z); - - // scan remaining valid prims and bin each separately - const SWR_BACKEND_STATE& backendState = state.backendState; - DWORD primIndex; - while (_BitScanForward(&primIndex, primMask)) - { - uint32_t linkageCount = backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - TRIANGLE_WORK_DESC &desc = work.desc.tri; - - desc.triFlags.frontFacing = 1; - desc.triFlags.primID = pPrimID[primIndex]; - desc.triFlags.pointSize = aPointSize[primIndex]; - desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; - - work.pfnWork = RasterizeTriPoint; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store active attribs - desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); - - // store point vertex data - float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); - desc.pTriBuffer = pTriBuffer; - *pTriBuffer++ = aPrimVertsX[primIndex]; - *pTriBuffer++ = aPrimVertsY[primIndex]; - *pTriBuffer = aPrimVertsZ[primIndex]; - - // store user clip distances - if (rastState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); - ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); - } - - MacroTileMgr *pTileMgr = pDC->pTileMgr; - for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) - { - for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(x, y, &work); - } - } - } - - primMask &= ~(1 << primIndex); - } - } - - AR_END(FEBinPoints, 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Bin SIMD lines to the backend. -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param tri - Contains line position data for SIMDs worth of points. -/// @param primID - Primitive ID for each line. -/// @param viewportIdx - Viewport Array Index for each line. -void BinLines( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prim[], - uint32_t primMask, - simdscalari primID, - simdscalari viewportIdx) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(FEBinLines, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const SWR_FRONTEND_STATE& feState = state.frontendState; - const SWR_GS_STATE& gsState = state.gsState; - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - - simdscalar vRecipW0 = _simd_set1_ps(1.0f); - simdscalar vRecipW1 = _simd_set1_ps(1.0f); - - if (!feState.vpTransformDisable) - { - // perspective divide - vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); - vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); - - prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0); - prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1); - - prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0); - prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1); - - prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0); - prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); - - // viewport transform to screen coords - if (state.gsState.emitsViewportArrayIndex) - { - viewportTransform<2>(prim, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<2>(prim, state.vpMatrices); - } - } - - // adjust for pixel center location - simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; - prim[0].x = _simd_add_ps(prim[0].x, offset); - prim[0].y = _simd_add_ps(prim[0].y, offset); - - prim[1].x = _simd_add_ps(prim[1].x, offset); - prim[1].y = _simd_add_ps(prim[1].y, offset); - - // convert to fixed point - simdscalari vXi[2], vYi[2]; - vXi[0] = fpToFixedPointVertical(prim[0].x); - vYi[0] = fpToFixedPointVertical(prim[0].y); - vXi[1] = fpToFixedPointVertical(prim[1].x); - vYi[1] = fpToFixedPointVertical(prim[1].y); - - // compute x-major vs y-major mask - simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); - simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); - simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); - uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); - - // cull zero-length lines - simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); - vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); - - primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); - - uint32_t *pPrimID = (uint32_t *)&primID; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - - simdscalar vUnused = _simd_setzero_ps(); - - // Calc bounding box of lines - simdBBox bbox; - bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]); - bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]); - bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]); - bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]); - - // bloat bbox by line width along minor axis - simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); - simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); - simdBBox bloatBox; - bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); - bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); - bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); - bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); - - bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); - bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask); - bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); - bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) - { - GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. - { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); - - // Cull prims completely outside scissor - { - simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); - simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); - simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); - primMask = primMask & ~maskOutsideScissor; - } - - if (!primMask) - { - goto endBinLines; - } - - // Convert triangle bbox to macrotile units. - bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); - _simd_store_si((simdscalari*)aMTRight, bbox.xmax); - _simd_store_si((simdscalari*)aMTTop, bbox.ymin); - _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); - - // transpose verts needed for backend - /// @todo modify BE to take non-transformed verts - __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; - vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); - vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); - vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); - vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); - - // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; - if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) - { - simdvector vRtai[2]; - pa.Assemble(VERTEX_RTAI_SLOT, vRtai); - simdscalari vRtaii = _simd_castps_si(vRtai[0].x); - _simd_store_si((simdscalari*)aRTAI, vRtaii); - } - else - { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); - } - - // scan remaining valid prims and bin each separately - DWORD primIndex; - while (_BitScanForward(&primIndex, primMask)) - { - uint32_t linkageCount = state.backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - TRIANGLE_WORK_DESC &desc = work.desc.tri; - - desc.triFlags.frontFacing = 1; - desc.triFlags.primID = pPrimID[primIndex]; - desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; - desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; - - work.pfnWork = RasterizeLine; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store active attribs - desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); - - // store line vertex data - desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); - _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); - _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); - _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); - - // store user clip distances - if (rastState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); - ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); - } - - MacroTileMgr *pTileMgr = pDC->pTileMgr; - for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) - { - for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(x, y, &work); - } - } - } - - primMask &= ~(1 << primIndex); - } - -endBinLines: - - AR_END(FEBinLines, 1); -} +} \ No newline at end of file -- 2.30.2