From: Tim Rowley Date: Thu, 18 Aug 2016 15:56:15 +0000 (-0500) Subject: swr: [rasterizer core] per-primitive viewports/scissors X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b473bec87878fd52eef8ba1ffbc9cf11dc00dc0f;p=mesa.git swr: [rasterizer core] per-primitive viewports/scissors - use per-primitive viewports throughout the pipeline. - track whether all available scissor rects are tile aligned. Causes failures, so not taken into account when choosing rasterizer yet. Signed-off-by: Tim Rowley --- diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index d53a6cbedda..5369c21250a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -727,34 +727,52 @@ void SwrSetScissorRects( void SetupMacroTileScissors(DRAW_CONTEXT *pDC) { API_STATE *pState = &pDC->pState->state; + uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; + pState->scissorsTileAligned = true; - // Set up scissor dimensions based on scissor or viewport - if (pState->rastState.scissorEnable) + for (uint32_t index = 0; index < numScissors; ++index) { - pState->scissorInFixedPoint = pState->scissorRects[0]; - } - else - { - // the vp width and height must be added to origin un-rounded then the result round to -inf. - // The cast to int works for rounding assuming all [left, right, top, bottom] are positive. 
- pState->scissorInFixedPoint.xmin = (int32_t)pState->vp[0].x; - pState->scissorInFixedPoint.xmax = (int32_t)(pState->vp[0].x + pState->vp[0].width); - pState->scissorInFixedPoint.ymin = (int32_t)pState->vp[0].y; - pState->scissorInFixedPoint.ymax = (int32_t)(pState->vp[0].y + pState->vp[0].height); - } + SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index]; - // Clamp to max rect - pState->scissorInFixedPoint &= g_MaxScissorRect; + // Set up scissor dimensions based on scissor or viewport + if (pState->rastState.scissorEnable) + { + scissorInFixedPoint = pState->scissorRects[index]; + } + else + { + // the vp width and height must be added to origin un-rounded then the result round to -inf. + // The cast to int works for rounding assuming all [left, right, top, bottom] are positive. + scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x; + scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width); + scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y; + scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height); + } + + // Clamp to max rect + scissorInFixedPoint &= g_MaxScissorRect; + + // Test for tile alignment + bool tileAligned; + tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0; + tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0; + tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0; + tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0; + + pState->scissorsTileAligned &= tileAligned; - // Scale to fixed point - pState->scissorInFixedPoint.xmin *= FIXED_POINT_SCALE; - pState->scissorInFixedPoint.xmax *= FIXED_POINT_SCALE; - pState->scissorInFixedPoint.ymin *= FIXED_POINT_SCALE; - pState->scissorInFixedPoint.ymax *= FIXED_POINT_SCALE; + // Scale to fixed point + scissorInFixedPoint.xmin *= FIXED_POINT_SCALE; + scissorInFixedPoint.xmax *= FIXED_POINT_SCALE; + scissorInFixedPoint.ymin *= FIXED_POINT_SCALE; + scissorInFixedPoint.ymax 
*= FIXED_POINT_SCALE; - // Make scissor inclusive - pState->scissorInFixedPoint.xmax -= 1; - pState->scissorInFixedPoint.ymax -= 1; + // Make scissor inclusive + scissorInFixedPoint.xmax -= 1; + scissorInFixedPoint.ymax -= 1; + } + + } // templated backend function tables diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 1e4dca2fe25..7dd6c0db3de 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -493,14 +493,14 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(T::bCanEarlyZ) { RDTSC_START(BEEarlyDepthTest); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, - psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, + psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); RDTSC_STOP(BEEarlyDepthTest, 0, 0); // early-exit if no pixels passed depth or earlyZ is forced on if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask)) { - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); if (!_simd_movemask_ps(depthPassMask)) @@ -525,14 +525,14 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(!T::bCanEarlyZ) { RDTSC_START(BELateDepthTest); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); RDTSC_STOP(BELateDepthTest, 0, 0); if(!_simd_movemask_ps(depthPassMask)) { 
// need to call depth/stencil write for stencil write - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); goto Endtile; } @@ -549,7 +549,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 // do final depth write after all pixel kills if (!pPSState->forceEarlyZ) { - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); } RDTSC_STOP(BEOutputMerger, 0, 0); @@ -712,14 +712,14 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ if (T::bCanEarlyZ) { RDTSC_START(BEEarlyDepthTest); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); RDTSC_STOP(BEEarlyDepthTest, 0, 0); // early-exit if no samples passed depth or earlyZ is forced on. 
if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask)) { - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); if (!_simd_movemask_ps(depthPassMask)) @@ -745,14 +745,14 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ if (!T::bCanEarlyZ) { RDTSC_START(BELateDepthTest); - depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); RDTSC_STOP(BELateDepthTest, 0, 0); if (!_simd_movemask_ps(depthPassMask)) { // need to call depth/stencil write for stencil write - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); @@ -771,7 +771,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ // do final depth write after all pixel kills if (!pPSState->forceEarlyZ) { - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); } RDTSC_STOP(BEOutputMerger, 0, 0); @@ -984,7 +984,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t uint8_t *pDepthSample = pDepthBase + 
RasterTileDepthOffset(sample); uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample); - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); } RDTSC_STOP(BEOutputMerger, 0, 0); @@ -1093,9 +1093,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample); RDTSC_START(BEEarlyDepthTest); - simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, + simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); RDTSC_STOP(BEEarlyDepthTest, 0, 0); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 27851a1156c..fde5a3f8d9f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -491,14 +491,15 @@ struct PixelRateZTestLoop RDTSC_START(BEDepthBucket); depthPassMask[sample] = vCoverageMask[sample]; stencilPassMask[sample] = vCoverageMask[sample]; - depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample, - vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]); + depthPassMask[sample] = 
DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, + vZ[sample], pDepthSample, vCoverageMask[sample], + pStencilSample, &stencilPassMask[sample]); RDTSC_STOP(BEDepthBucket, 0, 0); // early-exit if no pixels passed depth or earlyZ is forced on if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) { - DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], + DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); if(!_simd_movemask_ps(depthPassMask[sample])) diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 81820530024..c311cb8cab0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -63,6 +63,7 @@ struct TRI_FLAGS float pointSize; uint32_t primID; uint32_t renderTargetArrayIndex; + uint32_t viewportIndex; }; ////////////////////////////////////////////////////////////////////////// @@ -274,7 +275,8 @@ OSALIGNLINE(struct) API_STATE SWR_VIEWPORT_MATRICES vpMatrices; SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; - SWR_RECT scissorInFixedPoint; + SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS]; + bool scissorsTileAligned; // Backend state SWR_BACKEND_STATE backendState; diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h index 7b55580bf0a..590c569030a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -117,14 +117,14 @@ simdscalar QuantizeDepth(simdscalar depth) INLINE simdscalar DepthStencilTest(const API_STATE* pState, - bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t 
*pStencilBase, - simdscalar* pStencilMask) + bool frontFacing, uint32_t viewportIndex, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, + uint8_t *pStencilBase, simdscalar* pStencilMask) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState; - const SWR_VIEWPORT* pViewport = &pState->vp[0]; + const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex]; simdscalar depthResult = _simd_set1_ps(-1.0f); simdscalar zbuf; diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 04c62adbc5a..a49ec7a9fbb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -465,6 +465,70 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) return _simd_castps_si(vMask(mask)); } + +////////////////////////////////////////////////////////////////////////// +/// @brief Gather scissor rect data based on per-prim viewport indices. +/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point. +/// @param pViewportIndex - array of per-primitive viewport indexes. +/// @param scisXmin - output vector of per-primitive scissor rect Xmin data. +/// @param scisYmin - output vector of per-primitive scissor rect Ymin data. +/// @param scisXmax - output vector of per-primitive scissor rect Xmax data. +/// @param scisYmax - output vector of per-primitive scissor rect Ymax data. +// +/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. 
+template<size_t SimdWidth> +struct GatherScissors +{ + static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, + simdscalari &scisXmin, simdscalari &scisYmin, + simdscalari &scisXmax, simdscalari &scisYmax) + { + SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather"); + } +}; + +template<> +struct GatherScissors<8> +{ + static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, + simdscalari &scisXmin, simdscalari &scisYmin, + simdscalari &scisXmax, simdscalari &scisYmax) + { + scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, + pScissorsInFixedPoint[pViewportIndex[1]].xmin, + pScissorsInFixedPoint[pViewportIndex[2]].xmin, + pScissorsInFixedPoint[pViewportIndex[3]].xmin, + pScissorsInFixedPoint[pViewportIndex[4]].xmin, + pScissorsInFixedPoint[pViewportIndex[5]].xmin, + pScissorsInFixedPoint[pViewportIndex[6]].xmin, + pScissorsInFixedPoint[pViewportIndex[7]].xmin); + scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, + pScissorsInFixedPoint[pViewportIndex[1]].ymin, + pScissorsInFixedPoint[pViewportIndex[2]].ymin, + pScissorsInFixedPoint[pViewportIndex[3]].ymin, + pScissorsInFixedPoint[pViewportIndex[4]].ymin, + pScissorsInFixedPoint[pViewportIndex[5]].ymin, + pScissorsInFixedPoint[pViewportIndex[6]].ymin, + pScissorsInFixedPoint[pViewportIndex[7]].ymin); + scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, + pScissorsInFixedPoint[pViewportIndex[1]].xmax, + pScissorsInFixedPoint[pViewportIndex[2]].xmax, + pScissorsInFixedPoint[pViewportIndex[3]].xmax, + pScissorsInFixedPoint[pViewportIndex[4]].xmax, + pScissorsInFixedPoint[pViewportIndex[5]].xmax, + pScissorsInFixedPoint[pViewportIndex[6]].xmax, + pScissorsInFixedPoint[pViewportIndex[7]].xmax); + scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, + pScissorsInFixedPoint[pViewportIndex[1]].ymax, + pScissorsInFixedPoint[pViewportIndex[2]].ymax, + 
pScissorsInFixedPoint[pViewportIndex[3]].ymax, + pScissorsInFixedPoint[pViewportIndex[4]].ymax, + pScissorsInFixedPoint[pViewportIndex[5]].ymax, + pScissorsInFixedPoint[pViewportIndex[6]].ymax, + pScissorsInFixedPoint[pViewportIndex[7]].ymax); + } +}; + ////////////////////////////////////////////////////////////////////////// /// @brief StreamOut - Streams vertex data out to SO buffers. /// Generally, we are only streaming out a SIMDs worth of triangles. @@ -1849,6 +1913,7 @@ void BinTriangles( // compute per tri backface uint32_t frontFaceMask = frontWindingTris; uint32_t *pPrimID = (uint32_t *)&primID; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; DWORD triIndex = 0; // for center sample pattern, all samples are at pixel center; calculate coverage // once at center and broadcast the results in the backend @@ -1944,10 +2009,26 @@ void BinTriangles( } // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin)); - bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin)); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax)); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax)); + // Gather the AOS effective scissor rects based on the per-prim VP index. + /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. 
+ { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); if(CT::IsConservativeT::value) { @@ -2044,7 +2125,8 @@ void BinTriangles( desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); desc.triFlags.primID = pPrimID[triIndex]; desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; - + desc.triFlags.viewportIndex = pViewportIndex[triIndex]; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); @@ -2130,6 +2212,7 @@ void BinPoints( const SWR_FRONTEND_STATE& feState = state.frontendState; const SWR_GS_STATE& gsState = state.gsState; const SWR_RASTSTATE& rastState = state.rastState; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; // Select attribute processor PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, @@ -2240,6 +2323,7 @@ void BinPoints( desc.triFlags.frontFacing = 1; desc.triFlags.primID = pPrimID[primIndex]; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeSimplePoint; @@ -2306,10 +2390,26 @@ void BinPoints( bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. 
- bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin)); - bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin)); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax)); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax)); + // Gather the AOS effective scissor rects based on the per-prim VP index. + /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); // Cull bloated points completely outside scissor simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); @@ -2374,6 +2474,7 @@ void BinPoints( desc.triFlags.primID = pPrimID[primIndex]; desc.triFlags.pointSize = aPointSize[primIndex]; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeTriPoint; @@ -2431,6 +2532,7 @@ void BinPoints( /// @param workerId - thread's worker id. Even thread has a unique id. 
/// @param tri - Contains line position data for SIMDs worth of points. /// @param primID - Primitive ID for each line. +/// @param viewportIdx - Viewport Array Index for each line. void BinLines( DRAW_CONTEXT *pDC, PA_STATE& pa, @@ -2508,6 +2610,7 @@ void BinLines( primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); uint32_t *pPrimID = (uint32_t *)&primID; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; simdscalar vUnused = _simd_setzero_ps(); @@ -2533,10 +2636,24 @@ void BinLines( bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin)); - bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin)); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax)); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax)); + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. 
+ { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); // Cull prims completely outside scissor { @@ -2602,6 +2719,7 @@ void BinLines( desc.triFlags.primID = pPrimID[primIndex]; desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeLine; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 9a8d062818d..66283e340d6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -967,20 +967,22 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, OSALIGNSIMD(SWR_RECT) bbox; calcBoundingBoxInt(vXi, vYi, bbox); + const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; + if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) { // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++; - SWR_ASSERT(state.scissorInFixedPoint.xmin >= 0 && state.scissorInFixedPoint.ymin >= 0, + SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0, "Conservative rast degenerate handling requires a valid scissor rect"); } // Intersect with scissor/viewport OSALIGNSIMD(SWR_RECT) intersect; - intersect.xmin = std::max(bbox.xmin, 
state.scissorInFixedPoint.xmin); - intersect.xmax = std::min(bbox.xmax - 1, state.scissorInFixedPoint.xmax); - intersect.ymin = std::max(bbox.ymin, state.scissorInFixedPoint.ymin); - intersect.ymax = std::min(bbox.ymax - 1, state.scissorInFixedPoint.ymax); + intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin); + intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax); + intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin); + intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax); triDesc.triFlags = workDesc.triFlags; @@ -1087,7 +1089,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // Compute and store triangle edge data if scissor needs to rasterized ComputeScissorEdges - (bbox, state.scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); + (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile // used to for testing if entire raster tile is inside a triangle @@ -1573,6 +1575,8 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; + const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; + // create a copy of the triangle buffer to write our adjusted vertices to OSALIGNSIMD(float) newTriBuffer[4 * 4]; TRIANGLE_WORK_DESC newWorkDesc = workDesc; @@ -1667,13 +1671,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi calcBoundingBoxInt(vXai, vYai, bboxA); if (!(bboxA.xmin > macroBoxRight || - bboxA.xmin > state.scissorInFixedPoint.xmax || + bboxA.xmin > scissorInFixedPoint.xmax || bboxA.xmax - 1 < macroBoxLeft || - bboxA.xmax - 1 < state.scissorInFixedPoint.xmin || + bboxA.xmax - 1 < scissorInFixedPoint.xmin || bboxA.ymin > macroBoxBottom || - bboxA.ymin > 
state.scissorInFixedPoint.ymax || + bboxA.ymin > scissorInFixedPoint.ymax || bboxA.ymax - 1 < macroBoxTop || - bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) { + bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { // rasterize triangle pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); } @@ -1740,13 +1744,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi calcBoundingBoxInt(vXai, vYai, bboxA); if (!(bboxA.xmin > macroBoxRight || - bboxA.xmin > state.scissorInFixedPoint.xmax || + bboxA.xmin > scissorInFixedPoint.xmax || bboxA.xmax - 1 < macroBoxLeft || - bboxA.xmax - 1 < state.scissorInFixedPoint.xmin || + bboxA.xmax - 1 < scissorInFixedPoint.xmin || bboxA.ymin > macroBoxBottom || - bboxA.ymin > state.scissorInFixedPoint.ymax || + bboxA.ymin > scissorInFixedPoint.ymax || bboxA.ymax - 1 < macroBoxTop || - bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) { + bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { // rasterize triangle pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); }