swr: [rasterizer core] per-primitive viewports/scissors
authorTim Rowley <timothy.o.rowley@intel.com>
Thu, 18 Aug 2016 15:56:15 +0000 (10:56 -0500)
committerTim Rowley <timothy.o.rowley@intel.com>
Mon, 29 Aug 2016 17:41:16 +0000 (12:41 -0500)
- use per-primitive viewports throughout the pipeline.
- track whether all available scissor rects are tile aligned.
  Causes failures, so not taken into account when choosing rasterizer yet.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/depthstencil.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp

index d53a6cbeddaae032e9eda7c34a03c6b0ba514708..5369c21250a406e0b6c92d3c455f9c1d6719a4c3 100644 (file)
@@ -727,34 +727,52 @@ void SwrSetScissorRects(
 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
 {
     API_STATE *pState = &pDC->pState->state;
+    uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    pState->scissorsTileAligned = true;
 
-    // Set up scissor dimensions based on scissor or viewport
-    if (pState->rastState.scissorEnable)
+    for (uint32_t index = 0; index < numScissors; ++index)
     {
-        pState->scissorInFixedPoint = pState->scissorRects[0];
-    }
-    else
-    {
-        // the vp width and height must be added to origin un-rounded then the result round to -inf.
-        // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
-        pState->scissorInFixedPoint.xmin = (int32_t)pState->vp[0].x;
-        pState->scissorInFixedPoint.xmax = (int32_t)(pState->vp[0].x + pState->vp[0].width);
-        pState->scissorInFixedPoint.ymin = (int32_t)pState->vp[0].y;
-        pState->scissorInFixedPoint.ymax = (int32_t)(pState->vp[0].y + pState->vp[0].height);
-    }
+        SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index];
 
-    // Clamp to max rect
-    pState->scissorInFixedPoint &= g_MaxScissorRect;
+        // Set up scissor dimensions based on scissor or viewport
+        if (pState->rastState.scissorEnable)
+        {
+            scissorInFixedPoint = pState->scissorRects[index];
+        }
+        else
+        {
+            // the vp width and height must be added to origin un-rounded then the result round to -inf.
+            // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
+            scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
+            scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
+            scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
+            scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
+        }
+
+        // Clamp to max rect
+        scissorInFixedPoint &= g_MaxScissorRect;
+
+        // Test for tile alignment
+        bool tileAligned;
+        tileAligned  = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
+        tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
+        tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
+        tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_Y_DIM) == 0;
+
+        pState->scissorsTileAligned &= tileAligned;
 
-    // Scale to fixed point
-    pState->scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
-    pState->scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
-    pState->scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
-    pState->scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
+        // Scale to fixed point
+        scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
+        scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
+        scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
+        scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
 
-    // Make scissor inclusive
-    pState->scissorInFixedPoint.xmax -= 1;
-    pState->scissorInFixedPoint.ymax -= 1;
+        // Make scissor inclusive
+        scissorInFixedPoint.xmax -= 1;
+        scissorInFixedPoint.ymax -= 1;
+    }
+
+    
 }
 
 // templated backend function tables
index 1e4dca2fe25418c5347ceaa2e380162f387af949..7dd6c0db3de2d0d16b95c5b55206607565312d99 100644 (file)
@@ -493,14 +493,14 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 if(T::bCanEarlyZ)
                 {
                     RDTSC_START(BEEarlyDepthTest);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
-                                                        psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                                                     psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
                     RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
                     // early-exit if no pixels passed depth or earlyZ is forced on
                     if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                     {
-                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                             pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
 
                         if (!_simd_movemask_ps(depthPassMask))
@@ -525,14 +525,14 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 if(!T::bCanEarlyZ)
                 {
                     RDTSC_START(BELateDepthTest);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                         psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
                     RDTSC_STOP(BELateDepthTest, 0, 0);
 
                     if(!_simd_movemask_ps(depthPassMask))
                     {
                         // need to call depth/stencil write for stencil write
-                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                             pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
                         goto Endtile;
                     }
@@ -549,7 +549,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 // do final depth write after all pixel kills
                 if (!pPSState->forceEarlyZ)
                 {
-                    DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
                 }
                 RDTSC_STOP(BEOutputMerger, 0, 0);
@@ -712,14 +712,14 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     if (T::bCanEarlyZ)
                     {
                         RDTSC_START(BEEarlyDepthTest);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
                         // early-exit if no samples passed depth or earlyZ is forced on.
                         if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                         {
-                            DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
 
                             if (!_simd_movemask_ps(depthPassMask))
@@ -745,14 +745,14 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     if (!T::bCanEarlyZ)
                     {
                         RDTSC_START(BELateDepthTest);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         RDTSC_STOP(BELateDepthTest, 0, 0);
 
                         if (!_simd_movemask_ps(depthPassMask))
                         {
                             // need to call depth/stencil write for stencil write
-                            DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                 pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
 
                             work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
@@ -771,7 +771,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     // do final depth write after all pixel kills
                     if (!pPSState->forceEarlyZ)
                     {
-                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                             pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                     }
                     RDTSC_STOP(BEOutputMerger, 0, 0);
@@ -984,7 +984,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                     uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
                     uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
-                    DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
                                       pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
                 }
                 RDTSC_STOP(BEOutputMerger, 0, 0);        
@@ -1093,9 +1093,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
                     RDTSC_START(BEEarlyDepthTest);
-                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
+                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                    DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                     RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
index 27851a1156c3fee84c47f162d9039d2a2fa1a72a..fde5a3f8d9fa326bf2ba126fc4109cd16b40f8ec 100644 (file)
@@ -491,14 +491,15 @@ struct PixelRateZTestLoop
             RDTSC_START(BEDepthBucket);
             depthPassMask[sample] = vCoverageMask[sample];
             stencilPassMask[sample] = vCoverageMask[sample];
-            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample, 
-                                                     vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
+            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                                                     vZ[sample], pDepthSample, vCoverageMask[sample], 
+                                                     pStencilSample, &stencilPassMask[sample]);
             RDTSC_STOP(BEDepthBucket, 0, 0);
 
             // early-exit if no pixels passed depth or earlyZ is forced on
             if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
             {
-                DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
+                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
                                   pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
 
                 if(!_simd_movemask_ps(depthPassMask[sample]))
index 818205300246d8898f2e0e7da73327e14ff742e4..c311cb8cab0e52f2a16c1931984e3441d69eab8b 100644 (file)
@@ -63,6 +63,7 @@ struct TRI_FLAGS
     float pointSize;
     uint32_t primID;
     uint32_t renderTargetArrayIndex;
+    uint32_t viewportIndex;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -274,7 +275,8 @@ OSALIGNLINE(struct) API_STATE
     SWR_VIEWPORT_MATRICES   vpMatrices;
 
     SWR_RECT                scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
-    SWR_RECT                scissorInFixedPoint;
+    SWR_RECT                scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
+    bool                    scissorsTileAligned;
 
     // Backend state
     SWR_BACKEND_STATE       backendState;
index 7b55580bf0ae689e7074945da1f3485b740e1acf..590c569030a73b897699890047ab3e21964fc54e 100644 (file)
@@ -117,14 +117,14 @@ simdscalar QuantizeDepth(simdscalar depth)
 
 INLINE
 simdscalar DepthStencilTest(const API_STATE* pState,
-                 bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
-                 simdscalar* pStencilMask)
+                 bool frontFacing, uint32_t viewportIndex, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask,
+                 uint8_t *pStencilBase, simdscalar* pStencilMask)
 {
     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
     static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
 
     const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
-    const SWR_VIEWPORT* pViewport = &pState->vp[0];
+    const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex];
 
     simdscalar depthResult = _simd_set1_ps(-1.0f);
     simdscalar zbuf;
index 04c62adbc5a2afd2eafaa4e1f06282749c66d2f0..a49ec7a9fbb2f2f7e57dc32664ccffa2b7c20a6a 100644 (file)
@@ -465,6 +465,70 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
     return _simd_castps_si(vMask(mask));
 }
 
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief  Gather scissor rect data based on per-prim viewport indices.
+/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
+/// @param pViewportIndex - array of per-primitive vewport indexes.
+/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
+/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
+/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
+/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
+//
+/// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
+template<size_t SimdWidth>
+struct GatherScissors
+{
+    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
+        simdscalari &scisXmin, simdscalari &scisYmin,
+        simdscalari &scisXmax, simdscalari &scisYmax)
+    {
+        SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
+    }
+};
+
+template<>
+struct GatherScissors<8>
+{
+    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
+        simdscalari &scisXmin, simdscalari &scisYmin,
+        simdscalari &scisXmax, simdscalari &scisYmax)
+    {
+        scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
+            pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+            pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+            pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+            pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+            pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+            pScissorsInFixedPoint[pViewportIndex[6]].xmin,
+            pScissorsInFixedPoint[pViewportIndex[7]].xmin);
+        scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
+            pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+            pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+            pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+            pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+            pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+            pScissorsInFixedPoint[pViewportIndex[6]].ymin,
+            pScissorsInFixedPoint[pViewportIndex[7]].ymin);
+        scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
+            pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+            pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+            pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+            pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+            pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+            pScissorsInFixedPoint[pViewportIndex[6]].xmax,
+            pScissorsInFixedPoint[pViewportIndex[7]].xmax);
+        scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
+            pScissorsInFixedPoint[pViewportIndex[1]].ymax,
+            pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+            pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+            pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+            pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+            pScissorsInFixedPoint[pViewportIndex[6]].ymax,
+            pScissorsInFixedPoint[pViewportIndex[7]].ymax);
+    }
+};
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief StreamOut - Streams vertex data out to SO buffers.
 ///        Generally, we are only streaming out a SIMDs worth of triangles.
@@ -1849,6 +1913,7 @@ void BinTriangles(
     // compute per tri backface
     uint32_t frontFaceMask = frontWindingTris;
     uint32_t *pPrimID = (uint32_t *)&primID;
+    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
     DWORD triIndex = 0;
     // for center sample pattern, all samples are at pixel center; calculate coverage
     // once at center and broadcast the results in the backend
@@ -1944,10 +2009,26 @@ void BinTriangles(
     }
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    bbox.xmin   = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
-    bbox.ymin    = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
-    bbox.xmax  = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
-    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
+    // Gather the AOS effective scissor rects based on the per-prim VP index.
+    /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
+    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+    if (state.gsState.emitsViewportArrayIndex)
+    {
+        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                                                scisXmin, scisYmin, scisXmax, scisYmax);
+    }
+    else // broadcast fast path for non-VPAI case.
+    {
+        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+    }
+
+    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
 
     if(CT::IsConservativeT::value)
     {
@@ -2044,7 +2125,8 @@ void BinTriangles(
         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
         desc.triFlags.primID = pPrimID[triIndex];
         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
-        
+        desc.triFlags.viewportIndex = pViewportIndex[triIndex];
+
         auto pArena = pDC->pArena;
         SWR_ASSERT(pArena != nullptr);
 
@@ -2130,6 +2212,7 @@ void BinPoints(
     const SWR_FRONTEND_STATE& feState = state.frontendState;
     const SWR_GS_STATE& gsState = state.gsState;
     const SWR_RASTSTATE& rastState = state.rastState;
+    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 
     // Select attribute processor
     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
@@ -2240,6 +2323,7 @@ void BinPoints(
             desc.triFlags.frontFacing = 1;
             desc.triFlags.primID = pPrimID[primIndex];
             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+            desc.triFlags.viewportIndex = pViewportIndex[primIndex];
 
             work.pfnWork = RasterizeSimplePoint;
 
@@ -2306,10 +2390,26 @@ void BinPoints(
         bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
 
         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-        bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
-        bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
-        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
-        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
+        // Gather the AOS effective scissor rects based on the per-prim VP index.
+        /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
+        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+        if (state.gsState.emitsViewportArrayIndex)
+        {
+            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
+
+        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
 
         // Cull bloated points completely outside scissor
         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -2374,6 +2474,7 @@ void BinPoints(
             desc.triFlags.primID = pPrimID[primIndex];
             desc.triFlags.pointSize = aPointSize[primIndex];
             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+            desc.triFlags.viewportIndex = pViewportIndex[primIndex];
 
             work.pfnWork = RasterizeTriPoint;
 
@@ -2431,6 +2532,7 @@ void BinPoints(
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param tri - Contains line position data for SIMDs worth of points.
 /// @param primID - Primitive ID for each line.
+/// @param viewportIdx - Viewport Array Index for each line.
 void BinLines(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
@@ -2508,6 +2610,7 @@ void BinLines(
     primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
 
     uint32_t *pPrimID = (uint32_t *)&primID;
+    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 
     simdscalar vUnused = _simd_setzero_ps();
 
@@ -2533,10 +2636,24 @@ void BinLines(
     bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
-    bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
-    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
-    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
+    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+    if (state.gsState.emitsViewportArrayIndex)
+    {
+        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+            scisXmin, scisYmin, scisXmax, scisYmax);
+    }
+    else // broadcast fast path for non-VPAI case.
+    {
+        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+    }
+
+    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
 
     // Cull prims completely outside scissor
     {
@@ -2602,6 +2719,7 @@ void BinLines(
         desc.triFlags.primID = pPrimID[primIndex];
         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+        desc.triFlags.viewportIndex = pViewportIndex[primIndex];
 
         work.pfnWork = RasterizeLine;
 
index 9a8d062818d65c8e83878a44ae1e88e25221fd62..66283e340d6a3f74f01778a05dced7c1e3dc37c1 100644 (file)
@@ -967,20 +967,22 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     OSALIGNSIMD(SWR_RECT) bbox;
     calcBoundingBoxInt(vXi, vYi, bbox);
 
+    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+
     if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
     {
         // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
         bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
-        SWR_ASSERT(state.scissorInFixedPoint.xmin >= 0 && state.scissorInFixedPoint.ymin >= 0, 
+        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
                    "Conservative rast degenerate handling requires a valid scissor rect");
     }
 
     // Intersect with scissor/viewport
     OSALIGNSIMD(SWR_RECT) intersect;
-    intersect.xmin = std::max(bbox.xmin, state.scissorInFixedPoint.xmin);
-    intersect.xmax = std::min(bbox.xmax - 1, state.scissorInFixedPoint.xmax);
-    intersect.ymin = std::max(bbox.ymin, state.scissorInFixedPoint.ymin);
-    intersect.ymax = std::min(bbox.ymax - 1, state.scissorInFixedPoint.ymax);
+    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
+    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
+    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
+    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
 
     triDesc.triFlags = workDesc.triFlags;
 
@@ -1087,7 +1089,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
     // Compute and store triangle edge data if scissor needs to rasterized
     ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
-                       (bbox, state.scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
+                       (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
 
     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
     // used to for testing if entire raster tile is inside a triangle
@@ -1573,6 +1575,8 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
 
+    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+
     // create a copy of the triangle buffer to write our adjusted vertices to
     OSALIGNSIMD(float) newTriBuffer[4 * 4];
     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
@@ -1667,13 +1671,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     calcBoundingBoxInt(vXai, vYai, bboxA);
 
     if (!(bboxA.xmin > macroBoxRight ||
-          bboxA.xmin > state.scissorInFixedPoint.xmax ||
+          bboxA.xmin > scissorInFixedPoint.xmax ||
           bboxA.xmax - 1 < macroBoxLeft ||
-          bboxA.xmax - 1 < state.scissorInFixedPoint.xmin ||
+          bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
           bboxA.ymin > macroBoxBottom ||
-          bboxA.ymin > state.scissorInFixedPoint.ymax ||
+          bboxA.ymin > scissorInFixedPoint.ymax ||
           bboxA.ymax - 1 < macroBoxTop ||
-          bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) {
+          bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
         // rasterize triangle
         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
     }
@@ -1740,13 +1744,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     calcBoundingBoxInt(vXai, vYai, bboxA);
 
     if (!(bboxA.xmin > macroBoxRight ||
-          bboxA.xmin > state.scissorInFixedPoint.xmax ||
+          bboxA.xmin > scissorInFixedPoint.xmax ||
           bboxA.xmax - 1 < macroBoxLeft ||
-          bboxA.xmax - 1 < state.scissorInFixedPoint.xmin ||
+          bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
           bboxA.ymin > macroBoxBottom ||
-          bboxA.ymin > state.scissorInFixedPoint.ymax ||
+          bboxA.ymin > scissorInFixedPoint.ymax ||
           bboxA.ymax - 1 < macroBoxTop ||
-          bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) {
+          bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
         // rasterize triangle
         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
     }