- use per-primitive viewports throughout the pipeline.
- track whether all available scissor rects are tile aligned.
Causes failures, so not taken into account when choosing rasterizer yet.
Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
{
API_STATE *pState = &pDC->pState->state;
+ uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+ pState->scissorsTileAligned = true;
- // Set up scissor dimensions based on scissor or viewport
- if (pState->rastState.scissorEnable)
+ for (uint32_t index = 0; index < numScissors; ++index)
{
- pState->scissorInFixedPoint = pState->scissorRects[0];
- }
- else
- {
- // the vp width and height must be added to origin un-rounded then the result round to -inf.
- // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
- pState->scissorInFixedPoint.xmin = (int32_t)pState->vp[0].x;
- pState->scissorInFixedPoint.xmax = (int32_t)(pState->vp[0].x + pState->vp[0].width);
- pState->scissorInFixedPoint.ymin = (int32_t)pState->vp[0].y;
- pState->scissorInFixedPoint.ymax = (int32_t)(pState->vp[0].y + pState->vp[0].height);
- }
+ SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index];
- // Clamp to max rect
- pState->scissorInFixedPoint &= g_MaxScissorRect;
+ // Set up scissor dimensions based on scissor or viewport
+ if (pState->rastState.scissorEnable)
+ {
+ scissorInFixedPoint = pState->scissorRects[index];
+ }
+ else
+ {
+ // the vp width and height must be added to origin un-rounded then the result round to -inf.
+ // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
+ scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
+ scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
+ scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
+ scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
+ }
+
+ // Clamp to max rect
+ scissorInFixedPoint &= g_MaxScissorRect;
+
+ // Test for tile alignment
+ bool tileAligned;
+ tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
+ tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
+ tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
+ tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_Y_DIM) == 0;
+
+ pState->scissorsTileAligned &= tileAligned;
- // Scale to fixed point
- pState->scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
- pState->scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
- pState->scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
- pState->scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
+ // Scale to fixed point
+ scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
+ scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
+ scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
+ scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
- // Make scissor inclusive
- pState->scissorInFixedPoint.xmax -= 1;
- pState->scissorInFixedPoint.ymax -= 1;
+ // Make scissor inclusive
+ scissorInFixedPoint.xmax -= 1;
+ scissorInFixedPoint.ymax -= 1;
+ }
+
+
}
// templated backend function tables
if(T::bCanEarlyZ)
{
RDTSC_START(BEEarlyDepthTest);
- depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
- psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+ psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);
// early-exit if no pixels passed depth or earlyZ is forced on
if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
if(!T::bCanEarlyZ)
{
RDTSC_START(BELateDepthTest);
- depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
RDTSC_STOP(BELateDepthTest, 0, 0);
if(!_simd_movemask_ps(depthPassMask))
{
// need to call depth/stencil write for stencil write
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
goto Endtile;
}
// do final depth write after all pixel kills
if (!pPSState->forceEarlyZ)
{
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
}
RDTSC_STOP(BEOutputMerger, 0, 0);
if (T::bCanEarlyZ)
{
RDTSC_START(BEEarlyDepthTest);
- depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);
// early-exit if no samples passed depth or earlyZ is forced on.
if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
if (!_simd_movemask_ps(depthPassMask))
if (!T::bCanEarlyZ)
{
RDTSC_START(BELateDepthTest);
- depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
RDTSC_STOP(BELateDepthTest, 0, 0);
if (!_simd_movemask_ps(depthPassMask))
{
// need to call depth/stencil write for stencil write
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
// do final depth write after all pixel kills
if (!pPSState->forceEarlyZ)
{
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
}
RDTSC_STOP(BEOutputMerger, 0, 0);
uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
}
RDTSC_STOP(BEOutputMerger, 0, 0);
uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
RDTSC_START(BEEarlyDepthTest);
- simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
+ simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);
RDTSC_START(BEDepthBucket);
depthPassMask[sample] = vCoverageMask[sample];
stencilPassMask[sample] = vCoverageMask[sample];
- depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample,
- vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
+ depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+ vZ[sample], pDepthSample, vCoverageMask[sample],
+ pStencilSample, &stencilPassMask[sample]);
RDTSC_STOP(BEDepthBucket, 0, 0);
// early-exit if no pixels passed depth or earlyZ is forced on
if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
{
- DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
+ DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
if(!_simd_movemask_ps(depthPassMask[sample]))
float pointSize;
uint32_t primID;
uint32_t renderTargetArrayIndex;
+ uint32_t viewportIndex;
};
//////////////////////////////////////////////////////////////////////////
SWR_VIEWPORT_MATRICES vpMatrices;
SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
- SWR_RECT scissorInFixedPoint;
+ SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
+ bool scissorsTileAligned;
// Backend state
SWR_BACKEND_STATE backendState;
INLINE
simdscalar DepthStencilTest(const API_STATE* pState,
- bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
- simdscalar* pStencilMask)
+ bool frontFacing, uint32_t viewportIndex, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask,
+ uint8_t *pStencilBase, simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
- const SWR_VIEWPORT* pViewport = &pState->vp[0];
+ const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex];
simdscalar depthResult = _simd_set1_ps(-1.0f);
simdscalar zbuf;
return _simd_castps_si(vMask(mask));
}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Gather scissor rect data based on per-prim viewport indices.
+/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
+/// @param pViewportIndex - array of per-primitive vewport indexes.
+/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
+/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
+/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
+/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
+//
+/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
+template<size_t SimdWidth>
+struct GatherScissors
+{
+ static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
+ simdscalari &scisXmin, simdscalari &scisYmin,
+ simdscalari &scisXmax, simdscalari &scisYmax)
+ {
+ SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
+ }
+};
+
+template<>
+struct GatherScissors<8>
+{
+ static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
+ simdscalari &scisXmin, simdscalari &scisYmin,
+ simdscalari &scisXmax, simdscalari &scisYmax)
+ {
+ scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[6]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[7]].xmin);
+ scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[6]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[7]].ymin);
+ scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[6]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[7]].xmax);
+ scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[1]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[6]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[7]].ymax);
+ }
+};
+
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
/// Generally, we are only streaming out a SIMDs worth of triangles.
// compute per tri backface
uint32_t frontFaceMask = frontWindingTris;
uint32_t *pPrimID = (uint32_t *)&primID;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
DWORD triIndex = 0;
// for center sample pattern, all samples are at pixel center; calculate coverage
// once at center and broadcast the results in the backend
}
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
- bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
+ // Gather the AOS effective scissor rects based on the per-prim VP index.
+ /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
+
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
if(CT::IsConservativeT::value)
{
desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
desc.triFlags.primID = pPrimID[triIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
-
+ desc.triFlags.viewportIndex = pViewportIndex[triIndex];
+
auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
const SWR_FRONTEND_STATE& feState = state.frontendState;
const SWR_GS_STATE& gsState = state.gsState;
const SWR_RASTSTATE& rastState = state.rastState;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
// Select attribute processor
PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
desc.triFlags.frontFacing = 1;
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+ desc.triFlags.viewportIndex = pViewportIndex[primIndex];
work.pfnWork = RasterizeSimplePoint;
bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
- bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
+ // Gather the AOS effective scissor rects based on the per-prim VP index.
+ /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
+
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
// Cull bloated points completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.pointSize = aPointSize[primIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+ desc.triFlags.viewportIndex = pViewportIndex[primIndex];
work.pfnWork = RasterizeTriPoint;
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains line position data for SIMDs worth of points.
/// @param primID - Primitive ID for each line.
+/// @param viewportIdx - Viewport Array Index for each line.
void BinLines(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
uint32_t *pPrimID = (uint32_t *)&primID;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
simdscalar vUnused = _simd_setzero_ps();
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
- bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
+
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
// Cull prims completely outside scissor
{
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+ desc.triFlags.viewportIndex = pViewportIndex[primIndex];
work.pfnWork = RasterizeLine;
OSALIGNSIMD(SWR_RECT) bbox;
calcBoundingBoxInt(vXi, vYi, bbox);
+ const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+
if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
{
// If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
- SWR_ASSERT(state.scissorInFixedPoint.xmin >= 0 && state.scissorInFixedPoint.ymin >= 0,
+ SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
"Conservative rast degenerate handling requires a valid scissor rect");
}
// Intersect with scissor/viewport
OSALIGNSIMD(SWR_RECT) intersect;
- intersect.xmin = std::max(bbox.xmin, state.scissorInFixedPoint.xmin);
- intersect.xmax = std::min(bbox.xmax - 1, state.scissorInFixedPoint.xmax);
- intersect.ymin = std::max(bbox.ymin, state.scissorInFixedPoint.ymin);
- intersect.ymax = std::min(bbox.ymax - 1, state.scissorInFixedPoint.ymax);
+ intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
+ intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
+ intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
+ intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
triDesc.triFlags = workDesc.triFlags;
// Compute and store triangle edge data if scissor needs to rasterized
ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
- (bbox, state.scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
+ (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
// Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
// used to for testing if entire raster tile is inside a triangle
int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
+ const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+
// create a copy of the triangle buffer to write our adjusted vertices to
OSALIGNSIMD(float) newTriBuffer[4 * 4];
TRIANGLE_WORK_DESC newWorkDesc = workDesc;
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.xmin > macroBoxRight ||
- bboxA.xmin > state.scissorInFixedPoint.xmax ||
+ bboxA.xmin > scissorInFixedPoint.xmax ||
bboxA.xmax - 1 < macroBoxLeft ||
- bboxA.xmax - 1 < state.scissorInFixedPoint.xmin ||
+ bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
bboxA.ymin > macroBoxBottom ||
- bboxA.ymin > state.scissorInFixedPoint.ymax ||
+ bboxA.ymin > scissorInFixedPoint.ymax ||
bboxA.ymax - 1 < macroBoxTop ||
- bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) {
+ bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.xmin > macroBoxRight ||
- bboxA.xmin > state.scissorInFixedPoint.xmax ||
+ bboxA.xmin > scissorInFixedPoint.xmax ||
bboxA.xmax - 1 < macroBoxLeft ||
- bboxA.xmax - 1 < state.scissorInFixedPoint.xmin ||
+ bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
bboxA.ymin > macroBoxBottom ||
- bboxA.ymin > state.scissorInFixedPoint.ymax ||
+ bboxA.ymin > scissorInFixedPoint.ymax ||
bboxA.ymax - 1 < macroBoxTop ||
- bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) {
+ bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}