uint32_t workerId,
void *pUserData)
{
- CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+ CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
MacroTileMgr *pTileMgr = pDC->pTileMgr;
- const API_STATE& state = GetApiState(pDC);
-
// queue a clear to each macro tile
- // compute macro tile bounds for the current scissor/viewport
- uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
- uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
- uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
- uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
+ // compute macro tile bounds for the specified rect
+ uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+ uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
BE_WORK work;
work.type = CLEAR;
work.pfnWork = ProcessClearBE;
- work.desc.clear = *pClear;
+ work.desc.clear = *pDesc;
- for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
+ for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
{
- for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
+ for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
{
pTileMgr->enqueue(x, y, &work);
}
void *pUserData)
{
RDTSC_START(FEProcessStoreTiles);
- STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
- const API_STATE& state = GetApiState(pDC);
+ STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
// queue a store to each macro tile
- // compute macro tile bounds for the current render target
- const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
- const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
-
- uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
- uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+ // compute macro tile bounds for the specified rect
+ uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+ uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
// store tiles
BE_WORK work;
work.type = STORETILES;
work.pfnWork = ProcessStoreTileBE;
- work.desc.storeTiles = *pStore;
+ work.desc.storeTiles = *pDesc;
- for (uint32_t x = 0; x < numMacroTilesX; ++x)
+ for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
{
- for (uint32_t y = 0; y < numMacroTilesY; ++y)
+ for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
{
pTileMgr->enqueue(x, y, &work);
}
void *pUserData)
{
RDTSC_START(FEProcessInvalidateTiles);
- DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
+ DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
MacroTileMgr *pTileMgr = pDC->pTileMgr;
- SWR_RECT rect;
+ // compute macro tile bounds for the specified rect
+ uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
+ uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
+ uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
- if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
- {
- // Valid rect
- rect = pInv->rect;
- }
- else
- {
- // Use viewport dimensions
- const API_STATE& state = GetApiState(pDC);
-
- rect.left = (uint32_t)state.vp[0].x;
- rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
- rect.top = (uint32_t)state.vp[0].y;
- rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
- }
-
- // queue a store to each macro tile
- // compute macro tile bounds for the current render target
- uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
- uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
-
- // Setup region assuming full tiles
- uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
- uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
-
- uint32_t macroTileEndX = rect.right / macroWidth;
- uint32_t macroTileEndY = rect.bottom / macroHeight;
-
- if (pInv->fullTilesOnly == false)
+ if (pDesc->fullTilesOnly == false)
{
// include partial tiles
- macroTileStartX = rect.left / macroWidth;
- macroTileStartY = rect.top / macroHeight;
-
- macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
- macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
+ macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+ macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+ macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+ macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
}
- SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
- SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
+ SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
+ SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
- macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
- macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
+ macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
+ macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
// load tiles
BE_WORK work;
work.type = DISCARDINVALIDATETILES;
work.pfnWork = ProcessDiscardInvalidateTilesBE;
- work.desc.discardInvalidateTiles = *pInv;
+ work.desc.discardInvalidateTiles = *pDesc;
- for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
+ for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
{
- for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
+ for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
{
pTileMgr->enqueue(x, y, &work);
}
return _simd_castps_si(vMask(mask));
}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Gather scissor rect data based on per-prim viewport indices.
+///        This is the primary template, used for any SIMD width that has
+///        no explicit specialization: it only raises an assert and leaves
+///        all four output vectors unwritten.
+/// @tparam SimdWidth - SIMD width selecting the gather implementation
+///        (the binners instantiate it with KNOB_SIMD_WIDTH).
+/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
+/// @param pViewportIndex - array of per-primitive viewport indexes.
+/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
+/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
+/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
+/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
+///
+/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
+template<size_t SimdWidth>
+struct GatherScissors
+{
+ static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
+ simdscalari &scisXmin, simdscalari &scisYmin,
+ simdscalari &scisXmax, simdscalari &scisYmax)
+ {
+ // No generic implementation exists; reaching this body means the SIMD width has no specialization.
+ SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief 8-wide specialization of the scissor gather. For each of the 8
+///        primitives, looks up the scissor rect selected by that primitive's
+///        viewport index and packs the four rect fields into four vectors.
+/// @param pScissorsInFixedPoint - array of scissor rects; every value in
+///        pViewportIndex[0..7] is used as an index into it, so each must be
+///        a valid element index.
+/// @param pViewportIndex - array of 8 per-primitive viewport indexes;
+///        entry i belongs to the primitive in SIMD lane i.
+/// @param scisXmin - output: lane i receives rect[pViewportIndex[i]].xmin.
+/// @param scisYmin - output: lane i receives rect[pViewportIndex[i]].ymin.
+/// @param scisXmax - output: lane i receives rect[pViewportIndex[i]].xmax.
+/// @param scisYmax - output: lane i receives rect[pViewportIndex[i]].ymax.
+///
+/// @note The indices are deliberately listed 7..0. _simd_set_epi32 is
+///       expected to follow _mm256_set_epi32, whose arguments run from the
+///       highest lane down to lane 0 (NOTE(review): confirm against the
+///       _simd_set_epi32 definition). Listing them 0..7 would hand
+///       primitive 0 the scissor of primitive 7 and vice versa, while the
+///       binners index pViewportIndex[primIndex] in lane order.
+template<>
+struct GatherScissors<8>
+{
+    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
+                       simdscalari &scisXmin, simdscalari &scisYmin,
+                       simdscalari &scisXmax, simdscalari &scisYmax)
+    {
+        // Arguments are ordered lane 7 first, lane 0 last.
+        scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmin,
+                                   pScissorsInFixedPoint[pViewportIndex[6]].xmin,
+                                   pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+                                   pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+                                   pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+                                   pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+                                   pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+                                   pScissorsInFixedPoint[pViewportIndex[0]].xmin);
+        scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymin,
+                                   pScissorsInFixedPoint[pViewportIndex[6]].ymin,
+                                   pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+                                   pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+                                   pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+                                   pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+                                   pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+                                   pScissorsInFixedPoint[pViewportIndex[0]].ymin);
+        scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmax,
+                                   pScissorsInFixedPoint[pViewportIndex[6]].xmax,
+                                   pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+                                   pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+                                   pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+                                   pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+                                   pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+                                   pScissorsInFixedPoint[pViewportIndex[0]].xmax);
+        scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymax,
+                                   pScissorsInFixedPoint[pViewportIndex[6]].ymax,
+                                   pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+                                   pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+                                   pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+                                   pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+                                   pScissorsInFixedPoint[pViewportIndex[1]].ymax,
+                                   pScissorsInFixedPoint[pViewportIndex[0]].ymax);
+    }
+};
+
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
/// Generally, we are only streaming out a SIMDs worth of triangles.
{
RDTSC_START(FEStreamout);
- SWR_CONTEXT* pContext = pDC->pContext;
-
const API_STATE& state = GetApiState(pDC);
const SWR_STREAMOUT_STATE &soState = state.soState;
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
/// the start of the SIMD. The max index represents how much work
-/// items are remaining. If there is less then a SIMD's left of work
+/// items are remaining. If there is less than a SIMD's worth of work left
/// then return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
{
RDTSC_START(FEGeometryShader);
- SWR_CONTEXT* pContext = pDC->pContext;
-
const API_STATE& state = GetApiState(pDC);
const SWR_GS_STATE* pState = &state.gsState;
vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
}
- pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
+ // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
+ simdscalari vViewPortIdx;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ simdvector vpiAttrib[3];
+ gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
+
+ // OOB indices => forced to zero.
+ simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
+ vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
+
+ vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
+ }
+ else
+ {
+ vViewPortIdx = _simd_set1_epi32(0);
+ }
+
+ pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
}
}
} while (gsPa.NextPrim());
{
const API_STATE& state = GetApiState(pDC);
const SWR_TS_STATE& tsState = state.tsState;
- SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro
SWR_ASSERT(gt_pTessellationThreadData);
SWR_ASSERT(pfnClipFunc);
pfnClipFunc(pDC, tessPa, workerId, prim,
- GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
+ GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
}
}
{
SWR_ASSERT(pDC->pState->pfnProcessPrims);
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
- GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
+ GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
}
}
}
vMaxY = _simd_max_epi32(vMaxY, vY[1]);
vMaxY = _simd_max_epi32(vMaxY, vY[2]);
- bbox.left = vMinX;
- bbox.right = vMaxX;
- bbox.top = vMinY;
- bbox.bottom = vMaxY;
+ bbox.xmin = vMinX;
+ bbox.xmax = vMaxX;
+ bbox.ymin = vMinY;
+ bbox.ymax = vMaxY;
}
//////////////////////////////////////////////////////////////////////////
/// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
/// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
- bbox.left = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.right = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.top = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.bottom = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
+ bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
+ bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
+ bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
+ bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
}
//////////////////////////////////////////////////////////////////////////
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param primID - Primitive ID for each triangle.
+/// @param viewportIdx - viewport array index for each triangle.
/// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
uint32_t workerId,
simdvector tri[3],
uint32_t triMask,
- simdscalari primID)
+ simdscalari primID,
+ simdscalari viewportIdx)
{
RDTSC_START(FEBinTriangles);
simdscalar vRecipW1 = _simd_set1_ps(1.0f);
simdscalar vRecipW2 = _simd_set1_ps(1.0f);
- if (!feState.vpTransformDisable)
+ if (feState.vpTransformDisable)
{
- // perspective divide
+ // RHW is passed in directly when VP transform is disabled
+ vRecipW0 = tri[0].v[3];
+ vRecipW1 = tri[1].v[3];
+ vRecipW2 = tri[2].v[3];
+ }
+ else
+ {
+ // Perspective divide
vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
- // viewport transform to screen coords
- viewportTransform<3>(tri, state.vpMatrices);
+ // Viewport transform to screen space coords
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
+ }
+ else
+ {
+ viewportTransform<3>(tri, state.vpMatrices);
+ }
}
- // adjust for pixel center location
+ // Adjust for pixel center location
simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
tri[0].x = _simd_add_ps(tri[0].x, offset);
tri[0].y = _simd_add_ps(tri[0].y, offset);
// compute per tri backface
uint32_t frontFaceMask = frontWindingTris;
uint32_t *pPrimID = (uint32_t *)&primID;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
DWORD triIndex = 0;
// for center sample pattern, all samples are at pixel center; calculate coverage
// once at center and broadcast the results in the backend
// determine if triangle falls between pixel centers and discard
// only discard for non-MSAA case and when conservative rast is disabled
- // (left + 127) & ~255
- // (right + 128) & ~255
+ // (xmin + 127) & ~255
+ // (xmax + 128) & ~255
if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
{
origTriMask = triMask;
int cullCenterMask;
{
- simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
- left = _simd_and_si(left, _simd_set1_epi32(~255));
- simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
- right = _simd_and_si(right, _simd_set1_epi32(~255));
+ simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
+ xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
+ simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
+ xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
- simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
+ simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
- simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
- top = _simd_and_si(top, _simd_set1_epi32(~255));
- simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
- bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
+ simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
+ ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
+ simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
+ ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
- simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
+ simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
vMaskV = _simd_or_si(vMaskH, vMaskV);
cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
}
}
}
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
- bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
- bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
- bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
- bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
+ // Gather the AOS effective scissor rects based on the per-prim VP index.
+ /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
+
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
if(CT::IsConservativeT::value)
{
// in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
- // some area. Bump the right/bottom edges out
- simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.top, bbox.bottom);
- bbox.bottom = _simd_blendv_epi32(bbox.bottom, _simd_add_epi32(bbox.bottom, _simd_set1_epi32(1)), topEqualsBottom);
- simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.left, bbox.right);
- bbox.right = _simd_blendv_epi32(bbox.right, _simd_add_epi32(bbox.right, _simd_set1_epi32(1)), leftEqualsRight);
+ // some area. Bump the xmax/ymax edges out
+ simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
+ bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
+ simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
+ bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
}
// Cull tris completely outside scissor
{
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
triMask = triMask & ~maskOutsideScissor;
}
// Convert triangle bbox to macrotile units.
- bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.left);
- _simd_store_si((simdscalari*)aMTRight, bbox.right);
- _simd_store_si((simdscalari*)aMTTop, bbox.top);
- _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
+ _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
+ _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
+ _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
+ _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
desc.triFlags.primID = pPrimID[triIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
-
+ desc.triFlags.viewportIndex = pViewportIndex[triIndex];
+
auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
uint32_t workerId,
simdvector prim[3],
uint32_t primMask,
- simdscalari primID)
+ simdscalari primID,
+ simdscalari viewportIdx)
{
RDTSC_START(FEBinPoints);
const SWR_FRONTEND_STATE& feState = state.frontendState;
const SWR_GS_STATE& gsState = state.gsState;
const SWR_RASTSTATE& rastState = state.rastState;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
// Select attribute processor
PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
// viewport transform to screen coords
- viewportTransform<1>(&primVerts, state.vpMatrices);
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
+ }
+ else
+ {
+ viewportTransform<1>(&primVerts, state.vpMatrices);
+ }
}
// adjust for pixel center location
if (CanUseSimplePoints(pDC))
{
- // adjust for top-left rule
+ // adjust for top-left rule
vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
- // cull points off the top-left edge of the viewport
+ // cull points off the top-left edge of the viewport
primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
desc.triFlags.frontFacing = 1;
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+ desc.triFlags.viewportIndex = pViewportIndex[primIndex];
work.pfnWork = RasterizeSimplePoint;
// bloat point to bbox
simdBBox bbox;
- bbox.left = bbox.right = vXi;
- bbox.top = bbox.bottom = vYi;
+ bbox.xmin = bbox.xmax = vXi;
+ bbox.ymin = bbox.ymax = vYi;
simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
- bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
- bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
- bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
- bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
+ bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
+ bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
+ bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
+ bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
+
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
+ // Gather the AOS effective scissor rects based on the per-prim VP index.
+ /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
- bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
- bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
- bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
- bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
// Cull bloated points completely outside scissor
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
primMask = primMask & ~maskOutsideScissor;
// Convert bbox to macrotile units.
- bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.left);
- _simd_store_si((simdscalari*)aMTRight, bbox.right);
- _simd_store_si((simdscalari*)aMTTop, bbox.top);
- _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
+ _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
+ _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
+ _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
+ _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
// store render target array index
OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.pointSize = aPointSize[primIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+ desc.triFlags.viewportIndex = pViewportIndex[primIndex];
work.pfnWork = RasterizeTriPoint;
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains line position data for SIMDs worth of points.
/// @param primID - Primitive ID for each line.
+/// @param viewportIdx - Viewport Array Index for each line.
void BinLines(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
simdvector prim[],
uint32_t primMask,
- simdscalari primID)
+ simdscalari primID,
+ simdscalari viewportIdx)
{
RDTSC_START(FEBinLines);
prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
// viewport transform to screen coords
- viewportTransform<2>(prim, state.vpMatrices);
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
+ }
+ else
+ {
+ viewportTransform<2>(prim, state.vpMatrices);
+ }
}
// adjust for pixel center location
primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
uint32_t *pPrimID = (uint32_t *)&primID;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
simdscalar vUnused = _simd_setzero_ps();
// Calc bounding box of lines
simdBBox bbox;
- bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
- bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
- bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
- bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);
+ bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
+ bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
+ bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
+ bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
// bloat bbox by line width along minor axis
simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
simdBBox bloatBox;
- bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
- bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
- bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
- bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
-
- bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
- bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
- bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
- bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
- bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
- bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
- bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
- bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
+ bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
+ bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
+ bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
+ bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
+
+ bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
+ bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
+ bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
+ bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
+
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
+
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
// Cull prims completely outside scissor
{
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
primMask = primMask & ~maskOutsideScissor;
}
// Convert triangle bbox to macrotile units.
- bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.left);
- _simd_store_si((simdscalari*)aMTRight, bbox.right);
- _simd_store_si((simdscalari*)aMTTop, bbox.top);
- _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
+ _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
+ _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
+ _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
+ _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+ desc.triFlags.viewportIndex = pViewportIndex[primIndex];
work.pfnWork = RasterizeLine;