#include "backend.h"
#include "context.h"
#include "rdtsc_core.h"
-#include "rasterizer.h"
-#include "conservativeRast.h"
#include "utils.h"
#include "threads.h"
#include "pa.h"
return ((1U << numBits) - 1);
}
-//////////////////////////////////////////////////////////////////////////
-/// @brief Offsets added to post-viewport vertex positions based on
-/// raster state.
-static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
-{
- _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
- _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
-};
-
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
// store tiles
BE_WORK work;
work.type = STORETILES;
- work.pfnWork = ProcessStoreTileBE;
+ work.pfnWork = ProcessStoreTilesBE;
work.desc.storeTiles = *pDesc;
for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
case TOP_TRI_STRIP_REVERSE:
case TOP_PATCHLIST_BASE:
case TOP_UNKNOWN:
- SWR_ASSERT(false, "Unsupported topology: %d", mode);
+ SWR_INVALID("Unsupported topology: %d", mode);
return 0;
}
case TOP_TRI_STRIP_REVERSE:
case TOP_PATCHLIST_BASE:
case TOP_UNKNOWN:
- SWR_ASSERT(false, "Unsupported topology: %d", mode);
+ SWR_INVALID("Unsupported topology: %d", mode);
return 0;
}
numVerts = topology - TOP_PATCHLIST_BASE;
break;
default:
- SWR_ASSERT(false, "Unsupported topology: %d", topology);
+ SWR_INVALID("Unsupported topology: %d", topology);
break;
}
return _simd_castps_si(vMask(mask));
}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Gather scissor rect data based on per-prim viewport indices.
-/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
-/// @param pViewportIndex - array of per-primitive vewport indexes.
-/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
-/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
-/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
-/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
-//
-/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
-template<size_t SimdWidth>
-struct GatherScissors
-{
- static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
- simdscalari &scisXmin, simdscalari &scisYmin,
- simdscalari &scisXmax, simdscalari &scisYmax)
- {
- SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
- }
-};
-
-template<>
-struct GatherScissors<8>
-{
- static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
- simdscalari &scisXmin, simdscalari &scisYmin,
- simdscalari &scisXmax, simdscalari &scisYmax)
- {
- scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
- pScissorsInFixedPoint[pViewportIndex[1]].xmin,
- pScissorsInFixedPoint[pViewportIndex[2]].xmin,
- pScissorsInFixedPoint[pViewportIndex[3]].xmin,
- pScissorsInFixedPoint[pViewportIndex[4]].xmin,
- pScissorsInFixedPoint[pViewportIndex[5]].xmin,
- pScissorsInFixedPoint[pViewportIndex[6]].xmin,
- pScissorsInFixedPoint[pViewportIndex[7]].xmin);
- scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
- pScissorsInFixedPoint[pViewportIndex[1]].ymin,
- pScissorsInFixedPoint[pViewportIndex[2]].ymin,
- pScissorsInFixedPoint[pViewportIndex[3]].ymin,
- pScissorsInFixedPoint[pViewportIndex[4]].ymin,
- pScissorsInFixedPoint[pViewportIndex[5]].ymin,
- pScissorsInFixedPoint[pViewportIndex[6]].ymin,
- pScissorsInFixedPoint[pViewportIndex[7]].ymin);
- scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
- pScissorsInFixedPoint[pViewportIndex[1]].xmax,
- pScissorsInFixedPoint[pViewportIndex[2]].xmax,
- pScissorsInFixedPoint[pViewportIndex[3]].xmax,
- pScissorsInFixedPoint[pViewportIndex[4]].xmax,
- pScissorsInFixedPoint[pViewportIndex[5]].xmax,
- pScissorsInFixedPoint[pViewportIndex[6]].xmax,
- pScissorsInFixedPoint[pViewportIndex[7]].xmax);
- scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
- pScissorsInFixedPoint[pViewportIndex[1]].ymax,
- pScissorsInFixedPoint[pViewportIndex[2]].ymax,
- pScissorsInFixedPoint[pViewportIndex[3]].ymax,
- pScissorsInFixedPoint[pViewportIndex[4]].ymax,
- pScissorsInFixedPoint[pViewportIndex[5]].ymax,
- pScissorsInFixedPoint[pViewportIndex[6]].ymax,
- pScissorsInFixedPoint[pViewportIndex[7]].ymax);
- }
-};
-
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
/// Generally, we are only streaming out a SIMDs worth of triangles.
PA_STATE& pa,
uint32_t workerId,
uint32_t* pPrimData,
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims_simd8,
+#endif
uint32_t streamIndex)
{
SWR_CONTEXT *pContext = pDC->pContext;
uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
// The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
- uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
+ uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
SWR_STREAMOUT_CONTEXT soContext = { 0 };
soContext.pBuffer[i] = &state.soBuffer[i];
}
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims = numPrims_simd8;
+#else
uint32_t numPrims = pa.NumPrims();
+#endif
+
for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
{
DWORD slot = 0;
_mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
}
+
soMask &= ~(1 << slot);
}
AR_END(FEStreamout, 1);
}
+#if USE_SIMD16_FRONTEND
+//////////////////////////////////////////////////////////////////////////
+/// Is value an even number (a multiple of two)
+///
+template <typename T>
+INLINE static bool IsEven(T value)
+{
+ return (value & 1) == 0;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Round up value to an even number (a multiple of two)
+///
+template <typename T>
+INLINE static T RoundUpEven(T value)
+{
+ return (value + 1) & ~1;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Round down value to an even number (a multiple of two)
+///
+template <typename T>
+INLINE static T RoundDownEven(T value)
+{
+ return value & ~1;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
+///
+/// vertexCount is in terms of the source simdvertexes and must be even
+///
+/// attribCount will limit the vector copies to those attribs specified
+///
+/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
+///
+void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
+{
+ SWR_ASSERT(vertex);
+ SWR_ASSERT(vertex_simd16);
+ SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
+
+ simd16vertex temp;
+
+ for (uint32_t i = 0; i < vertexCount; i += 2)
+ {
+ for (uint32_t j = 0; j < attribCount; j += 1)
+ {
+ for (uint32_t k = 0; k < 4; k += 1)
+ {
+ temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
+
+ if ((i + 1) < vertexCount)
+ {
+ temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
+ }
+ }
+ }
+
+ for (uint32_t j = 0; j < attribCount; j += 1)
+ {
+ vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
+ }
+ }
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
/// the start of the SIMD. The max index represents how much work
uint32_t maxIndex)
{
uint32_t remainder = (maxIndex - curIndex);
+#if USE_SIMD16_FRONTEND
+ return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
+#else
return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
+#endif
}
//////////////////////////////////////////////////////////////////////////
}
curInputByte >>= 2;
}
-
+
*pCutBuffer++ = outByte;
}
}
THREAD SWR_GS_CONTEXT tlsGsContext;
+template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
+struct GsBufferInfo
+{
+ GsBufferInfo(const SWR_GS_STATE &gsState)
+ {
+ const uint32_t vertexCount = gsState.maxNumVerts;
+ const uint32_t vertexStride = sizeof(SIMDVERTEX);
+ const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
+
+ vertexPrimitiveStride = vertexStride * numSimdBatches;
+ vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+
+ if (gsState.isSingleStream)
+ {
+ cutPrimitiveStride = (vertexCount + 7) / 8;
+ cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+
+ streamCutPrimitiveStride = 0;
+ streamCutInstanceStride = 0;
+ }
+ else
+ {
+ cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
+ cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+
+ streamCutPrimitiveStride = (vertexCount + 7) / 8;
+ streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
+ }
+ }
+
+ uint32_t vertexPrimitiveStride;
+ uint32_t vertexInstanceStride;
+
+ uint32_t cutPrimitiveStride;
+ uint32_t cutInstanceStride;
+
+ uint32_t streamCutPrimitiveStride;
+ uint32_t streamCutInstanceStride;
+};
+
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
void* pCutBuffer,
void* pStreamCutBuffer,
uint32_t* pSoPrimData,
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims_simd8,
+#endif
simdscalari primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
}
}
-
+
// assemble position
pa.Assemble(VERTEX_POSITION_SLOT, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
}
- const uint32_t vertexStride = sizeof(simdvertex);
- const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
- const uint32_t inputPrimStride = numSimdBatches * vertexStride;
- const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
- uint32_t cutPrimStride;
- uint32_t cutInstanceStride;
-
- if (pState->isSingleStream)
- {
- cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
- cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
- }
- else
- {
- cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
- cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
- }
+#if USE_SIMD16_FRONTEND
+ const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
+#else
+ const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
+#endif
// record valid prims from the frontend to avoid over binning the newly generated
// prims from the GS
+#if USE_SIMD16_FRONTEND
+ uint32_t numInputPrims = numPrims_simd8;
+#else
uint32_t numInputPrims = pa.NumPrims();
+#endif
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
// execute the geometry shader
state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
- tlsGsContext.pStream += instanceStride;
- tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+ tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
+ tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
}
// set up new binner and state for the GS output topology
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+ if (HasRastT::value)
+ {
+ switch (pState->outputTopology)
+ {
+ case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
+ case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
+ default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+ }
+ }
+
+#else
PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
if (HasRastT::value)
{
case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
- default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
+ default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
}
}
+#endif
// foreach input prim:
// - setup a new PA based on the emitted verts for that prim
// - loop over the new verts, calling PA to assemble each prim
uint32_t totalPrimsGenerated = 0;
for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
{
- uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
- uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
+ uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
+ uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
+
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
uint32_t numEmittedVerts = pVertexCount[inputPrim];
continue;
}
- uint8_t* pBase = pInstanceBase + instance * instanceStride;
- uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
-
+ uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
+ uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
+
uint32_t numAttribs = state.feNumAttributes;
for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
processCutVerts = false;
}
+#if USE_SIMD16_FRONTEND
+ PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+
+#else
PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+#endif
while (gsPa.GetNextStreamOutput())
{
do
{
+#if USE_SIMD16_FRONTEND
+ simd16vector attrib_simd16[3];
+
+ bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
+
+#else
bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
+#endif
if (assemble)
{
totalPrimsGenerated += gsPa.NumPrims();
if (HasStreamOutT::value)
{
+#if USE_SIMD16_FRONTEND
+ const uint32_t numPrims = gsPa.NumPrims();
+ const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+ const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+ gsPa.useAlternateOffset = false;
+ StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_lo, stream);
+
+ if (numPrims_hi)
+ {
+ gsPa.useAlternateOffset = true;
+ StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_hi, stream);
+ }
+#else
StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
+#endif
}
if (HasRastT::value && state.soState.streamToRasterizer == stream)
{
+#if USE_SIMD16_FRONTEND
+ simd16scalari vPrimId;
+ // pull primitiveID from the GS output if available
+ if (state.gsState.emitsPrimitiveID)
+ {
+ simd16vector primIdAttrib[3];
+ gsPa.Assemble_simd16(VERTEX_PRIMID_SLOT, primIdAttrib);
+ vPrimId = _simd16_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
+ }
+ else
+ {
+ vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
+ }
+
+ // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
+ simd16scalari vViewPortIdx;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ simd16vector vpiAttrib[3];
+ gsPa.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
+
+ // OOB indices => forced to zero.
+ simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simd16scalari vClearMask = _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib[0].x), vNumViewports);
+ vpiAttrib[0].x = _simd16_and_ps(_simd16_castsi_ps(vClearMask), vpiAttrib[0].x);
+
+ vViewPortIdx = _simd16_castps_si(vpiAttrib[0].x);
+ }
+ else
+ {
+ vViewPortIdx = _simd16_set1_epi32(0);
+ }
+
+ gsPa.useAlternateOffset = false;
+ pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+#else
simdscalari vPrimId;
// pull primitiveID from the GS output if available
if (state.gsState.emitsPrimitiveID)
{
simdvector primIdAttrib[3];
gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
- vPrimId = _simd_castps_si(primIdAttrib[0].x);
+ vPrimId = _simd_castps_si(primIdAttrib[state.frontendState.topologyProvokingVertex].x);
}
else
{
}
pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+#endif
}
}
} while (gsPa.NextPrim());
// update GS pipeline stats
UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
-
+ AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
AR_END(FEGeometryShader, 1);
}
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
+template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
void **ppStreamCutBuffer)
{
auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
SWR_ASSERT(state.gsState.gsEnable);
+
// allocate arena space to hold GS output verts
// @todo pack attribs
// @todo support multiple streams
- const uint32_t vertexStride = sizeof(simdvertex);
- const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
- uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
- *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
- const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
- const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
- const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
- const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
+ const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
+
+ const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
+
+ *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
// allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
// maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
// allocate space for temporary per-stream cut buffer if multi-stream is enabled
if (state.gsState.isSingleStream)
{
- *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
+ const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
+
+ *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
*ppStreamCutBuffer = nullptr;
}
else
{
- *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
- *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
- }
+ const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
+ const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
+ *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
+ *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
+ }
}
//////////////////////////////////////////////////////////////////////////
void* pCutBuffer,
void* pCutStreamBuffer,
uint32_t* pSoPrimData,
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims_simd8,
+#endif
simdscalari primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
}
SWR_ASSERT(tsCtx);
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+ if (HasRastT::value)
+ {
+ switch (tsState.postDSTopology)
+ {
+ case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
+ case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
+ default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+ }
+ }
+
+#else
PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
if (HasRastT::value)
{
case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
- default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
+ default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
}
}
+#endif
SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
hsContext.pCPout = gt_pTessellationThreadData->patchData;
hsContext.PrimitiveID = primID;
memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims = numPrims_simd8;
+#else
uint32_t numPrims = pa.NumPrims();
+#endif
hsContext.mask = GenerateMask(numPrims);
// Run the HS
SWR_TS_TESSELLATED_DATA tsData = { 0 };
AR_BEGIN(FETessellation, pDC->drawId);
TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
+ AR_EVENT(TessPrimCount(1));
AR_END(FETessellation, 0);
if (tsData.NumPrimitives == 0)
// Allocate DS Output memory
uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
+#if USE_SIMD16_FRONTEND
+ size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
+#else
size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
+#endif
if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
{
AlignedFree(gt_pTessellationThreadData->pDSOutput);
gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
+#if USE_SIMD16_FRONTEND
+ gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
+#else
gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
+#endif
}
SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+#if USE_SIMD16_FRONTEND
+ dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
+#else
dsContext.vectorStride = requiredDSVectorInvocations;
+#endif
uint32_t dsInvocations = 0;
}
UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
+#if USE_SIMD16_FRONTEND
+ SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
+
+#endif
PA_TESS tessPa(
pDC,
+#if USE_SIMD16_FRONTEND
+ reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
+ dsContext.vectorStride / 2, // simd8 -> simd16
+#else
dsContext.pOutputData,
dsContext.vectorStride,
+#endif
tsState.numDsOutputAttribs,
tsData.ppIndices,
tsData.NumPrimitives,
while (tessPa.HasWork())
{
+#if USE_SIMD16_FRONTEND
+ const uint32_t numPrims = tessPa.NumPrims();
+ const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+ const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+ const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
+ const simdscalari primID_lo = _simd16_extract_si(primID, 0);
+ const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+
+#endif
if (HasGeometryShaderT::value)
{
+#if USE_SIMD16_FRONTEND
+ tessPa.useAlternateOffset = false;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+ if (numPrims_hi)
+ {
+ tessPa.useAlternateOffset = true;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ }
+#else
GeometryShaderStage<HasStreamOutT, HasRastT>(
pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
_simd_set1_epi32(dsContext.PrimitiveID));
+#endif
}
else
{
if (HasStreamOutT::value)
{
+#if USE_SIMD16_FRONTEND
+ tessPa.useAlternateOffset = false;
+ StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0);
+
+ if (numPrims_hi)
+ {
+ tessPa.useAlternateOffset = true;
+ StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0);
+ }
+#else
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
+#endif
}
if (HasRastT::value)
{
- simdvector prim[3]; // Only deal with triangles, lines, or points
+#if USE_SIMD16_FRONTEND
+ simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
+#else
+ simdvector prim[3]; // Only deal with triangles, lines, or points
+#endif
AR_BEGIN(FEPAAssemble, pDC->drawId);
-#if SWR_ENABLE_ASSERTS
bool assemble =
-#endif
+#if USE_SIMD16_FRONTEND
+ tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
+#else
tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
+#endif
AR_END(FEPAAssemble, 1);
SWR_ASSERT(assemble);
SWR_ASSERT(pfnClipFunc);
+#if USE_SIMD16_FRONTEND
+ tessPa.useAlternateOffset = false;
+ pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0));
+#else
pfnClipFunc(pDC, tessPa, workerId, prim,
GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
+#endif
}
}
} // while (tessPa.HasWork())
} // for (uint32_t p = 0; p < numPrims; ++p)
+#if USE_SIMD16_FRONTEND
+ if (gt_pTessellationThreadData->pDSOutput != nullptr)
+ {
+ AlignedFree(gt_pTessellationThreadData->pDSOutput);
+ gt_pTessellationThreadData->pDSOutput = nullptr;
+ }
+ gt_pTessellationThreadData->numDSOutputVectors = 0;
+
+#endif
TSDestroyCtx(tsCtx);
}
+THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
+THREAD uint32_t gVertexStoreSize = 0;
+
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
DRAW_WORK& work = *(DRAW_WORK*)pUserData;
const API_STATE& state = GetApiState(pDC);
- __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- SWR_VS_CONTEXT vsContext;
- simdvertex vin;
- int indexSize = 0;
- uint32_t endVertex = work.numVerts;
+ uint32_t indexSize = 0;
+ uint32_t endVertex = work.numVerts;
const int32_t* pLastRequestedIndex = nullptr;
if (IsIndexedT::value)
pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
break;
default:
- SWR_ASSERT(0);
+ SWR_INVALID("Invalid work.type: %d", work.type);
}
}
else
endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
}
- SWR_FETCH_CONTEXT fetchInfo = { 0 };
- fetchInfo.pStreams = &state.vertexBuffers[0];
- fetchInfo.StartInstance = work.startInstance;
- fetchInfo.StartVertex = 0;
-
- vsContext.pVin = &vin;
-
- if (IsIndexedT::value)
- {
- fetchInfo.BaseVertex = work.baseVertex;
-
- // if the entire index buffer isn't being consumed, set the last index
- // so that fetches < a SIMD wide will be masked off
- fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
- if (pLastRequestedIndex < fetchInfo.pLastIndex)
- {
- fetchInfo.pLastIndex = pLastRequestedIndex;
- }
- }
- else
- {
- fetchInfo.StartVertex = work.startVertex;
- }
-
-#ifdef KNOB_ENABLE_RDTSC
+#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif
void* pStreamCutBuffer = nullptr;
if (HasGeometryShaderT::value)
{
- AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#if USE_SIMD16_FRONTEND
+ AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#else
+ AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#endif
}
if (HasTessellationT::value)
pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
}
+ const uint32_t vertexCount = NumVertsPerPrim(state.topology, state.gsState.gsEnable);
+
+ SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
+
+ // grow the vertex store for the PA as necessary
+ if (gVertexStoreSize < vertexCount)
+ {
+ if (pVertexStore != nullptr)
+ {
+ AlignedFree(pVertexStore);
+ }
+
+ while (gVertexStoreSize < vertexCount)
+ {
+#if USE_SIMD16_FRONTEND
+ gVertexStoreSize += 4; // grow in chunks of 4 simd16vertex
+#else
+ gVertexStoreSize += 8; // grow in chunks of 8 simdvertex
+#endif
+ }
+
+ SWR_ASSERT(gVertexStoreSize <= MAX_NUM_VERTS_PER_PRIM);
+
+ pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(gVertexStoreSize * sizeof(pVertexStore[0]), 64));
+
+ SWR_ASSERT(pVertexStore != nullptr);
+ }
+
// choose primitive assembler
- PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
+ PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize);
PA_STATE& pa = paFactory.GetPA();
- /// @todo: temporarily move instance loop in the FE to ensure SO ordering
+#if USE_SIMD16_FRONTEND
+ simdvertex vin_lo;
+ simdvertex vin_hi;
+ SWR_VS_CONTEXT vsContext_lo;
+ SWR_VS_CONTEXT vsContext_hi;
+
+ vsContext_lo.pVin = &vin_lo;
+ vsContext_hi.pVin = &vin_hi;
+ vsContext_lo.AlternateOffset = 0;
+ vsContext_hi.AlternateOffset = 1;
+
+ SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };
+
+ fetchInfo_lo.pStreams = &state.vertexBuffers[0];
+ fetchInfo_lo.StartInstance = work.startInstance;
+ fetchInfo_lo.StartVertex = 0;
+
+ if (IsIndexedT::value)
+ {
+ fetchInfo_lo.BaseVertex = work.baseVertex;
+
+ // if the entire index buffer isn't being consumed, set the last index
+ // so that fetches < a SIMD wide will be masked off
+ fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+ if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
+ {
+ fetchInfo_lo.pLastIndex = pLastRequestedIndex;
+ }
+ }
+ else
+ {
+ fetchInfo_lo.StartVertex = work.startVertex;
+ }
+
+ SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
+
+ const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+
for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
{
- simdscalari vIndex;
uint32_t i = 0;
+ simd16scalari vIndex;
+
if (IsIndexedT::value)
{
- fetchInfo.pIndices = work.pIB;
+ fetchInfo_lo.pIndices = work.pIB;
+ fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH
}
else
{
- vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
- fetchInfo.pIndices = (const int32_t*)&vIndex;
+ vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
+
+ fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
+ fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
}
- fetchInfo.CurInstance = instanceNum;
- vsContext.InstanceID = instanceNum;
+ fetchInfo_lo.CurInstance = instanceNum;
+ fetchInfo_hi.CurInstance = instanceNum;
+
+ vsContext_lo.InstanceID = instanceNum;
+ vsContext_hi.InstanceID = instanceNum;
while (pa.HasWork())
{
- // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
+ // GetNextVsOutput currently has the side effect of updating some PA state machine state.
// So we need to keep this outside of (i < endVertex) check.
- simdmask* pvCutIndices = nullptr;
+
+ simdmask *pvCutIndices_lo = nullptr;
+ simdmask *pvCutIndices_hi = nullptr;
+
if (IsIndexedT::value)
{
- pvCutIndices = &pa.GetNextVsIndices();
+ // simd16mask <=> simdmask[2]
+
+ pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
+ pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
}
- simdvertex& vout = pa.GetNextVsOutput();
- vsContext.pVout = &vout;
+ simd16vertex &vout = pa.GetNextVsOutput();
+
+ vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
+ vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
if (i < endVertex)
{
-
// 1. Execute FS/VS for a single SIMD.
AR_BEGIN(FEFetchShader, pDC->drawId);
- state.pfnFetchFunc(fetchInfo, vin);
+ state.pfnFetchFunc(fetchInfo_lo, vin_lo);
+
+ if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
+ {
+ state.pfnFetchFunc(fetchInfo_hi, vin_hi);
+ }
AR_END(FEFetchShader, 0);
// forward fetch generated vertex IDs to the vertex shader
- vsContext.VertexID = fetchInfo.VertexID;
+ vsContext_lo.VertexID = fetchInfo_lo.VertexID;
+ vsContext_hi.VertexID = fetchInfo_hi.VertexID;
// Setup active mask for vertex shader.
- vsContext.mask = GenerateMask(endVertex - i);
+ vsContext_lo.mask = GenerateMask(endVertex - i);
+ vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
// forward cut mask to the PA
if (IsIndexedT::value)
{
- *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
+ *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
+ *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
}
UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
#endif
{
AR_BEGIN(FEVertexShader, pDC->drawId);
- state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
+ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
+
+ if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
+ {
+ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
+ }
AR_END(FEVertexShader, 0);
UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
// 2. Assemble primitives given the last two SIMD.
do
{
- simdvector prim[MAX_NUM_VERTS_PER_PRIM];
- // PaAssemble returns false if there is not enough verts to assemble.
- AR_BEGIN(FEPAAssemble, pDC->drawId);
- bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
- AR_END(FEPAAssemble, 1);
+ simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
+
+ RDTSC_START(FEPAAssemble);
+ bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
+ RDTSC_STOP(FEPAAssemble, 1, 0);
#if KNOB_ENABLE_TOSS_POINTS
if (!KNOB_TOSS_FETCH)
{
UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
+ const uint32_t numPrims = pa.NumPrims();
+ const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+ const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+ const simd16scalari primID = pa.GetPrimID(work.startPrimID);
+ const simdscalari primID_lo = _simd16_extract_si(primID, 0);
+ const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+
if (HasTessellationT::value)
{
- TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
- pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ pa.useAlternateOffset = false;
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+ if (numPrims_hi)
+ {
+ pa.useAlternateOffset = true;
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ }
}
else if (HasGeometryShaderT::value)
{
- GeometryShaderStage<HasStreamOutT, HasRastT>(
- pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ pa.useAlternateOffset = false;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+ if (numPrims_hi)
+ {
+ pa.useAlternateOffset = true;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ }
}
else
{
// If streamout is enabled then stream vertices out to memory.
if (HasStreamOutT::value)
{
+#if 1
+ pa.useAlternateOffset = false;
+ StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0);
+
+ if (numPrims_hi)
+ {
+ pa.useAlternateOffset = true;
+ StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
+ }
+#else
+ pa.useAlternateOffset = false;
StreamOut(pDC, pa, workerId, pSoPrimData, 0);
+#endif
}
if (HasRastT::value)
{
- SWR_ASSERT(pDC->pState->pfnProcessPrims);
- pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
- GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
+ SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
+
+ pa.useAlternateOffset = false;
+ pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si());
}
}
}
}
} while (pa.NextPrim());
- i += KNOB_SIMD_WIDTH;
if (IsIndexedT::value)
{
- fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+ fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
+ fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
}
else
{
- vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
+ vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
}
+
+ i += KNOB_SIMD16_WIDTH;
}
+
pa.Reset();
}
- AR_END(FEProcessDraw, numPrims * work.numInstances);
-}
+#else
+ simdvertex vin;
+ SWR_VS_CONTEXT vsContext;
-struct FEDrawChooser
-{
- typedef PFN_FE_WORK_FUNC FuncType;
+ vsContext.pVin = &vin;
- template <typename... ArgsB>
- static FuncType GetFunc()
+ SWR_FETCH_CONTEXT fetchInfo = { 0 };
+
+ fetchInfo.pStreams = &state.vertexBuffers[0];
+ fetchInfo.StartInstance = work.startInstance;
+ fetchInfo.StartVertex = 0;
+
+ if (IsIndexedT::value)
{
- return ProcessDraw<ArgsB...>;
- }
-};
+ fetchInfo.BaseVertex = work.baseVertex;
+ // if the entire index buffer isn't being consumed, set the last index
+ // so that fetches < a SIMD wide will be masked off
+ fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+ if (pLastRequestedIndex < fetchInfo.pLastIndex)
+ {
+ fetchInfo.pLastIndex = pLastRequestedIndex;
+ }
+ }
+ else
+ {
+ fetchInfo.StartVertex = work.startVertex;
+ }
-// Selector for correct templated Draw front-end function
-PFN_FE_WORK_FUNC GetProcessDrawFunc(
- bool IsIndexed,
- bool IsCutIndexEnabled,
- bool HasTessellation,
- bool HasGeometryShader,
- bool HasStreamOut,
- bool HasRasterization)
-{
- return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
-}
+ const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-//////////////////////////////////////////////////////////////////////////
-/// @brief Processes attributes for the backend based on linkage mask and
-/// linkage map. Essentially just doing an SOA->AOS conversion and pack.
-/// @param pDC - Draw context
-/// @param pa - Primitive Assembly state
-/// @param linkageMask - Specifies which VS outputs are routed to PS.
-/// @param pLinkageMap - maps VS attribute slot to PS slot
-/// @param triIndex - Triangle to process attributes for
-/// @param pBuffer - Output result
-template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
-INLINE void ProcessAttributes(
- DRAW_CONTEXT *pDC,
- PA_STATE&pa,
- uint32_t triIndex,
- uint32_t primId,
- float *pBuffer)
-{
- static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
- const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
- // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
- LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
- const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
- const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
-
- static const float constTable[3][4] = {
- {0.0f, 0.0f, 0.0f, 0.0f},
- {0.0f, 0.0f, 0.0f, 1.0f},
- {1.0f, 1.0f, 1.0f, 1.0f}
- };
-
- for (uint32_t i = 0; i < backendState.numAttributes; ++i)
+ /// @todo: temporarily move instance loop in the FE to ensure SO ordering
+ for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
{
- uint32_t inputSlot;
- if (IsSwizzledT::value)
- {
- SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
- inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
+ simdscalari vIndex;
+ uint32_t i = 0;
+ if (IsIndexedT::value)
+ {
+ fetchInfo.pIndices = work.pIB;
}
else
{
- inputSlot = VERTEX_ATTRIB_START_SLOT + i;
+ vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
+ fetchInfo.pIndices = (const int32_t*)&vIndex;
}
- __m128 attrib[3]; // triangle attribs (always 4 wide)
- float* pAttribStart = pBuffer;
+ fetchInfo.CurInstance = instanceNum;
+ vsContext.InstanceID = instanceNum;
- if (HasConstantInterpT::value || IsDegenerate::value)
+ while (pa.HasWork())
{
- if (_bittest(&constantInterpMask, i))
+ // GetNextVsOutput currently has the side effect of updating some PA state machine state.
+ // So we need to keep this outside of (i < endVertex) check.
+ simdmask* pvCutIndices = nullptr;
+ if (IsIndexedT::value)
+ {
+ pvCutIndices = &pa.GetNextVsIndices();
+ }
+
+ simdvertex& vout = pa.GetNextVsOutput();
+ vsContext.pVout = &vout;
+
+ if (i < endVertex)
{
- uint32_t vid;
- uint32_t adjustedTriIndex;
- static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
- static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
- static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
- static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
- static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
-
- switch (topo) {
- case TOP_QUAD_LIST:
- adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
- vid = quadProvokingVertex[triIndex & 1][provokingVertex];
- break;
- case TOP_QUAD_STRIP:
- adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
- vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
- break;
- case TOP_TRIANGLE_STRIP:
- adjustedTriIndex = triIndex;
- vid = (triIndex & 1)
- ? tristripProvokingVertex[provokingVertex]
- : provokingVertex;
- break;
- default:
- adjustedTriIndex = triIndex;
- vid = provokingVertex;
- break;
- }
- pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
+ // 1. Execute FS/VS for a single SIMD.
+ AR_BEGIN(FEFetchShader, pDC->drawId);
+ state.pfnFetchFunc(fetchInfo, vin);
+ AR_END(FEFetchShader, 0);
+
+ // forward fetch generated vertex IDs to the vertex shader
+ vsContext.VertexID = fetchInfo.VertexID;
+
+ // Setup active mask for vertex shader.
+ vsContext.mask = GenerateMask(endVertex - i);
- for (uint32_t i = 0; i < NumVertsT::value; ++i)
+ // forward cut mask to the PA
+ if (IsIndexedT::value)
{
- _mm_store_ps(pBuffer, attrib[vid]);
- pBuffer += 4;
+ *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
}
- }
- else
- {
- pa.AssembleSingle(inputSlot, triIndex, attrib);
- for (uint32_t i = 0; i < NumVertsT::value; ++i)
+ UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_FETCH)
+#endif
{
- _mm_store_ps(pBuffer, attrib[i]);
- pBuffer += 4;
+ AR_BEGIN(FEVertexShader, pDC->drawId);
+ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
+ AR_END(FEVertexShader, 0);
+
+ UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
}
}
- }
- else
- {
- pa.AssembleSingle(inputSlot, triIndex, attrib);
- for (uint32_t i = 0; i < NumVertsT::value; ++i)
+ // 2. Assemble primitives given the last two SIMD.
+ do
{
- _mm_store_ps(pBuffer, attrib[i]);
- pBuffer += 4;
- }
- }
-
- // pad out the attrib buffer to 3 verts to ensure the triangle
- // interpolation code in the pixel shader works correctly for the
- // 3 topologies - point, line, tri. This effectively zeros out the
- // effect of the missing vertices in the triangle interpolation.
- for (uint32_t v = NumVertsT::value; v < 3; ++v)
- {
- _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
- pBuffer += 4;
- }
+ simdvector prim[MAX_NUM_VERTS_PER_PRIM];
+ // PaAssemble returns false if there is not enough verts to assemble.
+ AR_BEGIN(FEPAAssemble, pDC->drawId);
+ bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
+ AR_END(FEPAAssemble, 1);
- // check for constant source overrides
- if (IsSwizzledT::value)
- {
- uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
- if (mask)
- {
- DWORD comp;
- while (_BitScanForward(&comp, mask))
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_FETCH)
+#endif
{
- mask &= ~(1 << comp);
-
- float constantValue = 0.0f;
- switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_VS)
+#endif
{
- case SWR_CONSTANT_SOURCE_CONST_0000:
- case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
- case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
- constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
- break;
- case SWR_CONSTANT_SOURCE_PRIM_ID:
- constantValue = *(float*)&primId;
- break;
- }
+ if (assemble)
+ {
+ UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
- // apply constant value to all 3 vertices
- for (uint32_t v = 0; v < 3; ++v)
- {
- pAttribStart[comp + v * 4] = constantValue;
+ if (HasTessellationT::value)
+ {
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+ pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ }
+ else if (HasGeometryShaderT::value)
+ {
+ GeometryShaderStage<HasStreamOutT, HasRastT>(
+ pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ }
+ else
+ {
+ // If streamout is enabled then stream vertices out to memory.
+ if (HasStreamOutT::value)
+ {
+ StreamOut(pDC, pa, workerId, pSoPrimData, 0);
+ }
+
+ if (HasRastT::value)
+ {
+ SWR_ASSERT(pDC->pState->pfnProcessPrims);
+
+ pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
+ GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
+ }
+ }
+ }
}
}
+ } while (pa.NextPrim());
+
+ if (IsIndexedT::value)
+ {
+ fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+ }
+ else
+ {
+ vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
}
+
+ i += KNOB_SIMD_WIDTH;
}
+ pa.Reset();
}
-}
+#endif
-typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
+ AR_END(FEProcessDraw, numPrims * work.numInstances);
+}
-struct ProcessAttributesChooser
+struct FEDrawChooser
{
- typedef PFN_PROCESS_ATTRIBUTES FuncType;
+ typedef PFN_FE_WORK_FUNC FuncType;
template <typename... ArgsB>
static FuncType GetFunc()
{
- return ProcessAttributes<ArgsB...>;
+ return ProcessDraw<ArgsB...>;
}
};
-PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
-{
- return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Processes enabled user clip distances. Loads the active clip
-/// distances from the PA, sets up barycentric equations, and
-/// stores the results to the output buffer
-/// @param pa - Primitive Assembly state
-/// @param primIndex - primitive index to process
-/// @param clipDistMask - mask of enabled clip distances
-/// @param pUserClipBuffer - buffer to store results
-template<uint32_t NumVerts>
-void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
-{
- DWORD clipDist;
- while (_BitScanForward(&clipDist, clipDistMask))
- {
- clipDistMask &= ~(1 << clipDist);
- uint32_t clipSlot = clipDist >> 2;
- uint32_t clipComp = clipDist & 0x3;
- uint32_t clipAttribSlot = clipSlot == 0 ?
- VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
-
- __m128 primClipDist[3];
- pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
-
- float vertClipDist[NumVerts];
- for (uint32_t e = 0; e < NumVerts; ++e)
- {
- OSALIGNSIMD(float) aVertClipDist[4];
- _mm_store_ps(aVertClipDist, primClipDist[e]);
- vertClipDist[e] = aVertClipDist[clipComp];
- };
-
- // setup plane equations for barycentric interpolation in the backend
- float baryCoeff[NumVerts];
- for (uint32_t e = 0; e < NumVerts - 1; ++e)
- {
- baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
- }
- baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
-
- for (uint32_t e = 0; e < NumVerts; ++e)
- {
- *(pUserClipBuffer++) = baryCoeff[e];
- }
- }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert the X,Y coords of a triangle to the requested Fixed
-/// Point precision from FP32.
-template <typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
-{
- simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
- return _simd_cvtps_epi32(vFixed);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Helper function to set the X,Y coords of a triangle to the
-/// requested Fixed Point precision from FP32.
-/// @param tri: simdvector[3] of FP triangle verts
-/// @param vXi: fixed point X coords of tri verts
-/// @param vYi: fixed point Y coords of tri verts
-INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
-{
- vXi[0] = fpToFixedPointVertical(tri[0].x);
- vYi[0] = fpToFixedPointVertical(tri[0].y);
- vXi[1] = fpToFixedPointVertical(tri[1].x);
- vYi[1] = fpToFixedPointVertical(tri[1].y);
- vXi[2] = fpToFixedPointVertical(tri[2].x);
- vYi[2] = fpToFixedPointVertical(tri[2].y);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Calculate bounding box for current triangle
-/// @tparam CT: ConservativeRastFETraits type
-/// @param vX: fixed point X position for triangle verts
-/// @param vY: fixed point Y position for triangle verts
-/// @param bbox: fixed point bbox
-/// *Note*: expects vX, vY to be in the correct precision for the type
-/// of rasterization. This avoids unnecessary FP->fixed conversions.
-template <typename CT>
-INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
-{
- simdscalari vMinX = vX[0];
- vMinX = _simd_min_epi32(vMinX, vX[1]);
- vMinX = _simd_min_epi32(vMinX, vX[2]);
-
- simdscalari vMaxX = vX[0];
- vMaxX = _simd_max_epi32(vMaxX, vX[1]);
- vMaxX = _simd_max_epi32(vMaxX, vX[2]);
-
- simdscalari vMinY = vY[0];
- vMinY = _simd_min_epi32(vMinY, vY[1]);
- vMinY = _simd_min_epi32(vMinY, vY[2]);
-
- simdscalari vMaxY = vY[0];
- vMaxY = _simd_max_epi32(vMaxY, vY[1]);
- vMaxY = _simd_max_epi32(vMaxY, vY[2]);
-
- bbox.xmin = vMinX;
- bbox.xmax = vMaxX;
- bbox.ymin = vMinY;
- bbox.ymax = vMaxY;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
-/// Offsets BBox for conservative rast
-template <>
-INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
-{
- // FE conservative rast traits
- typedef FEConservativeRastT CT;
-
- simdscalari vMinX = vX[0];
- vMinX = _simd_min_epi32(vMinX, vX[1]);
- vMinX = _simd_min_epi32(vMinX, vX[2]);
-
- simdscalari vMaxX = vX[0];
- vMaxX = _simd_max_epi32(vMaxX, vX[1]);
- vMaxX = _simd_max_epi32(vMaxX, vX[2]);
-
- simdscalari vMinY = vY[0];
- vMinY = _simd_min_epi32(vMinY, vY[1]);
- vMinY = _simd_min_epi32(vMinY, vY[2]);
-
- simdscalari vMaxY = vY[0];
- vMaxY = _simd_max_epi32(vMaxY, vY[1]);
- vMaxY = _simd_max_epi32(vMaxY, vY[2]);
-
- /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
- /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
- bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
-}
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
-/// culling, viewport transform, etc.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains triangle position data for SIMDs worth of triangles.
-/// @param primID - Primitive ID for each triangle.
-/// @param viewportIdx - viewport array index for each triangle.
-/// @tparam CT - ConservativeRastFETraits
-template <typename CT>
-void BinTriangles(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector tri[3],
- uint32_t triMask,
- simdscalari primID,
- simdscalari viewportIdx)
+// Selector for correct templated Draw front-end function
+PFN_FE_WORK_FUNC GetProcessDrawFunc(
+ bool IsIndexed,
+ bool IsCutIndexEnabled,
+ bool HasTessellation,
+ bool HasGeometryShader,
+ bool HasStreamOut,
+ bool HasRasterization)
{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(FEBinTriangles, pDC->drawId);
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
- const SWR_FRONTEND_STATE& feState = state.frontendState;
- const SWR_GS_STATE& gsState = state.gsState;
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
-
- simdscalar vRecipW0 = _simd_set1_ps(1.0f);
- simdscalar vRecipW1 = _simd_set1_ps(1.0f);
- simdscalar vRecipW2 = _simd_set1_ps(1.0f);
-
- if (feState.vpTransformDisable)
- {
- // RHW is passed in directly when VP transform is disabled
- vRecipW0 = tri[0].v[3];
- vRecipW1 = tri[1].v[3];
- vRecipW2 = tri[2].v[3];
- }
- else
- {
- // Perspective divide
- vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
- vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
- vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
-
- tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
- tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
- tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
-
- tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
- tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
- tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
-
- tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
- tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
- tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
-
- // Viewport transform to screen space coords
- if (state.gsState.emitsViewportArrayIndex)
- {
- viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
- }
- else
- {
- viewportTransform<3>(tri, state.vpMatrices);
- }
- }
-
- // Adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- tri[0].x = _simd_add_ps(tri[0].x, offset);
- tri[0].y = _simd_add_ps(tri[0].y, offset);
-
- tri[1].x = _simd_add_ps(tri[1].x, offset);
- tri[1].y = _simd_add_ps(tri[1].y, offset);
-
- tri[2].x = _simd_add_ps(tri[2].x, offset);
- tri[2].y = _simd_add_ps(tri[2].y, offset);
-
- simdscalari vXi[3], vYi[3];
- // Set vXi, vYi to required fixed point precision
- FPToFixedPoint(tri, vXi, vYi);
-
- // triangle setup
- simdscalari vAi[3], vBi[3];
- triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
-
- // determinant
- simdscalari vDet[2];
- calcDeterminantIntVertical(vAi, vBi, vDet);
-
- // cull zero area
- int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
- int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
-
- int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
-
- uint32_t origTriMask = triMask;
- // don't cull degenerate triangles if we're conservatively rasterizing
- if(!CT::IsConservativeT::value)
- {
- triMask &= ~cullZeroAreaMask;
- }
-
- // determine front winding tris
- // CW +det
- // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
- maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
- maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
- int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
-
- uint32_t frontWindingTris;
- if (rastState.frontWinding == SWR_FRONTWINDING_CW)
- {
- frontWindingTris = cwTriMask;
- }
- else
- {
- frontWindingTris = ~cwTriMask;
- }
-
- // cull
- uint32_t cullTris;
- switch ((SWR_CULLMODE)rastState.cullMode)
- {
- case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
- case SWR_CULLMODE_NONE: cullTris = 0x0; break;
- case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
- // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
- case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
- default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
- }
-
- triMask &= ~cullTris;
-
- if (origTriMask ^ triMask)
- {
- RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
- }
-
- /// Note: these variable initializations must stay above any 'goto endBenTriangles'
- // compute per tri backface
- uint32_t frontFaceMask = frontWindingTris;
- uint32_t *pPrimID = (uint32_t *)&primID;
- const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
- DWORD triIndex = 0;
- // for center sample pattern, all samples are at pixel center; calculate coverage
- // once at center and broadcast the results in the backend
- const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
- uint32_t edgeEnable;
- PFN_WORK_FUNC pfnWork;
- if(CT::IsConservativeT::value)
- {
- // determine which edges of the degenerate tri, if any, are valid to rasterize.
- // used to call the appropriate templated rasterizer function
- if(cullZeroAreaMask > 0)
- {
- // e0 = v1-v0
- simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
- simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
- uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
-
- // e1 = v2-v1
- simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
- simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
- uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
-
- // e2 = v0-v2
- // if v0 == v1 & v1 == v2, v0 == v2
- uint32_t e2Mask = e0Mask & e1Mask;
- SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
-
- // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
- // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
- e0Mask = pdep_u32(e0Mask, 0x00249249);
- // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
- e1Mask = pdep_u32(e1Mask, 0x00492492);
- // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
- e2Mask = pdep_u32(e2Mask, 0x00924924);
-
- edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
- }
- else
- {
- edgeEnable = 0x00FFFFFF;
- }
- }
- else
- {
- // degenerate triangles won't be sent to rasterizer; just enable all edges
- pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
- (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
- (state.scissorsTileAligned == false));
- }
-
- if (!triMask)
- {
- goto endBinTriangles;
- }
-
- // Calc bounding box of triangles
- simdBBox bbox;
- calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
-
- // determine if triangle falls between pixel centers and discard
- // only discard for non-MSAA case and when conservative rast is disabled
- // (xmin + 127) & ~255
- // (xmax + 128) & ~255
- if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
- {
- origTriMask = triMask;
-
- int cullCenterMask;
- {
- simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
- xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
- simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
- xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
-
- simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
-
- simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
- ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
- simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
- ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
-
- simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
- vMaskV = _simd_or_si(vMaskH, vMaskV);
- cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
- }
-
- triMask &= ~cullCenterMask;
-
- if(origTriMask ^ triMask)
- {
- RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
- }
- }
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- // Gather the AOS effective scissor rects based on the per-prim VP index.
- /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.gsState.emitsViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
-
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
-
- if(CT::IsConservativeT::value)
- {
- // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
- // some area. Bump the xmax/ymax edges out
- simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
- bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
- simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
- bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
- }
-
- // Cull tris completely outside scissor
- {
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
- triMask = triMask & ~maskOutsideScissor;
- }
-
- if (!triMask)
- {
- goto endBinTriangles;
- }
-
- // Convert triangle bbox to macrotile units.
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
- _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
- _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
- _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
-
- // transpose verts needed for backend
- /// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
- vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
- vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
- vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
- vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
- {
- simdvector vRtai[3];
- pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
- simdscalari vRtaii;
- vRtaii = _simd_castps_si(vRtai[0].x);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- // scan remaining valid triangles and bin each separately
- while (_BitScanForward(&triIndex, triMask))
- {
- uint32_t linkageCount = state.backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- bool isDegenerate;
- if(CT::IsConservativeT::value)
- {
- // only rasterize valid edges if we have a degenerate primitive
- int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
- work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
- (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
- (state.scissorsTileAligned == false));
-
- // Degenerate triangles are required to be constant interpolated
- isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
- }
- else
- {
- isDegenerate = false;
- work.pfnWork = pfnWork;
- }
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
- desc.triFlags.primID = pPrimID[triIndex];
- desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
- desc.triFlags.viewportIndex = pViewportIndex[triIndex];
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.pAttribs = pAttribs;
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
-
- // store triangle vertex data
- desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
- ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
- }
-
- for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
- triMask &= ~(1 << triIndex);
- }
-
-endBinTriangles:
- AR_END(FEBinTriangles, 1);
-}
-
-struct FEBinTrianglesChooser
-{
- typedef PFN_PROCESS_PRIMS FuncType;
-
- template <typename... ArgsB>
- static FuncType GetFunc()
- {
- return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
- }
-};
-
-// Selector for correct templated BinTrinagles function
-PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
-{
- return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD points to the backend. Only supports point size of 1
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains point position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each point.
-void BinPoints(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector prim[3],
- uint32_t primMask,
- simdscalari primID,
- simdscalari viewportIdx)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(FEBinPoints, pDC->drawId);
-
- simdvector& primVerts = prim[0];
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_FRONTEND_STATE& feState = state.frontendState;
- const SWR_GS_STATE& gsState = state.gsState;
- const SWR_RASTSTATE& rastState = state.rastState;
- const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
- if (!feState.vpTransformDisable)
- {
- // perspective divide
- simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
- primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
- primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
- primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
-
- // viewport transform to screen coords
- if (state.gsState.emitsViewportArrayIndex)
- {
- viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
- }
- else
- {
- viewportTransform<1>(&primVerts, state.vpMatrices);
- }
- }
-
- // adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- primVerts.x = _simd_add_ps(primVerts.x, offset);
- primVerts.y = _simd_add_ps(primVerts.y, offset);
-
- // convert to fixed point
- simdscalari vXi, vYi;
- vXi = fpToFixedPointVertical(primVerts.x);
- vYi = fpToFixedPointVertical(primVerts.y);
-
- if (CanUseSimplePoints(pDC))
- {
- // adjust for ymin-xmin rule
- vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
- vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
-
- // cull points off the ymin-xmin edge of the viewport
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
-
- // compute macro tile coordinates
- simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMacroX, macroX);
- _simd_store_si((simdscalari*)aMacroY, macroY);
-
- // compute raster tile coordinates
- simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
- simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-
- // compute raster tile relative x,y for coverage mask
- simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
- simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
-
- simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
- simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
-
- OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
- _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
-
- OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
- _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
-
- OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
- _simd_store_ps((float*)aZ, primVerts.z);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
- {
- simdvector vRtai;
- pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai.x);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- uint32_t *pPrimID = (uint32_t *)&primID;
- DWORD primIndex = 0;
-
- const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
-
- // scan remaining valid triangles and bin each separately
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- // points are always front facing
- desc.triFlags.frontFacing = 1;
- desc.triFlags.primID = pPrimID[primIndex];
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
- desc.triFlags.viewportIndex = pViewportIndex[primIndex];
-
- work.pfnWork = RasterizeSimplePoint;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store attributes
- float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
- desc.pAttribs = pAttribs;
- desc.numAttribs = linkageCount;
-
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
-
- // store raster tile aligned x, y, perspective correct z
- float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
- desc.pTriBuffer = pTriBuffer;
- *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
- *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
- *pTriBuffer = aZ[primIndex];
-
- uint32_t tX = aTileRelativeX[primIndex];
- uint32_t tY = aTileRelativeY[primIndex];
-
- // pack the relative x,y into the coverageMask, the rasterizer will
- // generate the true coverage mask from it
- work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
-
- // bin it
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
- }
- primMask &= ~(1 << primIndex);
- }
- }
- else
- {
- // non simple points need to be potentially binned to multiple macro tiles
- simdscalar vPointSize;
- if (rastState.pointParam)
- {
- simdvector size[3];
- pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
- vPointSize = size[0].x;
- }
- else
- {
- vPointSize = _simd_set1_ps(rastState.pointSize);
- }
-
- // bloat point to bbox
- simdBBox bbox;
- bbox.xmin = bbox.xmax = vXi;
- bbox.ymin = bbox.ymax = vYi;
-
- simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
- simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
- bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
- bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
- bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
- bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- // Gather the AOS effective scissor rects based on the per-prim VP index.
- /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.gsState.emitsViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
-
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
-
- // Cull bloated points completely outside scissor
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
- primMask = primMask & ~maskOutsideScissor;
-
- // Convert bbox to macrotile units.
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
- _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
- _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
- _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
- {
- simdvector vRtai[2];
- pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
- _simd_store_ps((float*)aPointSize, vPointSize);
-
- uint32_t *pPrimID = (uint32_t *)&primID;
-
- OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
-
- _simd_store_ps((float*)aPrimVertsX, primVerts.x);
- _simd_store_ps((float*)aPrimVertsY, primVerts.y);
- _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
-
- // scan remaining valid prims and bin each separately
- const SWR_BACKEND_STATE& backendState = state.backendState;
- DWORD primIndex;
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = 1;
- desc.triFlags.primID = pPrimID[primIndex];
- desc.triFlags.pointSize = aPointSize[primIndex];
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
- desc.triFlags.viewportIndex = pViewportIndex[primIndex];
-
- work.pfnWork = RasterizeTriPoint;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
-
- // store point vertex data
- float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
- desc.pTriBuffer = pTriBuffer;
- *pTriBuffer++ = aPrimVertsX[primIndex];
- *pTriBuffer++ = aPrimVertsY[primIndex];
- *pTriBuffer = aPrimVertsZ[primIndex];
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
- ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
- }
-
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
- for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
-
- primMask &= ~(1 << primIndex);
- }
- }
-
- AR_END(FEBinPoints, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD lines to the backend.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains line position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each line.
-/// @param viewportIdx - Viewport Array Index for each line.
-void BinLines(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector prim[],
- uint32_t primMask,
- simdscalari primID,
- simdscalari viewportIdx)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(FEBinLines, pDC->drawId);
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
- const SWR_FRONTEND_STATE& feState = state.frontendState;
- const SWR_GS_STATE& gsState = state.gsState;
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
- simdscalar vRecipW0 = _simd_set1_ps(1.0f);
- simdscalar vRecipW1 = _simd_set1_ps(1.0f);
-
- if (!feState.vpTransformDisable)
- {
- // perspective divide
- vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
- vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
-
- prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
- prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
-
- prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
- prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
-
- prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
- prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
-
- // viewport transform to screen coords
- if (state.gsState.emitsViewportArrayIndex)
- {
- viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
- }
- else
- {
- viewportTransform<2>(prim, state.vpMatrices);
- }
- }
-
- // adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- prim[0].x = _simd_add_ps(prim[0].x, offset);
- prim[0].y = _simd_add_ps(prim[0].y, offset);
-
- prim[1].x = _simd_add_ps(prim[1].x, offset);
- prim[1].y = _simd_add_ps(prim[1].y, offset);
-
- // convert to fixed point
- simdscalari vXi[2], vYi[2];
- vXi[0] = fpToFixedPointVertical(prim[0].x);
- vYi[0] = fpToFixedPointVertical(prim[0].y);
- vXi[1] = fpToFixedPointVertical(prim[1].x);
- vYi[1] = fpToFixedPointVertical(prim[1].y);
-
- // compute x-major vs y-major mask
- simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
- simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
- simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
- uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
-
- // cull zero-length lines
- simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
- vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
-
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
-
- uint32_t *pPrimID = (uint32_t *)&primID;
- const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
-
- simdscalar vUnused = _simd_setzero_ps();
-
- // Calc bounding box of lines
- simdBBox bbox;
- bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
- bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
- bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
- bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
-
- // bloat bbox by line width along minor axis
- simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
- simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
- simdBBox bloatBox;
- bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
- bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
- bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
- bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
-
- bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
- bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
- bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
- bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.gsState.emitsViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
-
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
-
- // Cull prims completely outside scissor
- {
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
- primMask = primMask & ~maskOutsideScissor;
- }
-
- if (!primMask)
- {
- goto endBinLines;
- }
-
- // Convert triangle bbox to macrotile units.
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
- _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
- _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
- _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
-
- // transpose verts needed for backend
- /// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
- vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
- vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
- vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
- vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
- {
- simdvector vRtai[2];
- pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- // scan remaining valid prims and bin each separately
- DWORD primIndex;
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = state.backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = 1;
- desc.triFlags.primID = pPrimID[primIndex];
- desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
- desc.triFlags.viewportIndex = pViewportIndex[primIndex];
-
- work.pfnWork = RasterizeLine;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
-
- // store line vertex data
- desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
- ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
- }
-
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
- for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
-
- primMask &= ~(1 << primIndex);
- }
-
-endBinLines:
-
- AR_END(FEBinLines, 1);
+ return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}