THREAD SWR_GS_CONTEXT tlsGsContext;
-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-struct GsBufferInfo
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
{
- GsBufferInfo(const SWR_GS_STATE &gsState)
- {
- const uint32_t vertexCount = gsState.maxNumVerts;
- const uint32_t vertexStride = sizeof(SIMDVERTEX);
- const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
+ uint8_t* pGsIn;                    // assembled input vertices (SOA simdvectors) handed to the GS
+ uint8_t* pGsOut[KNOB_SIMD_WIDTH];  // one output stream buffer per SIMD lane / input primitive
+ uint8_t* pGsTransposed;            // SIMD-transposed output vertices fed to the primitive assembler
+ void* pStreamCutBuffer;            // scratch cut buffer for multi-stream StreamID translation
+};
- vertexPrimitiveStride = vertexStride * numSimdBatches;
- vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+//////////////////////////////////////////////////////////////////////////
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices output by the GS
+/// @param numAttribs - Number of attributes per vertex
+template<typename SIMD_T, uint32_t SimdWidth>
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
+{
+ uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+ uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;
- if (gsState.isSingleStream)
- {
- cutPrimitiveStride = (vertexCount + 7) / 8;
- cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+ OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
- streamCutPrimitiveStride = 0;
- streamCutInstanceStride = 0;
- }
- else
- {
- cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
- cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
-
- streamCutPrimitiveStride = (vertexCount + 7) / 8;
- streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
- }
+ for (uint32_t i = 0; i < SimdWidth; ++i)
+ {
+ gatherOffsets[i] = srcVertexStride * i;
}
+ auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);
- uint32_t vertexPrimitiveStride;
- uint32_t vertexInstanceStride;
+ uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+ uint32_t remainingVerts = numVerts;
- uint32_t cutPrimitiveStride;
- uint32_t cutInstanceStride;
+ for (uint32_t s = 0; s < numSimd; ++s)
+ {
+ uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+ uint8_t* pDstBase = pDst + s * dstVertexStride;
- uint32_t streamCutPrimitiveStride;
- uint32_t streamCutInstanceStride;
-};
+ // Compute mask to prevent src overflow
+ uint32_t mask = std::min(remainingVerts, SimdWidth);
+ mask = GenMask(mask);
+ auto vMask = SIMD_T::vmask_ps(mask);
+ auto viMask = SIMD_T::castps_si(vMask);
+
+ for (uint32_t a = 0; a < numAttribs; ++a)
+ {
+ auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
+ auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
+ auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
+ auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);
+
+ SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
+ SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
+ SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);
+ SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);
+
+ pSrcBase += sizeof(float) * 4;
+ pDstBase += sizeof(typename SIMD_T::Float) * 4;
+ }
+ remainingVerts -= SimdWidth;
+ }
+}
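For reference, a minimal scalar sketch of what the transpose above computes, assuming the buffer shapes implied by the strides (source: per-vertex packed float4 attributes as written by the GS; destination: one SIMD-wide vector per channel, per attribute, per batch). The helper name is illustrative and not part of the patch.

#include <cstdint>

// Scalar equivalent of TransposeSOAtoAOS<SIMD_T, SimdWidth>; simdWidth stands in for the
// SimdWidth template parameter.
static void TransposeSOAtoAOS_ScalarRef(float* pDst, const float* pSrc,
                                        uint32_t numVerts, uint32_t numAttribs, uint32_t simdWidth)
{
    const uint32_t numBatches = (numVerts + simdWidth - 1) / simdWidth;
    for (uint32_t batch = 0; batch < numBatches; ++batch)
    {
        for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
        {
            for (uint32_t chan = 0; chan < 4; ++chan)            // x, y, z, w
            {
                for (uint32_t lane = 0; lane < simdWidth; ++lane)
                {
                    const uint32_t vert = batch * simdWidth + lane;
                    if (vert >= numVerts)
                        continue;                                // masked off by vMask/viMask above
                    // source: vertex-major, then attribute, then channel
                    const float v = pSrc[(vert * numAttribs + attrib) * 4 + chan];
                    // destination: batch-major, then attribute, then channel, then lane
                    pDst[((batch * numAttribs + attrib) * 4 + chan) * simdWidth + lane] = v;
                }
            }
        }
    }
}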
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
DRAW_CONTEXT *pDC,
uint32_t workerId,
PA_STATE& pa,
- void* pGsOut,
- void* pCutBuffer,
- void* pStreamCutBuffer,
+ GsBuffers* pGsBuffers,
uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
uint32_t numPrims_simd8,
const API_STATE& state = GetApiState(pDC);
const SWR_GS_STATE* pState = &state.gsState;
- SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
- SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
+ static uint8_t sNullBuffer[1024] = { 0 };
- tlsGsContext.pStream = (uint8_t*)pGsOut;
- tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+ {
+ tlsGsContext.pStreams[i] = pGsBuffers->pGsOut[i];
+ }
+ tlsGsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
tlsGsContext.PrimitiveID = primID;
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
// assemble all attributes for the input primitive
+ tlsGsContext.inputVertStride = pState->inputVertStride;
for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
{
+ uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
uint32_t attribSlot = pState->vertexAttribOffset + slot;
- pa.Assemble(attribSlot, attrib);
+ pa.Assemble(srcAttribSlot, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];
+ tlsGsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
}
}
pa.Assemble(VERTEX_POSITION_SLOT, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
+ tlsGsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
}
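The indexing above treats pVerts as a flat array of simdvector slots, with each input vertex occupying inputVertStride consecutive slots; the jitted fetch in swr_gs_llvm_fetch_input below recomputes the same index. A hypothetical helper spelling out the formula (not part of the patch):

#include <cstdint>

// Index of attribute slot 'slot' of input vertex 'vert' in tlsGsContext.pVerts
// (slot is VERTEX_POSITION_SLOT for position, vertexAttribOffset + n for user attribute n).
static inline uint32_t GsInputSlot(uint32_t vert, uint32_t slot, uint32_t inputVertStride)
{
    return slot + inputVertStride * vert;
}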
-#if USE_SIMD16_FRONTEND
- const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
-#else
- const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
-#endif
-
// record valid prims from the frontend to avoid over binning the newly generated
// prims from the GS
#if USE_SIMD16_FRONTEND
// execute the geometry shader
state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
- tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
- tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+ {
+ tlsGsContext.pStreams[i] += pState->allocationSize;
+ }
}
// set up new binner and state for the GS output topology
// foreach input prim:
// - setup a new PA based on the emitted verts for that prim
// - loop over the new verts, calling PA to assemble each prim
- uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
uint32_t* pPrimitiveId = (uint32_t*)&primID;
uint32_t totalPrimsGenerated = 0;
for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
{
- uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
- uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
+ uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
+
+ // Vertex count is either emitted by shader or static
+ uint32_t vertexCount = 0;
+ if (pState->staticVertexCount)
+ {
+ vertexCount = pState->staticVertexCount;
+ }
+ else
+ {
+ // If emitted in the shader, it is stored in the first dword of the output buffer
+ vertexCount = *(uint32_t*)pInstanceBase;
+ }
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
- uint32_t numEmittedVerts = pVertexCount[inputPrim];
+ uint32_t numEmittedVerts = vertexCount;
if (numEmittedVerts == 0)
{
continue;
}
- uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
- uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
+ uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
+ uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
+ uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
+
+#if USE_SIMD16_FRONTEND
+ TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
+#else
+ TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
+#endif
uint32_t numAttribs = state.feNumAttributes;
for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
{
bool processCutVerts = false;
-
uint8_t* pCutBuffer = pCutBase;
// assign default stream ID, only relevant when GS is outputting a single stream
}
// multi-stream output, need to translate StreamID buffer to a cut buffer
- ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
- pCutBuffer = (uint8_t*)pStreamCutBuffer;
+ ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
+ pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
processCutVerts = false;
}
#if USE_SIMD16_FRONTEND
- PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+ PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
#else
- PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+ PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
#endif
while (gsPa.GetNextStreamOutput())
/// @param state - API state
-/// @param ppGsOut - pointer to GS output buffer allocation
-/// @param ppCutBuffer - pointer to GS output cut buffer allocation
+/// @param vertsPerPrim - number of vertices per input primitive
+/// @param pGsBuffers - GsBuffers to fill with the arena allocations
-template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
-static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
- void **ppStreamCutBuffer)
+template<typename SIMD_T, uint32_t SIMD_WIDTH>
+static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
{
auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
SWR_ASSERT(state.gsState.gsEnable);
- // allocate arena space to hold GS output verts
- // @todo pack attribs
- // @todo support multiple streams
+ const SWR_GS_STATE& gsState = state.gsState;
- const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
+ // Allocate storage for vertex inputs
+ uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
+ pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
- const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
+ // Allocate arena space to hold GS output verts
+ const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
- *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+ {
+ pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
+ }
- // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
- // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
+ // Allocate storage for transposed GS output
+ uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
+ uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);
+ pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
- // allocate space for temporary per-stream cut buffer if multi-stream is enabled
+ // Allocate storage to hold temporary stream->cut buffer, if necessary
if (state.gsState.isSingleStream)
{
- const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
-
- *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
- *ppStreamCutBuffer = nullptr;
+ pGsBuffers->pStreamCutBuffer = nullptr;
}
else
{
- const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
- const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
-
- *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
- *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
+ pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
}
}
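A worked example of the sizes computed above, using assumed values (triangle input, inputVertStride of 32, 8-wide simdvector of 128 bytes, SIMD16 frontend, maxNumVerts of 256, outputVertexSize of 32 float4 slots); none of these numbers come from the patch itself.

//   vertexInBufferSize   = inputVertStride * sizeof(simdvector) * vertsPerPrim
//                        = 32 * 128 * 3                          = 12288 bytes
//   vertexBufferSize     = instanceCount * allocationSize, allocated once per SIMD lane (pGsOut[])
//   numSimdBatches       = AlignUp(256, 16) / 16                 = 16
//   transposedBufferSize = numSimdBatches * outputVertexSize * sizeof(SIMD512::Vec4)
//                        = 16 * 32 * 256                         = 131072 bytes
//   pStreamCutBuffer     = AlignUp(maxNumVerts * 2, 32)          = 512 bytes (multi-stream only)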
DRAW_CONTEXT *pDC,
uint32_t workerId,
PA_STATE& pa,
- void* pGsOut,
- void* pCutBuffer,
- void* pCutStreamBuffer,
+ GsBuffers* pGsBuffers,
uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
uint32_t numPrims_simd8,
{
#if USE_SIMD16_FRONTEND
tessPa.useAlternateOffset = false;
- GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
if (numPrims_hi)
{
tessPa.useAlternateOffset = true;
- GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
}
#else
GeometryShaderStage<HasStreamOutT, HasRastT>(
- pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
- _simd_set1_epi32(dsContext.PrimitiveID));
+ pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
#endif
}
else
uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif
- void* pGsOut = nullptr;
- void* pCutBuffer = nullptr;
- void* pStreamCutBuffer = nullptr;
+ GsBuffers gsBuffers;
if (HasGeometryShaderT::value)
{
#if USE_SIMD16_FRONTEND
- AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+ AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#else
- AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+ AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
#endif
}
if (HasTessellationT::value)
{
pa.useAlternateOffset = false;
- TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
if (numPrims_hi)
{
pa.useAlternateOffset = true;
- TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
}
}
else if (HasGeometryShaderT::value)
{
pa.useAlternateOffset = false;
- GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
if (numPrims_hi)
{
pa.useAlternateOffset = true;
- GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
}
}
else
if (HasTessellationT::value)
{
TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
- pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
}
else if (HasGeometryShaderT::value)
{
GeometryShaderStage<HasStreamOutT, HasRastT>(
- pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
}
else
{
Value *attrib =
LOAD(GEP(iface->pVtxAttribMap, {C(0), unwrap(attrib_index)}));
- Value *pInput =
- LOAD(GEP(iface->pGsCtx,
- {C(0),
- C(SWR_GS_CONTEXT_vert),
- unwrap(vertex_index),
- C(0),
- attrib,
- unwrap(swizzle_index)}));
+ Value *pVertex = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pVerts});
+ Value *pInputVertStride = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_inputVertStride});
+
+ Value *pVector = ADD(MUL(unwrap(vertex_index), pInputVertStride), attrib);
+
+ Value *pInput = LOAD(GEP(pVertex, {pVector, unwrap(swizzle_index)}));
return wrap(pInput);
}
+// GS output stream layout
+#define VERTEX_COUNT_SIZE 32
+#define CONTROL_HEADER_SIZE (8*32)
+
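These two constants fix the header of each per-lane output stream; together with the offsets programmed into SWR_GS_STATE further down in this patch, the layout the patch relies on looks roughly like this (sketch, not authoritative):

// Per-lane GS output stream (pGsBuffers->pGsOut[lane], one per SIMD lane):
//
//   [0, VERTEX_COUNT_SIZE)                         DWORD vertex count, padded to 32 bytes
//   [VERTEX_COUNT_SIZE, +CONTROL_HEADER_SIZE)      cut / StreamID control bits
//   [outputVertexOffset, ...)                      emitted vertices, SWR_VTX_NUM_SLOTS packed float4s each
//   ... the whole block repeats per GS instance at allocationSize stride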
void
BuilderSWR::swr_gs_llvm_emit_vertex(const struct lp_build_tgsi_gs_iface *gs_base,
struct lp_build_tgsi_context * bld_base,
LLVMValueRef emitted_vertices_vec)
{
swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;
- SWR_GS_STATE *pGS = iface->pGsState;
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
-#if USE_SIMD16_FRONTEND
- const uint32_t simdVertexStride = sizeof(simdvertex) * 2;
- const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2);
-#else
- const uint32_t simdVertexStride = sizeof(simdvertex);
- const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth;
-#endif
- const uint32_t inputPrimStride = numSimdBatches * simdVertexStride;
-
- Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream });
- Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });
- Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));
+ const uint32_t headerSize = VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE;
+ const uint32_t attribSize = 4 * sizeof(float);
+ const uint32_t vertSize = attribSize * SWR_VTX_NUM_SLOTS;
+ Value *pVertexOffset = MUL(unwrap(emitted_vertices_vec), VIMMED1(vertSize));
- Value *vOffsets = C({
- inputPrimStride * 0,
- inputPrimStride * 1,
- inputPrimStride * 2,
- inputPrimStride * 3,
- inputPrimStride * 4,
- inputPrimStride * 5,
- inputPrimStride * 6,
- inputPrimStride * 7 } );
+ Value *vMask = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_mask});
+ Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, mVWidth));
-#if USE_SIMD16_FRONTEND
- const uint32_t simdShift = log2(mVWidth * 2);
- Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1);
-#else
- const uint32_t simdShift = log2(mVWidth);
- Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1);
-#endif
- Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift);
+ Value *pStack = STACKSAVE();
+ Value *pTmpPtr = ALLOCA(mFP32Ty, C(4)); // used for dummy write for lane masking
for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {
uint32_t attribSlot = attrib;
}
}
-#if USE_SIMD16_FRONTEND
- Value *vOffsetsAttrib =
- ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2)));
- vOffsetsAttrib =
- ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2)));
-#else
- Value *vOffsetsAttrib =
- ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex))));
- vOffsetsAttrib =
- ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector))));
-#endif
- vOffsetsAttrib =
- ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));
+ Value *pOutputOffset = ADD(pVertexOffset, VIMMED1(headerSize + attribSize * attribSlot)); // + sgvChannel ?
- for (uint32_t channel = 0; channel < 4; ++channel) {
- Value *vPtrs = GEP(pStream, vOffsetsAttrib);
- Value *vData;
+ for (uint32_t lane = 0; lane < mVWidth; ++lane) {
+ Value *pLaneOffset = VEXTRACT(pOutputOffset, C(lane));
+ Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
+ Value *pStreamOffset = GEP(pStream, pLaneOffset);
+ pStreamOffset = BITCAST(pStreamOffset, mFP32PtrTy);
- if (attribSlot == VERTEX_SGV_SLOT)
- vData = LOAD(unwrap(outputs[attrib][0]));
- else
- vData = LOAD(unwrap(outputs[attrib][channel]));
+ Value *pLaneMask = VEXTRACT(vMask1, C(lane));
+ pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);
- if (attribSlot != VERTEX_SGV_SLOT ||
- sgvChannel == channel) {
- vPtrs = BITCAST(vPtrs,
- VectorType::get(PointerType::get(mFP32Ty, 0), 8));
+ for (uint32_t channel = 0; channel < 4; ++channel) {
+ Value *vData;
- MASKED_SCATTER(vData, vPtrs, 32, vMask1);
- }
+ if (attribSlot == VERTEX_SGV_SLOT)
+ vData = LOAD(unwrap(outputs[attrib][0]));
+ else
+ vData = LOAD(unwrap(outputs[attrib][channel]));
-#if USE_SIMD16_FRONTEND
- vOffsetsAttrib =
- ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2));
-#else
- vOffsetsAttrib =
- ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar)));
-#endif
+ if (attribSlot != VERTEX_SGV_SLOT ||
+ sgvChannel == channel) {
+ vData = VEXTRACT(vData, C(lane));
+ STORE(vData, pStreamOffset);
+ }
+ pStreamOffset = GEP(pStreamOffset, C(1));
+ }
}
}
+
+ STACKRESTORE(pStack);
}
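The per-lane store loop above resolves to a simple byte offset into pStreams[lane]; written out as scalar arithmetic under the header and vertex sizes defined above (v is the lane's emitted-vertex index; names are illustrative):

//   byteOffset(v, attribSlot, channel) =
//         VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE        // headerSize
//       + v * 4 * sizeof(float) * SWR_VTX_NUM_SLOTS      // pVertexOffset (vertSize per vertex)
//       + attribSlot * 4 * sizeof(float)                 // attribSize per slot
//       + channel * sizeof(float)                        // GEP(pStreamOffset, C(1)) per channel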
void
LLVMValueRef emitted_prims_vec)
{
swr_gs_llvm_iface *iface = (swr_gs_llvm_iface*)gs_base;
- SWR_GS_STATE *pGS = iface->pGsState;
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
- Value *pCutBuffer =
- LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer});
Value *vMask = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_mask });
Value *vMask1 = TRUNC(vMask, VectorType::get(mInt1Ty, 8));
mask = AND(mask, cmpMask);
vMask1 = TRUNC(mask, VectorType::get(mInt1Ty, 8));
- const uint32_t cutPrimStride =
- (pGS->maxNumVerts + JM()->mVWidth - 1) / JM()->mVWidth;
- Value *vOffsets = C({
- (uint32_t)(cutPrimStride * 0),
- (uint32_t)(cutPrimStride * 1),
- (uint32_t)(cutPrimStride * 2),
- (uint32_t)(cutPrimStride * 3),
- (uint32_t)(cutPrimStride * 4),
- (uint32_t)(cutPrimStride * 5),
- (uint32_t)(cutPrimStride * 6),
- (uint32_t)(cutPrimStride * 7) } );
-
vCount = SUB(vCount, VIMMED1(1));
- Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), vOffsets);
+ Value *vOffset = ADD(UDIV(vCount, VIMMED1(8)), VIMMED1(VERTEX_COUNT_SIZE));
Value *vValue = SHL(VIMMED1(1), UREM(vCount, VIMMED1(8)));
vValue = TRUNC(vValue, VectorType::get(mInt8Ty, 8));
- Value *vPtrs = GEP(pCutBuffer, vOffset);
- vPtrs =
- BITCAST(vPtrs, VectorType::get(PointerType::get(mInt8Ty, 0), JM()->mVWidth));
+ Value *pStack = STACKSAVE();
+ Value *pTmpPtr = ALLOCA(mInt8Ty, C(4)); // used for dummy read/write for lane masking
+
+ for (uint32_t lane = 0; lane < mVWidth; ++lane) {
+ Value *vLaneOffset = VEXTRACT(vOffset, C(lane));
+ Value *pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
+ Value *pStreamOffset = GEP(pStream, vLaneOffset);
+
+ Value *pLaneMask = VEXTRACT(vMask1, C(lane));
+ pStreamOffset = SELECT(pLaneMask, pStreamOffset, pTmpPtr);
- Value *vGather = MASKED_GATHER(vPtrs, 32, vMask1);
- vValue = OR(vGather, vValue);
- MASKED_SCATTER(vValue, vPtrs, 32, vMask1);
+ Value *vVal = LOAD(pStreamOffset);
+ vVal = OR(vVal, VEXTRACT(vValue, C(lane)));
+ STORE(vVal, pStreamOffset);
+ }
+
+ STACKRESTORE(pStack);
}
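A scalar sketch of the per-lane read-modify-write above, assuming vCount carries the lane's running emitted-vertex count; the helper is illustrative only, not part of the patch:

#include <cstdint>

// Set the cut bit for the last vertex emitted before this EndPrimitive, in the control
// header that follows the 32-byte vertex-count field.
static void MarkEndOfPrimitive(uint8_t* pStream, uint32_t emittedVerts)
{
    const uint32_t lastVert = emittedVerts - 1;                      // vCount - 1 above
    const uint32_t byteOff  = 32u /* VERTEX_COUNT_SIZE */ + lastVert / 8;
    pStream[byteOff] = uint8_t(pStream[byteOff] | (1u << (lastVert % 8)));
}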
void
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
- STORE(unwrap(total_emitted_vertices_vec), iface->pGsCtx, {0, SWR_GS_CONTEXT_vertexCount});
+ // Store emit count to each output stream in the first DWORD
+ for (uint32_t lane = 0; lane < mVWidth; ++lane)
+ {
+ Value* pStream = LOAD(iface->pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
+ pStream = BITCAST(pStream, mInt32PtrTy);
+ Value* pLaneCount = VEXTRACT(unwrap(total_emitted_vertices_vec), C(lane));
+ STORE(pLaneCount, pStream);
+ }
}
PFN_GS_FUNC
SWR_GS_STATE *pGS = &ctx->gs->gsState;
struct tgsi_shader_info *info = &ctx->gs->info.base;
+ memset(pGS, 0, sizeof(*pGS));
+
pGS->gsEnable = true;
pGS->numInputAttribs = info->num_inputs;
pGS->singleStreamID = 0;
pGS->vertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
+ pGS->srcVertexAttribOffset = VERTEX_ATTRIB_START_SLOT; // TODO: optimize
+ pGS->inputVertStride = pGS->numInputAttribs + pGS->vertexAttribOffset;
+ pGS->outputVertexSize = SWR_VTX_NUM_SLOTS;
+ pGS->controlDataSize = 8; // GS outputs max of 8 32B units
+ pGS->controlDataOffset = VERTEX_COUNT_SIZE;
+ pGS->outputVertexOffset = pGS->controlDataOffset + CONTROL_HEADER_SIZE;
+
+ pGS->allocationSize =
+ VERTEX_COUNT_SIZE + // vertex count
+ CONTROL_HEADER_SIZE + // control header
+ (SWR_VTX_NUM_SLOTS * 16) * // sizeof vertex
+ pGS->maxNumVerts; // num verts
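Worked example of the allocation size, taking maxNumVerts = 256 and SWR_VTX_NUM_SLOTS = 32 as assumed illustration values (SWR_VTX_NUM_SLOTS * 16 is the packed float4 footprint of one vertex in bytes):

//   allocationSize = 32               // VERTEX_COUNT_SIZE
//                  + 8 * 32           // CONTROL_HEADER_SIZE
//                  + (32 * 16) * 256  // vertex footprint * maxNumVerts
//                  = 32 + 256 + 131072 = 131360 bytes, per GS instance, per SIMD lane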
struct swr_geometry_shader *gs = ctx->gs;
lp_type_float_vec(32, 32 * 8), wrap(mask_val));
// zero out cut buffer so we can load/modify/store bits
- MEMSET(LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pCutOrStreamIdBuffer}),
- C((char)0),
- pGS->instanceCount * ((pGS->maxNumVerts + 7) / 8) * JM()->mVWidth,
- sizeof(float) * KNOB_SIMD_WIDTH);
+ for (uint32_t lane = 0; lane < mVWidth; ++lane)
+ {
+ Value* pStream = LOAD(pGsCtx, {0, SWR_GS_CONTEXT_pStreams, lane});
+ MEMSET(pStream, C((char)0), VERTEX_COUNT_SIZE + CONTROL_HEADER_SIZE, sizeof(float) * KNOB_SIMD_WIDTH);
+ }
struct swr_gs_llvm_iface gs_iface;
gs_iface.base.fetch_input = ::swr_gs_llvm_fetch_input;