THREAD SWR_GS_CONTEXT tlsGsContext;
-#if USE_SIMD16_FRONTEND
-THREAD simd16vertex tempVertex_simd16[128];
-
-#endif
template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
struct GsBufferInfo
{
tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
}
+#if USE_SIMD16_FRONTEND
+ const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
+#else
const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
+#endif
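Parameterizing the buffer-info helper on vertex type and SIMD width lets one template serve both paths. The struct body is elided from this hunk; a minimal sketch of the idea, with an illustrative member name:

    template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
    struct GsBufferInfo
    {
        GsBufferInfo(const SWR_GS_STATE &gsState)
        {
            // round the GS's max emitted verts up to whole SIMD batches
            const uint32_t numSimdBatches =
                (gsState.maxNumVerts + SIMD_WIDTH - 1) / SIMD_WIDTH;

            // per-primitive output size; the simd16 instantiation doubles
            // automatically through sizeof(SIMDVERTEX)
            vertexPrimitiveStride = numSimdBatches * (uint32_t)sizeof(SIMDVERTEX);
        }

        uint32_t vertexPrimitiveStride; // illustrative member name
    };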
// record valid prims from the frontend to avoid over binning the newly generated
// prims from the GS
}
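(The statement this comment introduces is elided from the hunk.) The frontend may have assembled fewer than a full SIMD batch of input primitives, and only those lanes may feed GS output to the binner. A sketch of the intent, assuming the PA's NumPrims() accessor and a hypothetical BinGsPrim helper:

    // snapshot the valid-prim count before the GS expands the stream
    const uint32_t numInputPrims = pa.NumPrims();

    // later, bin GS output only for lanes that held a valid input prim
    for (uint32_t prim = 0; prim < numInputPrims; ++prim)
    {
        BinGsPrim(prim); // hypothetical stand-in for the real binning call
    }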
#if USE_SIMD16_FRONTEND
- // TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex
-
- SWR_ASSERT(numEmittedVerts <= 256);
-
- PackPairsOfSimdVertexIntoSimd16Vertex(
- tempVertex_simd16,
- reinterpret_cast<const simdvertex *>(pBase),
- numEmittedVerts,
- SWR_VTX_NUM_SLOTS);
-
-#endif
-#if USE_SIMD16_FRONTEND
- PA_STATE_CUT gsPa(pDC, reinterpret_cast<uint8_t *>(tempVertex_simd16), numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+ PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
#else
PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
#endif
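With the GS now emitting simd16vertex directly, the TEMPORARY transpose above and the 128-entry thread-local tempVertex_simd16 buffer removed at the top of this patch are dead code: both the simd8 and simd16 paths hand pBase straight to the PA. For reference, the removed staging step amounted to the following, sketched with illustrative src8/dst16 names standing in for pBase and tempVertex_simd16 (odd-count tail handling elided):

    // interleave pairs of 8-wide vertex batches into 16-wide batches
    for (uint32_t v = 0; v + 1 < numEmittedVerts; v += 2)
    {
        for (uint32_t slot = 0; slot < SWR_VTX_NUM_SLOTS; ++slot)
        {
            for (uint32_t comp = 0; comp < 4; ++comp)
            {
                dst16[v / 2].attrib[slot][comp].lo = src8[v    ].attrib[slot][comp]; // lanes 0..7
                dst16[v / 2].attrib[slot][comp].hi = src8[v + 1].attrib[slot][comp]; // lanes 8..15
            }
        }
    }

The remaining hunks are against the LLVM-side vertex-emit code, which must write the stream in the same layout the PA now reads.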
IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder)));
+#if USE_SIMD16_FRONTEND
+ const uint32_t simdVertexStride = sizeof(simdvertex) * 2;
+ const uint32_t numSimdBatches = (pGS->maxNumVerts + (mVWidth * 2) - 1) / (mVWidth * 2);
+#else
const uint32_t simdVertexStride = sizeof(simdvertex);
- const uint32_t numSimdBatches = (pGS->maxNumVerts + 7) / 8;
+ const uint32_t numSimdBatches = (pGS->maxNumVerts + mVWidth - 1) / mVWidth;
+#endif
const uint32_t inputPrimStride = numSimdBatches * simdVertexStride;
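The old (pGS->maxNumVerts + 7) / 8 baked in an 8-wide SIMD; both branches now round up to whole batches of the actual width, with the simd16 branch doubling both the batch width and the per-batch stride. Worked numbers, assuming mVWidth == 8:

    // a GS declaring maxNumVerts == 18:
    //   simd8 : numSimdBatches = (18 +  7) /  8 = 3, stride = sizeof(simdvertex)
    //   simd16: numSimdBatches = (18 + 15) / 16 = 2, stride = sizeof(simdvertex) * 2
    // either way, inputPrimStride = numSimdBatches * simdVertexStride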
Value *pStream = LOAD(iface->pGsCtx, { 0, SWR_GS_CONTEXT_pStream });
Value *vOffsets = C({ inputPrimStride * 0,
                      inputPrimStride * 1,
                      inputPrimStride * 2,
                      inputPrimStride * 3,
                      inputPrimStride * 4,
                      inputPrimStride * 5,
                      inputPrimStride * 6,
                      inputPrimStride * 7 } );
- Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), 3);
- Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), 7);
+#if USE_SIMD16_FRONTEND
+ const uint32_t simdShift = log2(mVWidth * 2);
+ Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), (mVWidth * 2) - 1);
+#else
+ const uint32_t simdShift = log2(mVWidth);
+ Value *vSimdSlot = AND(unwrap(emitted_vertices_vec), mVWidth - 1);
+#endif
+ Value *vVertexSlot = ASHR(unwrap(emitted_vertices_vec), simdShift);
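Likewise, the literal >> 3 / & 7 only matched an 8-wide batch. Deriving the shift and mask from the width keeps the split exact for both layouts (the width is a power of two, so log2() is lossless). A scalar mirror of the split, under that power-of-two assumption:

    #include <cmath>

    // split an emitted-vertex index into (SIMD batch, lane) for batch width w
    static inline void SplitVertexIndex(uint32_t index, uint32_t w,
                                        uint32_t &vertexSlot, uint32_t &simdSlot)
    {
        vertexSlot = index >> (uint32_t)log2(w); // which SIMD batch of the stream
        simdSlot   = index & (w - 1);            // which lane within that batch
    }

    // e.g. w == 16, index == 21 -> vertexSlot == 1, simdSlot == 5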
for (uint32_t attrib = 0; attrib < iface->num_outputs; ++attrib) {
uint32_t attribSlot = attrib;
else if (iface->info->output_semantic_name[attrib] == TGSI_SEMANTIC_LAYER)
attribSlot = VERTEX_RTAI_SLOT;
+#if USE_SIMD16_FRONTEND
+ Value *vOffsetsAttrib =
+ ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex) * 2)));
+ vOffsetsAttrib =
+ ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector) * 2)));
+#else
Value *vOffsetsAttrib =
ADD(vOffsets, MUL(vVertexSlot, VIMMED1((uint32_t)sizeof(simdvertex))));
vOffsetsAttrib =
ADD(vOffsetsAttrib, VIMMED1((uint32_t)(attribSlot*sizeof(simdvector))));
+#endif
vOffsetsAttrib =
ADD(vOffsetsAttrib, MUL(vSimdSlot, VIMMED1((uint32_t)sizeof(float))));
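The three ADDs above compose the byte offset of a single output float inside one primitive's stream; the simd16 branch doubles the vertex and attribute strides because the rows are simd16vertex/simd16vector. A hypothetical scalar mirror of the simd8 math, which the scatter below then applies per lane on top of each lane's inputPrimStride base from vOffsets:

    static inline uint32_t GsOutputOffset(uint32_t vertexSlot,
                                          uint32_t attribSlot,
                                          uint32_t simdSlot)
    {
        return vertexSlot * (uint32_t)sizeof(simdvertex)  // seek to the vertex batch
             + attribSlot * (uint32_t)sizeof(simdvector)  // seek to the attribute row
             + simdSlot   * (uint32_t)sizeof(float);      // seek to this prim's lane
    }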
MASKED_SCATTER(vData, vPtrs, 32, vMask1);
+#if USE_SIMD16_FRONTEND
+ vOffsetsAttrib =
+ ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar) * 2));
+#else
vOffsetsAttrib =
ADD(vOffsetsAttrib, VIMMED1((uint32_t)sizeof(simdscalar)));
+#endif
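After each component is scattered, the offset advances by one SIMD row of floats so the next iteration writes component c + 1; the row is twice as wide in the simd16 layout:

    //   simd8 : offset += sizeof(simdscalar)      // one  8-wide float row
    //   simd16: offset += sizeof(simdscalar) * 2  // one 16-wide float row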
}
}
}