+ frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
+
+ // cull
+ uint32_t cullTris;
+ switch ((SWR_CULLMODE)rastState.cullMode)
+ {
+ case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
+ case SWR_CULLMODE_NONE: cullTris = 0x0; break;
+ case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
+ // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
+ case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
+ default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
+ }
+
+ triMask &= ~cullTris;
+
+ if (origTriMask ^ triMask)
+ {
+ RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
+ }
+
+ // Simple non-conformant wireframe mode, useful for debugging
+ if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
+ {
+ // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
+ simd16vector line[2];
+ simd16scalar recipW[2];
+ line[0] = tri[0];
+ line[1] = tri[1];
+ recipW[0] = vRecipW0;
+ recipW[1] = vRecipW1;
+ BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+ line[0] = tri[1];
+ line[1] = tri[2];
+ recipW[0] = vRecipW1;
+ recipW[1] = vRecipW2;
+ BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+ line[0] = tri[2];
+ line[1] = tri[0];
+ recipW[0] = vRecipW2;
+ recipW[1] = vRecipW0;
+ BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+ AR_END(FEBinTriangles, 1);
+ return;
+ }
+
+ /// Note: these variable initializations must stay above any 'goto endBinTriangles'
+ // compute per tri backface
+ uint32_t frontFaceMask = frontWindingTris;
+ uint32_t *pPrimID = (uint32_t *)&primID;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
+ DWORD triIndex = 0;
+
+ uint32_t edgeEnable;
+ PFN_WORK_FUNC pfnWork;
+ if (CT::IsConservativeT::value)
+ {
+ // determine which edges of the degenerate tri, if any, are valid to rasterize.
+ // used to call the appropriate templated rasterizer function
+ if (cullZeroAreaMask > 0)
+ {
+ // e0 = v1-v0
+ const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]);
+ const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]);
+
+ uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask)));
+
+ // e1 = v2-v1
+ const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]);
+ const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]);
+
+ uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask)));
+
+ // e2 = v0-v2
+ // if v0 == v1 & v1 == v2, v0 == v2
+ uint32_t e2Mask = e0Mask & e1Mask;
+ SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
+
+ // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
+ // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
+ e0Mask = pdep_u32(e0Mask, 0x00249249);
+
+ // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
+ e1Mask = pdep_u32(e1Mask, 0x00492492);
+
+ // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
+ e2Mask = pdep_u32(e2Mask, 0x00924924);
+
+ edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
+ }
+ else
+ {
+ edgeEnable = 0x00FFFFFF;
+ }
+ }
+ else
+ {
+ // degenerate triangles won't be sent to rasterizer; just enable all edges
+ pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
+ (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID, (state.scissorsTileAligned == false));
+ }
+
+ if (!triMask)
+ {
+ goto endBinTriangles;
+ }
+
+ // Calc bounding box of triangles
+ simd16BBox bbox;
+ calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
+
+ // determine if triangle falls between pixel centers and discard
+ // only discard for non-MSAA case and when conservative rast is disabled
+ // (xmin + 127) & ~255
+ // (xmax + 128) & ~255
+ if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
+ (!CT::IsConservativeT::value))
+ {
+ origTriMask = triMask;
+
+ int cullCenterMask;
+
+ {
+ simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127));
+ xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255));
+ simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128));
+ xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255));
+
+ simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax);
+
+ simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127));
+ ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255));
+ simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128));
+ ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255));
+
+ simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax);
+
+ vMaskV = _simd16_or_si(vMaskH, vMaskV);
+ cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV));
+ }
+
+ triMask &= ~cullCenterMask;
+
+ if (origTriMask ^ triMask)
+ {
+ RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
+ }
+ }
+
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
+ // Gather the AOS effective scissor rects based on the per-prim VP index.
+ /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
+
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+
+ if (CT::IsConservativeT::value)
+ {
+ // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
+ // some area. Bump the xmax/ymax edges out
+ simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax);
+ bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom);
+ simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax);
+ bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight);
+ }
+
+ // Cull tris completely outside scissor
+ {
+ simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
+ simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
+ simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
+ triMask = triMask & ~maskOutsideScissor;
+ }
+
+ if (!triMask)
+ {
+ goto endBinTriangles;
+ }
+
+ // Convert triangle bbox to macrotile units.
+ bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+
+ OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
+
+ _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
+ _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
+ _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
+ _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
+
+ // transpose verts needed for backend
+ /// @todo modify BE to take non-transformed verts
+ __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+
+ vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
+ vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
+ vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0));
+ vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0));
+
+ vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1));
+ vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1));
+ vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1));
+ vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1));
+
+ // store render target array index
+ OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+ {
+ simd16vector vRtai[3];
+ pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
+ simd16scalari vRtaii;
+ vRtaii = _simd16_castps_si(vRtai[0].x);
+ _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
+ }
+ else
+ {
+ _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
+ }
+
+endBinTriangles:
+
+
+ // scan remaining valid triangles and bin each separately
+ while (_BitScanForward(&triIndex, triMask))
+ {
+ uint32_t linkageCount = state.backendState.numAttributes;
+ uint32_t numScalarAttribs = linkageCount * 4;
+
+ BE_WORK work;
+ work.type = DRAW;
+
+ bool isDegenerate;
+ if (CT::IsConservativeT::value)
+ {
+ // only rasterize valid edges if we have a degenerate primitive
+ int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
+ work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
+ (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable, (state.scissorsTileAligned == false));
+
+ // Degenerate triangles are required to be constant interpolated
+ isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
+ }
+ else
+ {
+ isDegenerate = false;
+ work.pfnWork = pfnWork;
+ }
+
+ // Select attribute processor
+ PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
+ state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
+
+ TRIANGLE_WORK_DESC &desc = work.desc.tri;
+
+ desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
+ desc.triFlags.primID = pPrimID[triIndex];
+ desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
+ desc.triFlags.viewportIndex = pViewportIndex[triIndex];
+
+ auto pArena = pDC->pArena;
+ SWR_ASSERT(pArena != nullptr);
+
+ // store active attribs
+ float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
+ desc.pAttribs = pAttribs;
+ desc.numAttribs = linkageCount;
+ pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
+
+ // store triangle vertex data
+ desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
+
+ {
+ const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
+ const uint32_t j = triIndex & 7; // triIndex % KNOB_SIMD_WIDTH
+
+ _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
+ _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
+ _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
+ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
+ }
+
+ // store user clip distances
+ if (rastState.clipDistanceMask)
+ {
+ uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
+ desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
+ ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
+ }
+
+ for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
+ {
+ for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
+ {
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_SETUP_TRIS)
+#endif
+ {
+ pTileMgr->enqueue(x, y, &work);
+ }
+ }
+ }
+
+ triMask &= ~(1 << triIndex);
+ }
+
+ AR_END(FEBinTriangles, 1);
+}
+
+#endif
+//////////////////////////////////////////////////////////////////////////
+/// @brief Chooser functor consumed by TemplateArgUnroller to instantiate
+///        the BinTriangles specialization matching the requested
+///        conservative-rasterization traits.
+struct FEBinTrianglesChooser
+{
+    // C++11 alias declaration preferred over typedef (file already uses
+    // C++11 variadic templates)
+    using FuncType = PFN_PROCESS_PRIMS;
+
+    template <typename... ArgsB>
+    static FuncType GetFunc()
+    {
+        return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
+    }
+};
+
+// Selector for the correct templated BinTriangles function, resolved at
+// draw time from whether conservative rasterization is enabled.
+PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
+{
+ return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
+}
+
+#if USE_SIMD16_FRONTEND
+//////////////////////////////////////////////////////////////////////////
+/// @brief Chooser functor consumed by TemplateArgUnroller to instantiate
+///        the simd16 BinTriangles specialization matching the requested
+///        conservative-rasterization traits.
+struct FEBinTrianglesChooser_simd16
+{
+    // C++11 alias declaration preferred over typedef (file already uses
+    // C++11 variadic templates)
+    using FuncType = PFN_PROCESS_PRIMS_SIMD16;
+
+    template <typename... ArgsB>
+    static FuncType GetFunc()
+    {
+        return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
+    }
+};
+
+// Selector for the correct templated BinTriangles_simd16 function, resolved
+// at draw time from whether conservative rasterization is enabled.
+PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
+{
+ return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
+}
+
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin SIMD points to the backend. Only supports point size of 1
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param prim - Contains point position data for SIMDs worth of points.
+/// @param primMask - Mask of valid points within the SIMD.
+/// @param primID - Primitive ID for each point.
+/// @param viewportIdx - Viewport array index for each point.
+void BinPoints(
+ DRAW_CONTEXT *pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simdvector prim[3],
+ uint32_t primMask,
+ simdscalari primID,
+ simdscalari viewportIdx)
+{
+ SWR_CONTEXT *pContext = pDC->pContext;
+
+ AR_BEGIN(FEBinPoints, pDC->drawId);
+
+ simdvector& primVerts = prim[0];
+
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_FRONTEND_STATE& feState = state.frontendState;
+ const SWR_GS_STATE& gsState = state.gsState;
+ const SWR_RASTSTATE& rastState = state.rastState;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
+
+ // Select attribute processor
+ PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
+ state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
+
+ if (!feState.vpTransformDisable)
+ {
+ // perspective divide
+ simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
+ primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
+ primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
+ primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
+
+ // viewport transform to screen coords
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
+ }
+ else
+ {
+ viewportTransform<1>(&primVerts, state.vpMatrices);
+ }
+ }
+
+ // adjust for pixel center location
+ simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
+ primVerts.x = _simd_add_ps(primVerts.x, offset);
+ primVerts.y = _simd_add_ps(primVerts.y, offset);
+
+ // convert to fixed point
+ simdscalari vXi, vYi;
+ vXi = fpToFixedPointVertical(primVerts.x);
+ vYi = fpToFixedPointVertical(primVerts.y);
+
+ if (CanUseSimplePoints(pDC))
+ {
+ // adjust for ymin-xmin rule
+ vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
+ vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
+
+ // cull points off the ymin-xmin edge of the viewport
+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
+ primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
+
+ // compute macro tile coordinates
+ simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+
+ OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
+ _simd_store_si((simdscalari*)aMacroX, macroX);
+ _simd_store_si((simdscalari*)aMacroY, macroY);
+
+ // compute raster tile coordinates
+ simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+ simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+
+ // compute raster tile relative x,y for coverage mask
+ simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
+ simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
+
+ simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
+ simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
+
+ OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
+ OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
+ _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
+ _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
+
+ OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
+ OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
+ _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
+ _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
+
+ OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
+ _simd_store_ps((float*)aZ, primVerts.z);
+
+ // store render target array index
+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+ {
+ simdvector vRtai;
+ pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
+ simdscalari vRtaii = _simd_castps_si(vRtai.x);
+ _simd_store_si((simdscalari*)aRTAI, vRtaii);
+ }
+ else
+ {
+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
+ }
+
+ uint32_t *pPrimID = (uint32_t *)&primID;
+ DWORD primIndex = 0;
+
+ const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
+
+ // scan remaining valid triangles and bin each separately
+ while (_BitScanForward(&primIndex, primMask))
+ {
+ uint32_t linkageCount = backendState.numAttributes;
+ uint32_t numScalarAttribs = linkageCount * 4;
+
+ BE_WORK work;
+ work.type = DRAW;
+
+ TRIANGLE_WORK_DESC &desc = work.desc.tri;
+
+ // points are always front facing
+ desc.triFlags.frontFacing = 1;
+ desc.triFlags.primID = pPrimID[primIndex];
+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+ desc.triFlags.viewportIndex = pViewportIndex[primIndex];
+
+ work.pfnWork = RasterizeSimplePoint;
+
+ auto pArena = pDC->pArena;
+ SWR_ASSERT(pArena != nullptr);
+
+ // store attributes
+ float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
+ desc.pAttribs = pAttribs;
+ desc.numAttribs = linkageCount;
+
+ pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
+
+ // store raster tile aligned x, y, perspective correct z
+ float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
+ desc.pTriBuffer = pTriBuffer;
+ *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
+ *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
+ *pTriBuffer = aZ[primIndex];
+
+ uint32_t tX = aTileRelativeX[primIndex];
+ uint32_t tY = aTileRelativeY[primIndex];
+
+ // pack the relative x,y into the coverageMask, the rasterizer will
+ // generate the true coverage mask from it
+ work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
+
+ // bin it
+ MacroTileMgr *pTileMgr = pDC->pTileMgr;
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_SETUP_TRIS)
+#endif
+ {
+ pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
+ }
+ primMask &= ~(1 << primIndex);
+ }
+ }
+ else
+ {
+ // non simple points need to be potentially binned to multiple macro tiles
+ simdscalar vPointSize;
+ if (rastState.pointParam)