22996c5a5d187d5f2e1c9b49f8dc634504cb6d28
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "binner.h"
30 #include "context.h"
31 #include "frontend.h"
32 #include "conservativeRast.h"
33 #include "pa.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
36 #include "tilemgr.h"
37
38 // Function Prototype
39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
40 void BinPostSetupLinesImpl(
41 DRAW_CONTEXT *pDC,
42 PA_STATE &pa,
43 uint32_t workerId,
44 typename SIMD_T::Vec4 prim[],
45 typename SIMD_T::Float recipW[],
46 uint32_t primMask,
47 typename SIMD_T::Integer const &primID,
48 typename SIMD_T::Integer const &viewportIdx);
49
50 template <typename SIMD_T, uint32_t SIMD_WIDTH>
51 void BinPostSetupPointsImpl(
52 DRAW_CONTEXT *pDC,
53 PA_STATE &pa,
54 uint32_t workerId,
55 typename SIMD_T::Vec4 prim[],
56 uint32_t primMask,
57 typename SIMD_T::Integer const &primID,
58 typename SIMD_T::Integer const &viewportIdx);
59
60 //////////////////////////////////////////////////////////////////////////
61 /// @brief Processes attributes for the backend based on linkage mask and
62 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
63 /// @param pDC - Draw context
64 /// @param pa - Primitive Assembly state
65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
66 /// @param pLinkageMap - maps VS attribute slot to PS slot
67 /// @param triIndex - Triangle to process attributes for
68 /// @param pBuffer - Output result
69 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
70 INLINE void ProcessAttributes(
71 DRAW_CONTEXT *pDC,
72 PA_STATE&pa,
73 uint32_t triIndex,
74 uint32_t primId,
75 float *pBuffer)
76 {
77 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
78 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
79 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
80 uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
81 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
82 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
83
84 static const float constTable[3][4] = {
85 { 0.0f, 0.0f, 0.0f, 0.0f },
86 { 0.0f, 0.0f, 0.0f, 1.0f },
87 { 1.0f, 1.0f, 1.0f, 1.0f }
88 };
89
90 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
91 {
92 uint32_t inputSlot;
93 if (IsSwizzledT::value)
94 {
95 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
96 inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
97
98 }
99 else
100 {
101 inputSlot = backendState.vertexAttribOffset + i;
102 }
103
104 simd4scalar attrib[3]; // triangle attribs (always 4 wide)
105 float* pAttribStart = pBuffer;
106
107 if (HasConstantInterpT::value || IsDegenerate::value)
108 {
109 if (CheckBit(constantInterpMask, i))
110 {
111 uint32_t vid;
112 uint32_t adjustedTriIndex;
113 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
114 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
115 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
116 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
117 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
118
119 switch (topo) {
120 case TOP_QUAD_LIST:
121 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
122 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
123 break;
124 case TOP_QUAD_STRIP:
125 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
126 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
127 break;
128 case TOP_TRIANGLE_STRIP:
129 adjustedTriIndex = triIndex;
130 vid = (triIndex & 1)
131 ? tristripProvokingVertex[provokingVertex]
132 : provokingVertex;
133 break;
134 default:
135 adjustedTriIndex = triIndex;
136 vid = provokingVertex;
137 break;
138 }
139
140 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
141
142 for (uint32_t i = 0; i < NumVertsT::value; ++i)
143 {
144 SIMD128::store_ps(pBuffer, attrib[vid]);
145 pBuffer += 4;
146 }
147 }
148 else
149 {
150 pa.AssembleSingle(inputSlot, triIndex, attrib);
151
152 for (uint32_t i = 0; i < NumVertsT::value; ++i)
153 {
154 SIMD128::store_ps(pBuffer, attrib[i]);
155 pBuffer += 4;
156 }
157 }
158 }
159 else
160 {
161 pa.AssembleSingle(inputSlot, triIndex, attrib);
162
163 for (uint32_t i = 0; i < NumVertsT::value; ++i)
164 {
165 SIMD128::store_ps(pBuffer, attrib[i]);
166 pBuffer += 4;
167 }
168 }
169
170 // pad out the attrib buffer to 3 verts to ensure the triangle
171 // interpolation code in the pixel shader works correctly for the
172 // 3 topologies - point, line, tri. This effectively zeros out the
173 // effect of the missing vertices in the triangle interpolation.
174 for (uint32_t v = NumVertsT::value; v < 3; ++v)
175 {
176 SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
177 pBuffer += 4;
178 }
179
180 // check for constant source overrides
181 if (IsSwizzledT::value)
182 {
183 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
184 if (mask)
185 {
186 DWORD comp;
187 while (_BitScanForward(&comp, mask))
188 {
189 mask &= ~(1 << comp);
190
191 float constantValue = 0.0f;
192 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
193 {
194 case SWR_CONSTANT_SOURCE_CONST_0000:
195 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
196 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
197 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
198 break;
199 case SWR_CONSTANT_SOURCE_PRIM_ID:
200 constantValue = *(float*)&primId;
201 break;
202 }
203
204 // apply constant value to all 3 vertices
205 for (uint32_t v = 0; v < 3; ++v)
206 {
207 pAttribStart[comp + v * 4] = constantValue;
208 }
209 }
210 }
211 }
212 }
213 }
214
215 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
216
217 struct ProcessAttributesChooser
218 {
219 typedef PFN_PROCESS_ATTRIBUTES FuncType;
220
221 template <typename... ArgsB>
222 static FuncType GetFunc()
223 {
224 return ProcessAttributes<ArgsB...>;
225 }
226 };
227
228 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
229 {
230 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
231 }
232
233 //////////////////////////////////////////////////////////////////////////
234 /// @brief Processes enabled user clip distances. Loads the active clip
235 /// distances from the PA, sets up barycentric equations, and
236 /// stores the results to the output buffer
237 /// @param pa - Primitive Assembly state
238 /// @param primIndex - primitive index to process
239 /// @param clipDistMask - mask of enabled clip distances
240 /// @param pUserClipBuffer - buffer to store results
241 template<uint32_t NumVerts>
242 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
243 {
244 DWORD clipDist;
245 uint32_t clipDistMask = state.clipDistanceMask;
246 while (_BitScanForward(&clipDist, clipDistMask))
247 {
248 clipDistMask &= ~(1 << clipDist);
249 uint32_t clipSlot = clipDist >> 2;
250 uint32_t clipComp = clipDist & 0x3;
251 uint32_t clipAttribSlot = clipSlot == 0 ?
252 state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
253
254 simd4scalar primClipDist[3];
255 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
256
257 float vertClipDist[NumVerts];
258 for (uint32_t e = 0; e < NumVerts; ++e)
259 {
260 OSALIGNSIMD(float) aVertClipDist[4];
261 SIMD128::store_ps(aVertClipDist, primClipDist[e]);
262 vertClipDist[e] = aVertClipDist[clipComp];
263 };
264
265 // setup plane equations for barycentric interpolation in the backend
266 float baryCoeff[NumVerts];
267 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
268 for (uint32_t e = 0; e < NumVerts - 1; ++e)
269 {
270 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
271 }
272 baryCoeff[NumVerts - 1] = last;
273
274 for (uint32_t e = 0; e < NumVerts; ++e)
275 {
276 *(pUserClipBuffer++) = baryCoeff[e];
277 }
278 }
279 }
280
281 INLINE
282 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
283 {
284 vTranspose3x8(dst, src0, src1, src2);
285 }
286
287 INLINE
288 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
289 {
290 vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
291 }
292
293 //////////////////////////////////////////////////////////////////////////
294 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
295 /// culling, viewport transform, etc.
296 /// @param pDC - pointer to draw context.
297 /// @param pa - The primitive assembly object.
298 /// @param workerId - thread's worker id. Even thread has a unique id.
299 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
300 /// @param primID - Primitive ID for each triangle.
301 /// @param viewportIdx - viewport array index for each triangle.
302 /// @tparam CT - ConservativeRastFETraits
303 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
304 void SIMDCALL BinTrianglesImpl(
305 DRAW_CONTEXT *pDC,
306 PA_STATE &pa,
307 uint32_t workerId,
308 typename SIMD_T::Vec4 tri[3],
309 uint32_t triMask,
310 typename SIMD_T::Integer const &primID)
311 {
312 SWR_CONTEXT *pContext = pDC->pContext;
313
314 AR_BEGIN(FEBinTriangles, pDC->drawId);
315
316 const API_STATE& state = GetApiState(pDC);
317 const SWR_RASTSTATE& rastState = state.rastState;
318 const SWR_FRONTEND_STATE& feState = state.frontendState;
319
320 MacroTileMgr *pTileMgr = pDC->pTileMgr;
321
322 typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
323 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
324 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
325
326 typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
327 typename SIMD_T::Vec4 vpiAttrib[3];
328 typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
329
330 if (state.backendState.readViewportArrayIndex)
331 {
332 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
333
334 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
335 }
336
337
338 if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
339 {
340 // OOB indices => forced to zero.
341 vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
342 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
343 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
344 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
345 }
346 else
347 {
348 viewportIdx = vpai;
349 }
350
351 if (feState.vpTransformDisable)
352 {
353 // RHW is passed in directly when VP transform is disabled
354 vRecipW0 = tri[0].v[3];
355 vRecipW1 = tri[1].v[3];
356 vRecipW2 = tri[2].v[3];
357 }
358 else
359 {
360 // Perspective divide
361 vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
362 vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
363 vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
364
365 tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
366 tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
367 tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
368
369 tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
370 tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
371 tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
372
373 tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
374 tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
375 tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
376
377 // Viewport transform to screen space coords
378 if (state.backendState.readViewportArrayIndex)
379 {
380 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
381 }
382 else
383 {
384 viewportTransform<3>(tri, state.vpMatrices);
385 }
386 }
387
388 // Adjust for pixel center location
389 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
390
391 tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
392 tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
393
394 tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
395 tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
396
397 tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
398 tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
399
400 // Set vXi, vYi to required fixed point precision
401 typename SIMD_T::Integer vXi[3], vYi[3];
402 FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
403
404 // triangle setup
405 typename SIMD_T::Integer vAi[3], vBi[3];
406 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
407
408 // determinant
409 typename SIMD_T::Integer vDet[2];
410 calcDeterminantIntVertical(vAi, vBi, vDet);
411
412 // cull zero area
413 uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
414 uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
415
416 uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
417
418 // don't cull degenerate triangles if we're conservatively rasterizing
419 uint32_t origTriMask = triMask;
420 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
421 {
422 triMask &= ~cullZeroAreaMask;
423 }
424
425 // determine front winding tris
426 // CW +det
427 // CCW det < 0;
428 // 0 area triangles are marked as backfacing regardless of winding order,
429 // which is required behavior for conservative rast and wireframe rendering
430 uint32_t frontWindingTris;
431 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
432 {
433 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
434 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
435 }
436 else
437 {
438 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
439 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
440 }
441 frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
442
443 // cull
444 uint32_t cullTris;
445 switch ((SWR_CULLMODE)rastState.cullMode)
446 {
447 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
448 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
449 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
450 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
451 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
452 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
453 }
454
455 triMask &= ~cullTris;
456
457 if (origTriMask ^ triMask)
458 {
459 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
460 }
461
462 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
463 // compute per tri backface
464 uint32_t frontFaceMask = frontWindingTris;
465 uint32_t *pPrimID = (uint32_t *)&primID;
466 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
467 DWORD triIndex = 0;
468
469 uint32_t edgeEnable;
470 PFN_WORK_FUNC pfnWork;
471 if (CT::IsConservativeT::value)
472 {
473 // determine which edges of the degenerate tri, if any, are valid to rasterize.
474 // used to call the appropriate templated rasterizer function
475 if (cullZeroAreaMask > 0)
476 {
477 // e0 = v1-v0
478 const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
479 const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
480
481 uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
482
483 // e1 = v2-v1
484 const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
485 const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
486
487 uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
488
489 // e2 = v0-v2
490 // if v0 == v1 & v1 == v2, v0 == v2
491 uint32_t e2Mask = e0Mask & e1Mask;
492 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
493
494 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
495 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
496 e0Mask = pdep_u32(e0Mask, 0x00249249);
497
498 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
499 e1Mask = pdep_u32(e1Mask, 0x00492492);
500
501 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
502 e2Mask = pdep_u32(e2Mask, 0x00924924);
503
504 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
505 }
506 else
507 {
508 edgeEnable = 0x00FFFFFF;
509 }
510 }
511 else
512 {
513 // degenerate triangles won't be sent to rasterizer; just enable all edges
514 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
515 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
516 }
517
518 SIMDBBOX_T<SIMD_T> bbox;
519
520 if (!triMask)
521 {
522 goto endBinTriangles;
523 }
524
525 // Calc bounding box of triangles
526 calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
527
528 // determine if triangle falls between pixel centers and discard
529 // only discard for non-MSAA case and when conservative rast is disabled
530 // (xmin + 127) & ~255
531 // (xmax + 128) & ~255
532 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
533 (!CT::IsConservativeT::value))
534 {
535 origTriMask = triMask;
536
537 int cullCenterMask;
538
539 {
540 typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
541 xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
542 typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
543 xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
544
545 typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
546
547 typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
548 ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
549 typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
550 ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
551
552 typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
553
554 vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
555 cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
556 }
557
558 triMask &= ~cullCenterMask;
559
560 if (origTriMask ^ triMask)
561 {
562 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
563 }
564 }
565
566 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
567 // Gather the AOS effective scissor rects based on the per-prim VP index.
568 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
569 {
570 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
571
572 if (state.backendState.readViewportArrayIndex)
573 {
574 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
575 }
576 else // broadcast fast path for non-VPAI case.
577 {
578 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
579 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
580 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
581 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
582 }
583
584 // Make triangle bbox inclusive
585 bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
586 bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
587
588 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
589 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
590 bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
591 bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
592 }
593
594 if (CT::IsConservativeT::value)
595 {
596 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
597 // some area. Bump the xmax/ymax edges out
598
599 typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
600 bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
601
602 typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
603 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
604 }
605
606 // Cull tris completely outside scissor
607 {
608 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
609 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
610 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
611 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
612 triMask = triMask & ~maskOutsideScissor;
613 }
614
615 endBinTriangles:
616
617
618 // Send surviving triangles to the line or point binner based on fill mode
619 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
620 {
621 // Simple non-conformant wireframe mode, useful for debugging
622 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
623 typename SIMD_T::Vec4 line[2];
624 typename SIMD_T::Float recipW[2];
625
626 line[0] = tri[0];
627 line[1] = tri[1];
628 recipW[0] = vRecipW0;
629 recipW[1] = vRecipW1;
630
631 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
632
633 line[0] = tri[1];
634 line[1] = tri[2];
635 recipW[0] = vRecipW1;
636 recipW[1] = vRecipW2;
637
638 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
639
640 line[0] = tri[2];
641 line[1] = tri[0];
642 recipW[0] = vRecipW2;
643 recipW[1] = vRecipW0;
644
645 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
646
647 AR_END(FEBinTriangles, 1);
648 return;
649 }
650 else if (rastState.fillMode == SWR_FILLMODE_POINT)
651 {
652 // Bin 3 points
653 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
654 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
655 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
656
657 AR_END(FEBinTriangles, 1);
658 return;
659 }
660
661 // Convert triangle bbox to macrotile units.
662 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
663 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
664 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
665 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
666
667 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
668
669 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
670 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
671 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
672 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
673
674 // transpose verts needed for backend
675 /// @todo modify BE to take non-transformed verts
676 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
677 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
678 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
679 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
680
681 TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
682 TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
683 TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
684 TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
685
686 // store render target array index
687 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
688 if (state.backendState.readRenderTargetArrayIndex)
689 {
690 typename SIMD_T::Vec4 vRtai[3];
691 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
692 typename SIMD_T::Integer vRtaii;
693 vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
694 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
695 }
696 else
697 {
698 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
699 }
700
701
702 // scan remaining valid triangles and bin each separately
703 while (_BitScanForward(&triIndex, triMask))
704 {
705 uint32_t linkageCount = state.backendState.numAttributes;
706 uint32_t numScalarAttribs = linkageCount * 4;
707
708 BE_WORK work;
709 work.type = DRAW;
710
711 bool isDegenerate;
712 if (CT::IsConservativeT::value)
713 {
714 // only rasterize valid edges if we have a degenerate primitive
715 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
716 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
717 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
718
719 // Degenerate triangles are required to be constant interpolated
720 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
721 }
722 else
723 {
724 isDegenerate = false;
725 work.pfnWork = pfnWork;
726 }
727
728 // Select attribute processor
729 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
730 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
731
732 TRIANGLE_WORK_DESC &desc = work.desc.tri;
733
734 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
735 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
736 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
737
738 auto pArena = pDC->pArena;
739 SWR_ASSERT(pArena != nullptr);
740
741 // store active attribs
742 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
743 desc.pAttribs = pAttribs;
744 desc.numAttribs = linkageCount;
745 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
746
747 // store triangle vertex data
748 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
749
750 SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
751 SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
752 SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
753 SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
754
755 // store user clip distances
756 if (state.backendState.clipDistanceMask)
757 {
758 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
759 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
760 ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
761 }
762
763 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
764 {
765 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
766 {
767 #if KNOB_ENABLE_TOSS_POINTS
768 if (!KNOB_TOSS_SETUP_TRIS)
769 #endif
770 {
771 pTileMgr->enqueue(x, y, &work);
772 }
773 }
774 }
775
776 triMask &= ~(1 << triIndex);
777 }
778
779 AR_END(FEBinTriangles, 1);
780 }
781
782 template <typename CT>
783 void BinTriangles(
784 DRAW_CONTEXT *pDC,
785 PA_STATE &pa,
786 uint32_t workerId,
787 simdvector tri[3],
788 uint32_t triMask,
789 simdscalari const &primID)
790 {
791 BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
792 }
793
794 #if USE_SIMD16_FRONTEND
795 template <typename CT>
796 void SIMDCALL BinTriangles_simd16(
797 DRAW_CONTEXT *pDC,
798 PA_STATE &pa,
799 uint32_t workerId,
800 simd16vector tri[3],
801 uint32_t triMask,
802 simd16scalari const &primID)
803 {
804 BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
805 }
806
807 #endif
808 struct FEBinTrianglesChooser
809 {
810 typedef PFN_PROCESS_PRIMS FuncType;
811
812 template <typename... ArgsB>
813 static FuncType GetFunc()
814 {
815 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
816 }
817 };
818
819 // Selector for correct templated BinTrinagles function
820 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
821 {
822 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
823 }
824
825 #if USE_SIMD16_FRONTEND
826 struct FEBinTrianglesChooser_simd16
827 {
828 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
829
830 template <typename... ArgsB>
831 static FuncType GetFunc()
832 {
833 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
834 }
835 };
836
837 // Selector for correct templated BinTrinagles function
838 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
839 {
840 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
841 }
842
843 #endif
844
845 template <typename SIMD_T, uint32_t SIMD_WIDTH>
846 void BinPostSetupPointsImpl(
847 DRAW_CONTEXT *pDC,
848 PA_STATE &pa,
849 uint32_t workerId,
850 typename SIMD_T::Vec4 prim[],
851 uint32_t primMask,
852 typename SIMD_T::Integer const &primID,
853 typename SIMD_T::Integer const &viewportIdx)
854 {
855 SWR_CONTEXT *pContext = pDC->pContext;
856
857 AR_BEGIN(FEBinPoints, pDC->drawId);
858
859 typename SIMD_T::Vec4 &primVerts = prim[0];
860
861 const API_STATE& state = GetApiState(pDC);
862 const SWR_RASTSTATE& rastState = state.rastState;
863 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
864
865 // Select attribute processor
866 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
867 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
868
869 // convert to fixed point
870 typename SIMD_T::Integer vXi, vYi;
871
872 vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
873 vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
874
875 if (CanUseSimplePoints(pDC))
876 {
877 // adjust for ymin-xmin rule
878 vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
879 vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
880
881 // cull points off the ymin-xmin edge of the viewport
882 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
883 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
884
885 // compute macro tile coordinates
886 typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
887 typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
888
889 OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
890
891 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
892 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
893
894 // compute raster tile coordinates
895 typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
896 typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
897
898 // compute raster tile relative x,y for coverage mask
899 typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
900 typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
901
902 typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
903 typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
904
905 OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
906 OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
907
908 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
909 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
910
911 OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
912 OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
913
914 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
915 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
916
917 OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
918 SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
919
920 // store render target array index
921 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
922 if (state.backendState.readRenderTargetArrayIndex)
923 {
924 typename SIMD_T::Vec4 vRtai;
925 pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
926 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
927 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
928 }
929 else
930 {
931 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
932 }
933
934 uint32_t *pPrimID = (uint32_t *)&primID;
935 DWORD primIndex = 0;
936
937 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
938
939 // scan remaining valid triangles and bin each separately
940 while (_BitScanForward(&primIndex, primMask))
941 {
942 uint32_t linkageCount = backendState.numAttributes;
943 uint32_t numScalarAttribs = linkageCount * 4;
944
945 BE_WORK work;
946 work.type = DRAW;
947
948 TRIANGLE_WORK_DESC &desc = work.desc.tri;
949
950 // points are always front facing
951 desc.triFlags.frontFacing = 1;
952 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
953 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
954
955 work.pfnWork = RasterizeSimplePoint;
956
957 auto pArena = pDC->pArena;
958 SWR_ASSERT(pArena != nullptr);
959
960 // store attributes
961 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
962 desc.pAttribs = pAttribs;
963 desc.numAttribs = linkageCount;
964
965 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
966
967 // store raster tile aligned x, y, perspective correct z
968 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
969 desc.pTriBuffer = pTriBuffer;
970 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
971 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
972 *pTriBuffer = aZ[primIndex];
973
974 uint32_t tX = aTileRelativeX[primIndex];
975 uint32_t tY = aTileRelativeY[primIndex];
976
977 // pack the relative x,y into the coverageMask, the rasterizer will
978 // generate the true coverage mask from it
979 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
980
981 // bin it
982 MacroTileMgr *pTileMgr = pDC->pTileMgr;
983 #if KNOB_ENABLE_TOSS_POINTS
984 if (!KNOB_TOSS_SETUP_TRIS)
985 #endif
986 {
987 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
988 }
989
990 primMask &= ~(1 << primIndex);
991 }
992 }
993 else
994 {
995 // non simple points need to be potentially binned to multiple macro tiles
996 typename SIMD_T::Float vPointSize;
997
998 if (rastState.pointParam)
999 {
1000 typename SIMD_T::Vec4 size[3];
1001 pa.Assemble(VERTEX_SGV_SLOT, size);
1002 vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1003 }
1004 else
1005 {
1006 vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1007 }
1008
1009 // bloat point to bbox
1010 SIMDBBOX_T<SIMD_T> bbox;
1011
1012 bbox.xmin = bbox.xmax = vXi;
1013 bbox.ymin = bbox.ymax = vYi;
1014
1015 typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1016 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1017
1018 bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1019 bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1020 bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1021 bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1022
1023 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1024 // Gather the AOS effective scissor rects based on the per-prim VP index.
1025 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1026 {
1027 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1028
1029 if (state.backendState.readViewportArrayIndex)
1030 {
1031 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1032 }
1033 else // broadcast fast path for non-VPAI case.
1034 {
1035 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1036 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1037 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1038 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1039 }
1040
1041 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1042 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1043 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1044 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1045 }
1046
1047 // Cull bloated points completely outside scissor
1048 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1049 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1050 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1051 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1052 primMask = primMask & ~maskOutsideScissor;
1053
1054 // Convert bbox to macrotile units.
1055 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1056 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1057 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1058 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1059
1060 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1061
1062 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1063 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1064 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1065 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1066
1067 // store render target array index
1068 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1069 if (state.backendState.readRenderTargetArrayIndex)
1070 {
1071 typename SIMD_T::Vec4 vRtai[2];
1072 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1073 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1074 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1075 }
1076 else
1077 {
1078 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1079 }
1080
1081 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1082 SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1083
1084 uint32_t *pPrimID = (uint32_t *)&primID;
1085
1086 OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1087 OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1088 OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1089
1090 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1091 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1092 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1093
1094 // scan remaining valid prims and bin each separately
1095 const SWR_BACKEND_STATE& backendState = state.backendState;
1096 DWORD primIndex;
1097 while (_BitScanForward(&primIndex, primMask))
1098 {
1099 uint32_t linkageCount = backendState.numAttributes;
1100 uint32_t numScalarAttribs = linkageCount * 4;
1101
1102 BE_WORK work;
1103 work.type = DRAW;
1104
1105 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1106
1107 desc.triFlags.frontFacing = 1;
1108 desc.triFlags.pointSize = aPointSize[primIndex];
1109 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1110 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1111
1112 work.pfnWork = RasterizeTriPoint;
1113
1114 auto pArena = pDC->pArena;
1115 SWR_ASSERT(pArena != nullptr);
1116
1117 // store active attribs
1118 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1119 desc.numAttribs = linkageCount;
1120 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1121
1122 // store point vertex data
1123 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1124 desc.pTriBuffer = pTriBuffer;
1125 *pTriBuffer++ = aPrimVertsX[primIndex];
1126 *pTriBuffer++ = aPrimVertsY[primIndex];
1127 *pTriBuffer = aPrimVertsZ[primIndex];
1128
1129 // store user clip distances
1130 if (backendState.clipDistanceMask)
1131 {
1132 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1133 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1134 float dists[8];
1135 float one = 1.0f;
1136 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1137 for (uint32_t i = 0; i < numClipDist; i++) {
1138 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1139 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1140 desc.pUserClipBuffer[3 * i + 2] = dists[i];
1141 }
1142 }
1143
1144 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1145 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1146 {
1147 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1148 {
1149 #if KNOB_ENABLE_TOSS_POINTS
1150 if (!KNOB_TOSS_SETUP_TRIS)
1151 #endif
1152 {
1153 pTileMgr->enqueue(x, y, &work);
1154 }
1155 }
1156 }
1157
1158 primMask &= ~(1 << primIndex);
1159 }
1160 }
1161
1162 AR_END(FEBinPoints, 1);
1163 }
1164
1165 //////////////////////////////////////////////////////////////////////////
1166 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1167 /// @param pDC - pointer to draw context.
1168 /// @param pa - The primitive assembly object.
1169 /// @param workerId - thread's worker id. Even thread has a unique id.
1170 /// @param tri - Contains point position data for SIMDs worth of points.
1171 /// @param primID - Primitive ID for each point.
1172 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1173 void BinPointsImpl(
1174 DRAW_CONTEXT *pDC,
1175 PA_STATE &pa,
1176 uint32_t workerId,
1177 typename SIMD_T::Vec4 prim[3],
1178 uint32_t primMask,
1179 typename SIMD_T::Integer const &primID)
1180 {
1181 const API_STATE& state = GetApiState(pDC);
1182 const SWR_FRONTEND_STATE& feState = state.frontendState;
1183 const SWR_RASTSTATE& rastState = state.rastState;
1184
1185 // Read back viewport index if required
1186 typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
1187 typename SIMD_T::Vec4 vpiAttrib[1];
1188 typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
1189
1190 if (state.backendState.readViewportArrayIndex)
1191 {
1192 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1193
1194 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1195 }
1196
1197
1198 if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1199 {
1200 // OOB indices => forced to zero.
1201 vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
1202 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1203 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1204 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1205 }
1206 else
1207 {
1208 viewportIdx = vpai;
1209 }
1210
1211 if (!feState.vpTransformDisable)
1212 {
1213 // perspective divide
1214 typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1215
1216 prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1217 prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1218 prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1219
1220 // viewport transform to screen coords
1221 if (state.backendState.readViewportArrayIndex)
1222 {
1223 viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1224 }
1225 else
1226 {
1227 viewportTransform<1>(prim, state.vpMatrices);
1228 }
1229 }
1230
1231 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1232
1233 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1234 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1235
1236 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1237 pDC,
1238 pa,
1239 workerId,
1240 prim,
1241 primMask,
1242 primID,
1243 viewportIdx);
1244 }
1245
1246 void BinPoints(
1247 DRAW_CONTEXT *pDC,
1248 PA_STATE &pa,
1249 uint32_t workerId,
1250 simdvector prim[3],
1251 uint32_t primMask,
1252 simdscalari const &primID)
1253 {
1254 BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1255 pDC,
1256 pa,
1257 workerId,
1258 prim,
1259 primMask,
1260 primID);
1261 }
1262
1263 #if USE_SIMD16_FRONTEND
1264 void SIMDCALL BinPoints_simd16(
1265 DRAW_CONTEXT *pDC,
1266 PA_STATE &pa,
1267 uint32_t workerId,
1268 simd16vector prim[3],
1269 uint32_t primMask,
1270 simd16scalari const &primID)
1271 {
1272 BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1273 pDC,
1274 pa,
1275 workerId,
1276 prim,
1277 primMask,
1278 primID);
1279 }
1280
1281 #endif
1282 //////////////////////////////////////////////////////////////////////////
1283 /// @brief Bin SIMD lines to the backend.
1284 /// @param pDC - pointer to draw context.
1285 /// @param pa - The primitive assembly object.
1286 /// @param workerId - thread's worker id. Even thread has a unique id.
1287 /// @param tri - Contains line position data for SIMDs worth of points.
1288 /// @param primID - Primitive ID for each line.
1289 /// @param viewportIdx - Viewport Array Index for each line.
1290 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1291 void BinPostSetupLinesImpl(
1292 DRAW_CONTEXT *pDC,
1293 PA_STATE &pa,
1294 uint32_t workerId,
1295 typename SIMD_T::Vec4 prim[],
1296 typename SIMD_T::Float recipW[],
1297 uint32_t primMask,
1298 typename SIMD_T::Integer const &primID,
1299 typename SIMD_T::Integer const &viewportIdx)
1300 {
1301 SWR_CONTEXT *pContext = pDC->pContext;
1302
1303 AR_BEGIN(FEBinLines, pDC->drawId);
1304
1305 const API_STATE &state = GetApiState(pDC);
1306 const SWR_RASTSTATE &rastState = state.rastState;
1307
1308 // Select attribute processor
1309 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1310 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1311
1312 typename SIMD_T::Float &vRecipW0 = recipW[0];
1313 typename SIMD_T::Float &vRecipW1 = recipW[1];
1314
1315 // convert to fixed point
1316 typename SIMD_T::Integer vXi[2], vYi[2];
1317
1318 vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1319 vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1320 vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1321 vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1322
1323 // compute x-major vs y-major mask
1324 typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1325 typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1326 typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1327 uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1328
1329 // cull zero-length lines
1330 typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1331 vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1332
1333 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1334
1335 uint32_t *pPrimID = (uint32_t *)&primID;
1336 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1337
1338 // Calc bounding box of lines
1339 SIMDBBOX_T<SIMD_T> bbox;
1340 bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1341 bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1342 bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1343 bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1344
1345 // bloat bbox by line width along minor axis
1346 typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1347 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1348
1349 SIMDBBOX_T<SIMD_T> bloatBox;
1350
1351 bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1352 bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1353 bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1354 bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1355
1356 bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1357 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1358 bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1359 bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1360
1361 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1362 {
1363 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1364
1365 if (state.backendState.readViewportArrayIndex)
1366 {
1367 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1368 }
1369 else // broadcast fast path for non-VPAI case.
1370 {
1371 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1372 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1373 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1374 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1375 }
1376
1377 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1378 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1379 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1380 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1381 }
1382
1383 // Cull prims completely outside scissor
1384 {
1385 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1386 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1387 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1388 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1389 primMask = primMask & ~maskOutsideScissor;
1390 }
1391
1392 // transpose verts needed for backend
1393 /// @todo modify BE to take non-transformed verts
1394 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1395 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1396 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1397 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1398
1399 if (!primMask)
1400 {
1401 goto endBinLines;
1402 }
1403
1404 // Convert triangle bbox to macrotile units.
1405 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1406 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1407 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1408 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1409
1410 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1411
1412 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1413 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1414 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1415 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1416
1417 TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1418 TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1419 TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1420 TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
1421
1422 // store render target array index
1423 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1424 if (state.backendState.readRenderTargetArrayIndex)
1425 {
1426 typename SIMD_T::Vec4 vRtai[2];
1427 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1428 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1429 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1430 }
1431 else
1432 {
1433 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1434 }
1435
1436 // scan remaining valid prims and bin each separately
1437 DWORD primIndex;
1438 while (_BitScanForward(&primIndex, primMask))
1439 {
1440 uint32_t linkageCount = state.backendState.numAttributes;
1441 uint32_t numScalarAttribs = linkageCount * 4;
1442
1443 BE_WORK work;
1444 work.type = DRAW;
1445
1446 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1447
1448 desc.triFlags.frontFacing = 1;
1449 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1450 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1451 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1452
1453 work.pfnWork = RasterizeLine;
1454
1455 auto pArena = pDC->pArena;
1456 SWR_ASSERT(pArena != nullptr);
1457
1458 // store active attribs
1459 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1460 desc.numAttribs = linkageCount;
1461 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1462
1463 // store line vertex data
1464 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1465
1466 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1467 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1468 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1469 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1470
1471 // store user clip distances
1472 if (state.backendState.clipDistanceMask)
1473 {
1474 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1475 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1476 ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1477 }
1478
1479 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1480 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1481 {
1482 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1483 {
1484 #if KNOB_ENABLE_TOSS_POINTS
1485 if (!KNOB_TOSS_SETUP_TRIS)
1486 #endif
1487 {
1488 pTileMgr->enqueue(x, y, &work);
1489 }
1490 }
1491 }
1492
1493 primMask &= ~(1 << primIndex);
1494 }
1495
1496 endBinLines:
1497
1498 AR_END(FEBinLines, 1);
1499 }
1500
1501 //////////////////////////////////////////////////////////////////////////
1502 /// @brief Bin SIMD lines to the backend.
1503 /// @param pDC - pointer to draw context.
1504 /// @param pa - The primitive assembly object.
1505 /// @param workerId - thread's worker id. Even thread has a unique id.
1506 /// @param tri - Contains line position data for SIMDs worth of points.
1507 /// @param primID - Primitive ID for each line.
1508 /// @param viewportIdx - Viewport Array Index for each line.
1509 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1510 void SIMDCALL BinLinesImpl(
1511 DRAW_CONTEXT *pDC,
1512 PA_STATE &pa,
1513 uint32_t workerId,
1514 typename SIMD_T::Vec4 prim[3],
1515 uint32_t primMask,
1516 typename SIMD_T::Integer const &primID)
1517 {
1518 const API_STATE& state = GetApiState(pDC);
1519 const SWR_RASTSTATE& rastState = state.rastState;
1520 const SWR_FRONTEND_STATE& feState = state.frontendState;
1521
1522 typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1523
1524 typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
1525 typename SIMD_T::Vec4 vpiAttrib[2];
1526 typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
1527
1528 if (state.backendState.readViewportArrayIndex)
1529 {
1530 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1531 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1532 }
1533
1534
1535 if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1536 {
1537 // OOB indices => forced to zero.
1538 vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
1539 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1540 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1541 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1542 }
1543
1544 if (!feState.vpTransformDisable)
1545 {
1546 // perspective divide
1547 vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1548 vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1549
1550 prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1551 prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1552
1553 prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1554 prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1555
1556 prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1557 prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1558
1559 // viewport transform to screen coords
1560 if (state.backendState.readViewportArrayIndex)
1561 {
1562 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1563 }
1564 else
1565 {
1566 viewportTransform<2>(prim, state.vpMatrices);
1567 }
1568 }
1569
1570 // adjust for pixel center location
1571 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1572
1573 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1574 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1575
1576 prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1577 prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1578
1579 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1580 pDC,
1581 pa,
1582 workerId,
1583 prim,
1584 vRecipW,
1585 primMask,
1586 primID,
1587 viewportIdx);
1588 }
1589
1590 void BinLines(
1591 DRAW_CONTEXT *pDC,
1592 PA_STATE &pa,
1593 uint32_t workerId,
1594 simdvector prim[],
1595 uint32_t primMask,
1596 simdscalari const &primID)
1597 {
1598 BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1599 }
1600
1601 #if USE_SIMD16_FRONTEND
1602 void SIMDCALL BinLines_simd16(
1603 DRAW_CONTEXT *pDC,
1604 PA_STATE &pa,
1605 uint32_t workerId,
1606 simd16vector prim[3],
1607 uint32_t primMask,
1608 simd16scalari const &primID)
1609 {
1610 BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1611 }
1612
1613 #endif