swr/rast: Start to remove hardcoded clipcull_dist vertex attrib slot
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "binner.h"
30 #include "context.h"
31 #include "frontend.h"
32 #include "conservativeRast.h"
33 #include "pa.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
36 #include "tilemgr.h"
37
38 // Function Prototype
39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
40 void BinPostSetupLinesImpl(
41 DRAW_CONTEXT *pDC,
42 PA_STATE &pa,
43 uint32_t workerId,
44 typename SIMD_T::Vec4 prim[],
45 typename SIMD_T::Float recipW[],
46 uint32_t primMask,
47 typename SIMD_T::Integer const &primID,
48 typename SIMD_T::Integer const &viewportIdx);
49
50 template <typename SIMD_T, uint32_t SIMD_WIDTH>
51 void BinPostSetupPointsImpl(
52 DRAW_CONTEXT *pDC,
53 PA_STATE &pa,
54 uint32_t workerId,
55 typename SIMD_T::Vec4 prim[],
56 uint32_t primMask,
57 typename SIMD_T::Integer const &primID,
58 typename SIMD_T::Integer const &viewportIdx);
59
60 //////////////////////////////////////////////////////////////////////////
61 /// @brief Processes attributes for the backend based on linkage mask and
62 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
63 /// @param pDC - Draw context
64 /// @param pa - Primitive Assembly state
65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
66 /// @param pLinkageMap - maps VS attribute slot to PS slot
67 /// @param triIndex - Triangle to process attributes for
68 /// @param pBuffer - Output result
69 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
70 INLINE void ProcessAttributes(
71 DRAW_CONTEXT *pDC,
72 PA_STATE&pa,
73 uint32_t triIndex,
74 uint32_t primId,
75 float *pBuffer)
76 {
77 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
78 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
79 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
80 uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
81 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
82 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
83
84 static const float constTable[3][4] = {
85 { 0.0f, 0.0f, 0.0f, 0.0f },
86 { 0.0f, 0.0f, 0.0f, 1.0f },
87 { 1.0f, 1.0f, 1.0f, 1.0f }
88 };
89
90 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
91 {
92 uint32_t inputSlot;
93 if (IsSwizzledT::value)
94 {
95 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
96 inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
97
98 }
99 else
100 {
101 inputSlot = backendState.vertexAttribOffset + i;
102 }
103
104 simd4scalar attrib[3]; // triangle attribs (always 4 wide)
105 float* pAttribStart = pBuffer;
106
107 if (HasConstantInterpT::value || IsDegenerate::value)
108 {
109 if (CheckBit(constantInterpMask, i))
110 {
111 uint32_t vid;
112 uint32_t adjustedTriIndex;
113 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
114 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
115 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
116 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
117 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
118
119 switch (topo) {
120 case TOP_QUAD_LIST:
121 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
122 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
123 break;
124 case TOP_QUAD_STRIP:
125 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
126 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
127 break;
128 case TOP_TRIANGLE_STRIP:
129 adjustedTriIndex = triIndex;
130 vid = (triIndex & 1)
131 ? tristripProvokingVertex[provokingVertex]
132 : provokingVertex;
133 break;
134 default:
135 adjustedTriIndex = triIndex;
136 vid = provokingVertex;
137 break;
138 }
139
140 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
141
142 for (uint32_t i = 0; i < NumVertsT::value; ++i)
143 {
144 SIMD128::store_ps(pBuffer, attrib[vid]);
145 pBuffer += 4;
146 }
147 }
148 else
149 {
150 pa.AssembleSingle(inputSlot, triIndex, attrib);
151
152 for (uint32_t i = 0; i < NumVertsT::value; ++i)
153 {
154 SIMD128::store_ps(pBuffer, attrib[i]);
155 pBuffer += 4;
156 }
157 }
158 }
159 else
160 {
161 pa.AssembleSingle(inputSlot, triIndex, attrib);
162
163 for (uint32_t i = 0; i < NumVertsT::value; ++i)
164 {
165 SIMD128::store_ps(pBuffer, attrib[i]);
166 pBuffer += 4;
167 }
168 }
169
170 // pad out the attrib buffer to 3 verts to ensure the triangle
171 // interpolation code in the pixel shader works correctly for the
172 // 3 topologies - point, line, tri. This effectively zeros out the
173 // effect of the missing vertices in the triangle interpolation.
174 for (uint32_t v = NumVertsT::value; v < 3; ++v)
175 {
176 SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
177 pBuffer += 4;
178 }
179
180 // check for constant source overrides
181 if (IsSwizzledT::value)
182 {
183 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
184 if (mask)
185 {
186 DWORD comp;
187 while (_BitScanForward(&comp, mask))
188 {
189 mask &= ~(1 << comp);
190
191 float constantValue = 0.0f;
192 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
193 {
194 case SWR_CONSTANT_SOURCE_CONST_0000:
195 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
196 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
197 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
198 break;
199 case SWR_CONSTANT_SOURCE_PRIM_ID:
200 constantValue = *(float*)&primId;
201 break;
202 }
203
204 // apply constant value to all 3 vertices
205 for (uint32_t v = 0; v < 3; ++v)
206 {
207 pAttribStart[comp + v * 4] = constantValue;
208 }
209 }
210 }
211 }
212 }
213 }
214
215 //////////////////////////////////////////////////////////////////////////
216 /// @brief Gather scissor rect data based on per-prim viewport indices.
217 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
218 /// @param pViewportIndex - array of per-primitive vewport indexes.
219 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
220 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
221 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
222 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
223 //
224 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
225 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
226 simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax)
227 {
228 scisXmin = _simd_set_epi32(
229 pScissorsInFixedPoint[pViewportIndex[0]].xmin,
230 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
231 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
232 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
233 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
234 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
235 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
236 pScissorsInFixedPoint[pViewportIndex[7]].xmin);
237 scisYmin = _simd_set_epi32(
238 pScissorsInFixedPoint[pViewportIndex[0]].ymin,
239 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
240 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
241 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
242 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
243 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
244 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
245 pScissorsInFixedPoint[pViewportIndex[7]].ymin);
246 scisXmax = _simd_set_epi32(
247 pScissorsInFixedPoint[pViewportIndex[0]].xmax,
248 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
249 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
250 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
251 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
252 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
253 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
254 pScissorsInFixedPoint[pViewportIndex[7]].xmax);
255 scisYmax = _simd_set_epi32(
256 pScissorsInFixedPoint[pViewportIndex[0]].ymax,
257 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
258 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
259 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
260 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
261 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
262 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
263 pScissorsInFixedPoint[pViewportIndex[7]].ymax);
264 }
265
266 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
267 simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax)
268 {
269 scisXmin = _simd16_set_epi32(
270 pScissorsInFixedPoint[pViewportIndex[0]].xmin,
271 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
272 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
273 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
274 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
275 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
276 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
277 pScissorsInFixedPoint[pViewportIndex[7]].xmin,
278 pScissorsInFixedPoint[pViewportIndex[8]].xmin,
279 pScissorsInFixedPoint[pViewportIndex[9]].xmin,
280 pScissorsInFixedPoint[pViewportIndex[10]].xmin,
281 pScissorsInFixedPoint[pViewportIndex[11]].xmin,
282 pScissorsInFixedPoint[pViewportIndex[12]].xmin,
283 pScissorsInFixedPoint[pViewportIndex[13]].xmin,
284 pScissorsInFixedPoint[pViewportIndex[14]].xmin,
285 pScissorsInFixedPoint[pViewportIndex[15]].xmin);
286
287 scisYmin = _simd16_set_epi32(
288 pScissorsInFixedPoint[pViewportIndex[0]].ymin,
289 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
290 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
291 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
292 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
293 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
294 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
295 pScissorsInFixedPoint[pViewportIndex[7]].ymin,
296 pScissorsInFixedPoint[pViewportIndex[8]].ymin,
297 pScissorsInFixedPoint[pViewportIndex[9]].ymin,
298 pScissorsInFixedPoint[pViewportIndex[10]].ymin,
299 pScissorsInFixedPoint[pViewportIndex[11]].ymin,
300 pScissorsInFixedPoint[pViewportIndex[12]].ymin,
301 pScissorsInFixedPoint[pViewportIndex[13]].ymin,
302 pScissorsInFixedPoint[pViewportIndex[14]].ymin,
303 pScissorsInFixedPoint[pViewportIndex[15]].ymin);
304
305 scisXmax = _simd16_set_epi32(
306 pScissorsInFixedPoint[pViewportIndex[0]].xmax,
307 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
308 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
309 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
310 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
311 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
312 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
313 pScissorsInFixedPoint[pViewportIndex[7]].xmax,
314 pScissorsInFixedPoint[pViewportIndex[8]].xmax,
315 pScissorsInFixedPoint[pViewportIndex[9]].xmax,
316 pScissorsInFixedPoint[pViewportIndex[10]].xmax,
317 pScissorsInFixedPoint[pViewportIndex[11]].xmax,
318 pScissorsInFixedPoint[pViewportIndex[12]].xmax,
319 pScissorsInFixedPoint[pViewportIndex[13]].xmax,
320 pScissorsInFixedPoint[pViewportIndex[14]].xmax,
321 pScissorsInFixedPoint[pViewportIndex[15]].xmax);
322
323 scisYmax = _simd16_set_epi32(
324 pScissorsInFixedPoint[pViewportIndex[0]].ymax,
325 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
326 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
327 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
328 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
329 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
330 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
331 pScissorsInFixedPoint[pViewportIndex[7]].ymax,
332 pScissorsInFixedPoint[pViewportIndex[8]].ymax,
333 pScissorsInFixedPoint[pViewportIndex[9]].ymax,
334 pScissorsInFixedPoint[pViewportIndex[10]].ymax,
335 pScissorsInFixedPoint[pViewportIndex[11]].ymax,
336 pScissorsInFixedPoint[pViewportIndex[12]].ymax,
337 pScissorsInFixedPoint[pViewportIndex[13]].ymax,
338 pScissorsInFixedPoint[pViewportIndex[14]].ymax,
339 pScissorsInFixedPoint[pViewportIndex[15]].ymax);
340 }
341
342 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
343
344 struct ProcessAttributesChooser
345 {
346 typedef PFN_PROCESS_ATTRIBUTES FuncType;
347
348 template <typename... ArgsB>
349 static FuncType GetFunc()
350 {
351 return ProcessAttributes<ArgsB...>;
352 }
353 };
354
355 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
356 {
357 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
358 }
359
360 //////////////////////////////////////////////////////////////////////////
361 /// @brief Processes enabled user clip distances. Loads the active clip
362 /// distances from the PA, sets up barycentric equations, and
363 /// stores the results to the output buffer
364 /// @param pa - Primitive Assembly state
365 /// @param primIndex - primitive index to process
366 /// @param clipDistMask - mask of enabled clip distances
367 /// @param pUserClipBuffer - buffer to store results
368 template<uint32_t NumVerts>
369 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
370 {
371 DWORD clipDist;
372 uint32_t clipDistMask = state.clipDistanceMask;
373 while (_BitScanForward(&clipDist, clipDistMask))
374 {
375 clipDistMask &= ~(1 << clipDist);
376 uint32_t clipSlot = clipDist >> 2;
377 uint32_t clipComp = clipDist & 0x3;
378 uint32_t clipAttribSlot = clipSlot == 0 ?
379 state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
380
381 simd4scalar primClipDist[3];
382 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
383
384 float vertClipDist[NumVerts];
385 for (uint32_t e = 0; e < NumVerts; ++e)
386 {
387 OSALIGNSIMD(float) aVertClipDist[4];
388 SIMD128::store_ps(aVertClipDist, primClipDist[e]);
389 vertClipDist[e] = aVertClipDist[clipComp];
390 };
391
392 // setup plane equations for barycentric interpolation in the backend
393 float baryCoeff[NumVerts];
394 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
395 for (uint32_t e = 0; e < NumVerts - 1; ++e)
396 {
397 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
398 }
399 baryCoeff[NumVerts - 1] = last;
400
401 for (uint32_t e = 0; e < NumVerts; ++e)
402 {
403 *(pUserClipBuffer++) = baryCoeff[e];
404 }
405 }
406 }
407
408 INLINE
409 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
410 {
411 vTranspose3x8(dst, src0, src1, src2);
412 }
413
414 INLINE
415 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
416 {
417 vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
418 }
419
420 //////////////////////////////////////////////////////////////////////////
421 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
422 /// culling, viewport transform, etc.
423 /// @param pDC - pointer to draw context.
424 /// @param pa - The primitive assembly object.
425 /// @param workerId - thread's worker id. Every thread has a unique id.
426 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
427 /// @param primID - Primitive ID for each triangle.
428 /// @param viewportIdx - viewport array index for each triangle.
429 /// @tparam CT - ConservativeRastFETraits
430 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
431 void SIMDCALL BinTrianglesImpl(
432 DRAW_CONTEXT *pDC,
433 PA_STATE &pa,
434 uint32_t workerId,
435 typename SIMD_T::Vec4 tri[3],
436 uint32_t triMask,
437 typename SIMD_T::Integer const &primID)
438 {
439 SWR_CONTEXT *pContext = pDC->pContext;
440
441 AR_BEGIN(FEBinTriangles, pDC->drawId);
442
443 const API_STATE& state = GetApiState(pDC);
444 const SWR_RASTSTATE& rastState = state.rastState;
445 const SWR_FRONTEND_STATE& feState = state.frontendState;
446
447 MacroTileMgr *pTileMgr = pDC->pTileMgr;
448
449 typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
450 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
451 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
452
453 typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
454
455 if (state.backendState.readViewportArrayIndex)
456 {
457 typename SIMD_T::Vec4 vpiAttrib[3];
458 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
459
460 // OOB indices => forced to zero.
461 typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
462 vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
463 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
464 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
465 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
466 }
467
468 if (feState.vpTransformDisable)
469 {
470 // RHW is passed in directly when VP transform is disabled
471 vRecipW0 = tri[0].v[3];
472 vRecipW1 = tri[1].v[3];
473 vRecipW2 = tri[2].v[3];
474 }
475 else
476 {
477 // Perspective divide
478 vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
479 vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
480 vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
481
482 tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
483 tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
484 tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
485
486 tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
487 tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
488 tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
489
490 tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
491 tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
492 tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
493
494 // Viewport transform to screen space coords
495 if (state.backendState.readViewportArrayIndex)
496 {
497 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
498 }
499 else
500 {
501 viewportTransform<3>(tri, state.vpMatrices);
502 }
503 }
504
505 // Adjust for pixel center location
506 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
507
508 tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
509 tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
510
511 tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
512 tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
513
514 tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
515 tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
516
517 // Set vXi, vYi to required fixed point precision
518 typename SIMD_T::Integer vXi[3], vYi[3];
519 FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
520
521 // triangle setup
522 typename SIMD_T::Integer vAi[3], vBi[3];
523 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
524
525 // determinant
526 typename SIMD_T::Integer vDet[2];
527 calcDeterminantIntVertical(vAi, vBi, vDet);
528
529 // cull zero area
530 uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
531 uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
532
533 uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
534
535 // don't cull degenerate triangles if we're conservatively rasterizing
536 uint32_t origTriMask = triMask;
537 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
538 {
539 triMask &= ~cullZeroAreaMask;
540 }
541
542 // determine front winding tris
543 // CW +det
544 // CCW det < 0;
545 // 0 area triangles are marked as backfacing regardless of winding order,
546 // which is required behavior for conservative rast and wireframe rendering
547 uint32_t frontWindingTris;
548 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
549 {
550 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
551 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
552 }
553 else
554 {
555 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
556 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
557 }
558 frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
559
560 // cull
561 uint32_t cullTris;
562 switch ((SWR_CULLMODE)rastState.cullMode)
563 {
564 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
565 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
566 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
567 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
568 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
569 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
570 }
571
572 triMask &= ~cullTris;
573
574 if (origTriMask ^ triMask)
575 {
576 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
577 }
578
579 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
580 // compute per tri backface
581 uint32_t frontFaceMask = frontWindingTris;
582 uint32_t *pPrimID = (uint32_t *)&primID;
583 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
584 DWORD triIndex = 0;
585
586 uint32_t edgeEnable;
587 PFN_WORK_FUNC pfnWork;
588 if (CT::IsConservativeT::value)
589 {
590 // determine which edges of the degenerate tri, if any, are valid to rasterize.
591 // used to call the appropriate templated rasterizer function
592 if (cullZeroAreaMask > 0)
593 {
594 // e0 = v1-v0
595 const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
596 const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
597
598 uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
599
600 // e1 = v2-v1
601 const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
602 const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
603
604 uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
605
606 // e2 = v0-v2
607 // if v0 == v1 & v1 == v2, v0 == v2
608 uint32_t e2Mask = e0Mask & e1Mask;
609 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
610
611 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
612 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
613 e0Mask = pdep_u32(e0Mask, 0x00249249);
614
615 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
616 e1Mask = pdep_u32(e1Mask, 0x00492492);
617
618 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
619 e2Mask = pdep_u32(e2Mask, 0x00924924);
620
621 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
622 }
623 else
624 {
625 edgeEnable = 0x00FFFFFF;
626 }
627 }
628 else
629 {
630 // degenerate triangles won't be sent to rasterizer; just enable all edges
631 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
632 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
633 }
634
635 SIMDBBOX_T<SIMD_T> bbox;
636
637 if (!triMask)
638 {
639 goto endBinTriangles;
640 }
641
642 // Calc bounding box of triangles
643 calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
644
645 // determine if triangle falls between pixel centers and discard
646 // only discard for non-MSAA case and when conservative rast is disabled
647 // (xmin + 127) & ~255
648 // (xmax + 128) & ~255
649 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
650 (!CT::IsConservativeT::value))
651 {
652 origTriMask = triMask;
653
654 int cullCenterMask;
655
656 {
657 typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
658 xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
659 typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
660 xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
661
662 typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
663
664 typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
665 ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
666 typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
667 ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
668
669 typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
670
671 vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
672 cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
673 }
674
675 triMask &= ~cullCenterMask;
676
677 if (origTriMask ^ triMask)
678 {
679 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
680 }
681 }
682
683 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
684 // Gather the AOS effective scissor rects based on the per-prim VP index.
685 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
686 {
687 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
688
689 if (state.backendState.readViewportArrayIndex)
690 {
691 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
692 }
693 else // broadcast fast path for non-VPAI case.
694 {
695 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
696 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
697 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
698 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
699 }
700
701 // Make triangle bbox inclusive
702 bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
703 bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
704
705 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
706 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
707 bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
708 bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
709 }
710
711 if (CT::IsConservativeT::value)
712 {
713 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
714 // some area. Bump the xmax/ymax edges out
715
716 typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
717 bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
718
719 typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
720 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
721 }
722
723 // Cull tris completely outside scissor
724 {
725 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
726 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
727 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
728 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
729 triMask = triMask & ~maskOutsideScissor;
730 }
731
732 endBinTriangles:
733
734
735 // Send surviving triangles to the line or point binner based on fill mode
736 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
737 {
738 // Simple non-conformant wireframe mode, useful for debugging
739 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
740 typename SIMD_T::Vec4 line[2];
741 typename SIMD_T::Float recipW[2];
742
743 line[0] = tri[0];
744 line[1] = tri[1];
745 recipW[0] = vRecipW0;
746 recipW[1] = vRecipW1;
747
748 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
749
750 line[0] = tri[1];
751 line[1] = tri[2];
752 recipW[0] = vRecipW1;
753 recipW[1] = vRecipW2;
754
755 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
756
757 line[0] = tri[2];
758 line[1] = tri[0];
759 recipW[0] = vRecipW2;
760 recipW[1] = vRecipW0;
761
762 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
763
764 AR_END(FEBinTriangles, 1);
765 return;
766 }
767 else if (rastState.fillMode == SWR_FILLMODE_POINT)
768 {
769 // Bin 3 points
770 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
771 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
772 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
773
774 AR_END(FEBinTriangles, 1);
775 return;
776 }
777
778 // Convert triangle bbox to macrotile units.
779 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
780 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
781 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
782 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
783
784 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
785
786 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
787 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
788 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
789 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
790
791 // transpose verts needed for backend
792 /// @todo modify BE to take non-transformed verts
793 simd4scalar vHorizX[SIMD_WIDTH];
794 simd4scalar vHorizY[SIMD_WIDTH];
795 simd4scalar vHorizZ[SIMD_WIDTH];
796 simd4scalar vHorizW[SIMD_WIDTH];
797
798 TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
799 TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
800 TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
801 TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
802
803 // store render target array index
804 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
805 if (state.backendState.readRenderTargetArrayIndex)
806 {
807 typename SIMD_T::Vec4 vRtai[3];
808 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
809 typename SIMD_T::Integer vRtaii;
810 vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
811 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
812 }
813 else
814 {
815 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
816 }
817
818 // scan remaining valid triangles and bin each separately
819 while (_BitScanForward(&triIndex, triMask))
820 {
821 uint32_t linkageCount = state.backendState.numAttributes;
822 uint32_t numScalarAttribs = linkageCount * 4;
823
824 BE_WORK work;
825 work.type = DRAW;
826
827 bool isDegenerate;
828 if (CT::IsConservativeT::value)
829 {
830 // only rasterize valid edges if we have a degenerate primitive
831 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
832 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
833 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
834
835 // Degenerate triangles are required to be constant interpolated
836 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
837 }
838 else
839 {
840 isDegenerate = false;
841 work.pfnWork = pfnWork;
842 }
843
844 // Select attribute processor
845 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
846 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
847
848 TRIANGLE_WORK_DESC &desc = work.desc.tri;
849
850 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
851 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
852 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
853
854 auto pArena = pDC->pArena;
855 SWR_ASSERT(pArena != nullptr);
856
857 // store active attribs
858 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
859 desc.pAttribs = pAttribs;
860 desc.numAttribs = linkageCount;
861 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
862
863 // store triangle vertex data
864 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
865
866 SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
867 SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
868 SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
869 SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
870
871 // store user clip distances
872 if (state.backendState.clipDistanceMask)
873 {
874 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
875 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
876 ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
877 }
878
879 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
880 {
881 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
882 {
883 #if KNOB_ENABLE_TOSS_POINTS
884 if (!KNOB_TOSS_SETUP_TRIS)
885 #endif
886 {
887 pTileMgr->enqueue(x, y, &work);
888 }
889 }
890 }
891
892 triMask &= ~(1 << triIndex);
893 }
894
895 AR_END(FEBinTriangles, 1);
896 }
897
898 template <typename CT>
899 void BinTriangles(
900 DRAW_CONTEXT *pDC,
901 PA_STATE &pa,
902 uint32_t workerId,
903 simdvector tri[3],
904 uint32_t triMask,
905 simdscalari const &primID)
906 {
907 BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
908 }
909
910 #if USE_SIMD16_FRONTEND
911 template <typename CT>
912 void SIMDCALL BinTriangles_simd16(
913 DRAW_CONTEXT *pDC,
914 PA_STATE &pa,
915 uint32_t workerId,
916 simd16vector tri[3],
917 uint32_t triMask,
918 simd16scalari const &primID)
919 {
920 BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
921 }
922
923 #endif
924 struct FEBinTrianglesChooser
925 {
926 typedef PFN_PROCESS_PRIMS FuncType;
927
928 template <typename... ArgsB>
929 static FuncType GetFunc()
930 {
931 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
932 }
933 };
934
935 // Selector for correct templated BinTrinagles function
936 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
937 {
938 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
939 }
940
941 #if USE_SIMD16_FRONTEND
942 struct FEBinTrianglesChooser_simd16
943 {
944 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
945
946 template <typename... ArgsB>
947 static FuncType GetFunc()
948 {
949 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
950 }
951 };
952
953 // Selector for correct templated BinTrinagles function
954 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
955 {
956 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
957 }
958
959 #endif
960
961 template <typename SIMD_T, uint32_t SIMD_WIDTH>
962 void BinPostSetupPointsImpl(
963 DRAW_CONTEXT *pDC,
964 PA_STATE &pa,
965 uint32_t workerId,
966 typename SIMD_T::Vec4 prim[],
967 uint32_t primMask,
968 typename SIMD_T::Integer const &primID,
969 typename SIMD_T::Integer const &viewportIdx)
970 {
971 SWR_CONTEXT *pContext = pDC->pContext;
972
973 AR_BEGIN(FEBinPoints, pDC->drawId);
974
975 typename SIMD_T::Vec4 &primVerts = prim[0];
976
977 const API_STATE& state = GetApiState(pDC);
978 const SWR_RASTSTATE& rastState = state.rastState;
979 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
980
981 // Select attribute processor
982 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
983 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
984
985 // convert to fixed point
986 typename SIMD_T::Integer vXi, vYi;
987
988 vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
989 vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
990
991 if (CanUseSimplePoints(pDC))
992 {
993 // adjust for ymin-xmin rule
994 vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
995 vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
996
997 // cull points off the ymin-xmin edge of the viewport
998 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
999 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1000
1001 // compute macro tile coordinates
1002 typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1003 typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1004
1005 OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1006
1007 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
1008 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
1009
1010 // compute raster tile coordinates
1011 typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1012 typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1013
1014 // compute raster tile relative x,y for coverage mask
1015 typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1016 typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1017
1018 typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1019 typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1020
1021 OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1022 OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1023
1024 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
1025 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
1026
1027 OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1028 OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1029
1030 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
1031 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
1032
1033 OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1034 SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1035
1036 // store render target array index
1037 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1038 if (state.backendState.readRenderTargetArrayIndex)
1039 {
1040 typename SIMD_T::Vec4 vRtai;
1041 pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
1042 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
1043 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1044 }
1045 else
1046 {
1047 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1048 }
1049
1050 uint32_t *pPrimID = (uint32_t *)&primID;
1051 DWORD primIndex = 0;
1052
1053 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1054
1055 // scan remaining valid triangles and bin each separately
1056 while (_BitScanForward(&primIndex, primMask))
1057 {
1058 uint32_t linkageCount = backendState.numAttributes;
1059 uint32_t numScalarAttribs = linkageCount * 4;
1060
1061 BE_WORK work;
1062 work.type = DRAW;
1063
1064 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1065
1066 // points are always front facing
1067 desc.triFlags.frontFacing = 1;
1068 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1069 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1070
1071 work.pfnWork = RasterizeSimplePoint;
1072
1073 auto pArena = pDC->pArena;
1074 SWR_ASSERT(pArena != nullptr);
1075
1076 // store attributes
1077 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1078 desc.pAttribs = pAttribs;
1079 desc.numAttribs = linkageCount;
1080
1081 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1082
1083 // store raster tile aligned x, y, perspective correct z
1084 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1085 desc.pTriBuffer = pTriBuffer;
1086 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1087 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1088 *pTriBuffer = aZ[primIndex];
1089
1090 uint32_t tX = aTileRelativeX[primIndex];
1091 uint32_t tY = aTileRelativeY[primIndex];
1092
1093 // pack the relative x,y into the coverageMask, the rasterizer will
1094 // generate the true coverage mask from it
1095 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1096
1097 // bin it
1098 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1099 #if KNOB_ENABLE_TOSS_POINTS
1100 if (!KNOB_TOSS_SETUP_TRIS)
1101 #endif
1102 {
1103 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1104 }
1105
1106 primMask &= ~(1 << primIndex);
1107 }
1108 }
1109 else
1110 {
1111 // non simple points need to be potentially binned to multiple macro tiles
1112 typename SIMD_T::Float vPointSize;
1113
1114 if (rastState.pointParam)
1115 {
1116 typename SIMD_T::Vec4 size[3];
1117 pa.Assemble(VERTEX_SGV_SLOT, size);
1118 vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1119 }
1120 else
1121 {
1122 vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1123 }
1124
1125 // bloat point to bbox
1126 SIMDBBOX_T<SIMD_T> bbox;
1127
1128 bbox.xmin = bbox.xmax = vXi;
1129 bbox.ymin = bbox.ymax = vYi;
1130
1131 typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1132 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1133
1134 bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1135 bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1136 bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1137 bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1138
1139 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1140 // Gather the AOS effective scissor rects based on the per-prim VP index.
1141 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1142 {
1143 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1144
1145 if (state.backendState.readViewportArrayIndex)
1146 {
1147 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1148 }
1149 else // broadcast fast path for non-VPAI case.
1150 {
1151 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1152 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1153 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1154 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1155 }
1156
1157 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1158 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1159 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1160 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1161 }
1162
1163 // Cull bloated points completely outside scissor
1164 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1165 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1166 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1167 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1168 primMask = primMask & ~maskOutsideScissor;
1169
1170 // Convert bbox to macrotile units.
1171 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1172 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1173 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1174 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1175
1176 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1177
1178 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1179 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1180 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1181 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1182
1183 // store render target array index
1184 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1185 if (state.backendState.readRenderTargetArrayIndex)
1186 {
1187 typename SIMD_T::Vec4 vRtai[2];
1188 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1189 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1190 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1191 }
1192 else
1193 {
1194 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1195 }
1196
1197 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1198 _simd16_store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1199
1200 uint32_t *pPrimID = (uint32_t *)&primID;
1201
1202 OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1203 OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1204 OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1205
1206 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1207 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1208 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1209
1210 // scan remaining valid prims and bin each separately
1211 const SWR_BACKEND_STATE& backendState = state.backendState;
1212 DWORD primIndex;
1213 while (_BitScanForward(&primIndex, primMask))
1214 {
1215 uint32_t linkageCount = backendState.numAttributes;
1216 uint32_t numScalarAttribs = linkageCount * 4;
1217
1218 BE_WORK work;
1219 work.type = DRAW;
1220
1221 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1222
1223 desc.triFlags.frontFacing = 1;
1224 desc.triFlags.pointSize = aPointSize[primIndex];
1225 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1226 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1227
1228 work.pfnWork = RasterizeTriPoint;
1229
1230 auto pArena = pDC->pArena;
1231 SWR_ASSERT(pArena != nullptr);
1232
1233 // store active attribs
1234 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1235 desc.numAttribs = linkageCount;
1236 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1237
1238 // store point vertex data
1239 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1240 desc.pTriBuffer = pTriBuffer;
1241 *pTriBuffer++ = aPrimVertsX[primIndex];
1242 *pTriBuffer++ = aPrimVertsY[primIndex];
1243 *pTriBuffer = aPrimVertsZ[primIndex];
1244
1245 // store user clip distances
1246 if (backendState.clipDistanceMask)
1247 {
1248 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1249 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1250 float dists[8];
1251 float one = 1.0f;
1252 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1253 for (uint32_t i = 0; i < numClipDist; i++) {
1254 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1255 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1256 desc.pUserClipBuffer[3 * i + 2] = dists[i];
1257 }
1258 }
1259
1260 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1261 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1262 {
1263 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1264 {
1265 #if KNOB_ENABLE_TOSS_POINTS
1266 if (!KNOB_TOSS_SETUP_TRIS)
1267 #endif
1268 {
1269 pTileMgr->enqueue(x, y, &work);
1270 }
1271 }
1272 }
1273
1274 primMask &= ~(1 << primIndex);
1275 }
1276 }
1277
1278 AR_END(FEBinPoints, 1);
1279 }
1280
1281 //////////////////////////////////////////////////////////////////////////
1282 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1283 /// @param pDC - pointer to draw context.
1284 /// @param pa - The primitive assembly object.
1285 /// @param workerId - thread's worker id. Even thread has a unique id.
1286 /// @param tri - Contains point position data for SIMDs worth of points.
1287 /// @param primID - Primitive ID for each point.
1288 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1289 void BinPointsImpl(
1290 DRAW_CONTEXT *pDC,
1291 PA_STATE &pa,
1292 uint32_t workerId,
1293 typename SIMD_T::Vec4 prim[3],
1294 uint32_t primMask,
1295 typename SIMD_T::Integer const &primID)
1296 {
1297 const API_STATE& state = GetApiState(pDC);
1298 const SWR_FRONTEND_STATE& feState = state.frontendState;
1299 const SWR_RASTSTATE& rastState = state.rastState;
1300
1301 // Read back viewport index if required
1302 typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
1303 if (state.backendState.readViewportArrayIndex)
1304 {
1305 typename SIMD_T::Vec4 vpiAttrib[1];
1306 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1307
1308 // OOB indices => forced to zero.
1309 typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1310 vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
1311 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1312 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1313 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1314 }
1315
1316 if (!feState.vpTransformDisable)
1317 {
1318 // perspective divide
1319 typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1320
1321 prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1322 prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1323 prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1324
1325 // viewport transform to screen coords
1326 if (state.backendState.readViewportArrayIndex)
1327 {
1328 viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1329 }
1330 else
1331 {
1332 viewportTransform<1>(prim, state.vpMatrices);
1333 }
1334 }
1335
1336 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1337
1338 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1339 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1340
1341 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1342 pDC,
1343 pa,
1344 workerId,
1345 prim,
1346 primMask,
1347 primID,
1348 viewportIdx);
1349 }
1350
1351 void BinPoints(
1352 DRAW_CONTEXT *pDC,
1353 PA_STATE &pa,
1354 uint32_t workerId,
1355 simdvector prim[3],
1356 uint32_t primMask,
1357 simdscalari const &primID)
1358 {
1359 BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1360 pDC,
1361 pa,
1362 workerId,
1363 prim,
1364 primMask,
1365 primID);
1366 }
1367
1368 #if USE_SIMD16_FRONTEND
1369 void SIMDCALL BinPoints_simd16(
1370 DRAW_CONTEXT *pDC,
1371 PA_STATE &pa,
1372 uint32_t workerId,
1373 simd16vector prim[3],
1374 uint32_t primMask,
1375 simd16scalari const &primID)
1376 {
1377 BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1378 pDC,
1379 pa,
1380 workerId,
1381 prim,
1382 primMask,
1383 primID);
1384 }
1385
1386 #endif
1387 //////////////////////////////////////////////////////////////////////////
1388 /// @brief Bin SIMD lines to the backend.
1389 /// @param pDC - pointer to draw context.
1390 /// @param pa - The primitive assembly object.
1391 /// @param workerId - thread's worker id. Even thread has a unique id.
1392 /// @param tri - Contains line position data for SIMDs worth of points.
1393 /// @param primID - Primitive ID for each line.
1394 /// @param viewportIdx - Viewport Array Index for each line.
1395 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1396 void BinPostSetupLinesImpl(
1397 DRAW_CONTEXT *pDC,
1398 PA_STATE &pa,
1399 uint32_t workerId,
1400 typename SIMD_T::Vec4 prim[],
1401 typename SIMD_T::Float recipW[],
1402 uint32_t primMask,
1403 typename SIMD_T::Integer const &primID,
1404 typename SIMD_T::Integer const &viewportIdx)
1405 {
1406 SWR_CONTEXT *pContext = pDC->pContext;
1407
1408 AR_BEGIN(FEBinLines, pDC->drawId);
1409
1410 const API_STATE &state = GetApiState(pDC);
1411 const SWR_RASTSTATE &rastState = state.rastState;
1412
1413 // Select attribute processor
1414 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1415 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1416
1417 typename SIMD_T::Float &vRecipW0 = recipW[0];
1418 typename SIMD_T::Float &vRecipW1 = recipW[1];
1419
1420 // convert to fixed point
1421 typename SIMD_T::Integer vXi[2], vYi[2];
1422
1423 vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1424 vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1425 vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1426 vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1427
1428 // compute x-major vs y-major mask
1429 typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1430 typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1431 typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1432 uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1433
1434 // cull zero-length lines
1435 typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1436 vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1437
1438 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1439
1440 uint32_t *pPrimID = (uint32_t *)&primID;
1441 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1442
1443 // Calc bounding box of lines
1444 SIMDBBOX_T<SIMD_T> bbox;
1445 bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1446 bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1447 bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1448 bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1449
1450 // bloat bbox by line width along minor axis
1451 typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1452 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1453
1454 SIMDBBOX_T<SIMD_T> bloatBox;
1455
1456 bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1457 bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1458 bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1459 bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1460
1461 bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1462 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1463 bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1464 bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1465
1466 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1467 {
1468 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1469
1470 if (state.backendState.readViewportArrayIndex)
1471 {
1472 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1473 }
1474 else // broadcast fast path for non-VPAI case.
1475 {
1476 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1477 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1478 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1479 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1480 }
1481
1482 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1483 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1484 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1485 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1486 }
1487
1488 // Cull prims completely outside scissor
1489 {
1490 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1491 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1492 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1493 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1494 primMask = primMask & ~maskOutsideScissor;
1495 }
1496
1497 // transpose verts needed for backend
1498 /// @todo modify BE to take non-transformed verts
1499 simd4scalar vHorizX[SIMD_WIDTH];
1500 simd4scalar vHorizY[SIMD_WIDTH];
1501 simd4scalar vHorizZ[SIMD_WIDTH];
1502 simd4scalar vHorizW[SIMD_WIDTH];
1503
1504 if (!primMask)
1505 {
1506 goto endBinLines;
1507 }
1508
1509 // Convert triangle bbox to macrotile units.
1510 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1511 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1512 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1513 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1514
1515 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1516
1517 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1518 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1519 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1520 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1521
1522 TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1523 TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1524 TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1525 TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
1526
1527 // store render target array index
1528 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1529 if (state.backendState.readRenderTargetArrayIndex)
1530 {
1531 typename SIMD_T::Vec4 vRtai[2];
1532 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1533 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1534 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1535 }
1536 else
1537 {
1538 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1539 }
1540
1541 // scan remaining valid prims and bin each separately
1542 DWORD primIndex;
1543 while (_BitScanForward(&primIndex, primMask))
1544 {
1545 uint32_t linkageCount = state.backendState.numAttributes;
1546 uint32_t numScalarAttribs = linkageCount * 4;
1547
1548 BE_WORK work;
1549 work.type = DRAW;
1550
1551 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1552
1553 desc.triFlags.frontFacing = 1;
1554 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1555 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1556 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1557
1558 work.pfnWork = RasterizeLine;
1559
1560 auto pArena = pDC->pArena;
1561 SWR_ASSERT(pArena != nullptr);
1562
1563 // store active attribs
1564 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1565 desc.numAttribs = linkageCount;
1566 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1567
1568 // store line vertex data
1569 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1570
1571 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1572 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1573 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1574 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1575
1576 // store user clip distances
1577 if (state.backendState.clipDistanceMask)
1578 {
1579 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1580 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1581 ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1582 }
1583
1584 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1585 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1586 {
1587 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1588 {
1589 #if KNOB_ENABLE_TOSS_POINTS
1590 if (!KNOB_TOSS_SETUP_TRIS)
1591 #endif
1592 {
1593 pTileMgr->enqueue(x, y, &work);
1594 }
1595 }
1596 }
1597
1598 primMask &= ~(1 << primIndex);
1599 }
1600
1601 endBinLines:
1602
1603 AR_END(FEBinLines, 1);
1604 }
1605
1606 //////////////////////////////////////////////////////////////////////////
1607 /// @brief Bin SIMD lines to the backend.
1608 /// @param pDC - pointer to draw context.
1609 /// @param pa - The primitive assembly object.
1610 /// @param workerId - thread's worker id. Even thread has a unique id.
1611 /// @param tri - Contains line position data for SIMDs worth of points.
1612 /// @param primID - Primitive ID for each line.
1613 /// @param viewportIdx - Viewport Array Index for each line.
1614 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1615 void SIMDCALL BinLinesImpl(
1616 DRAW_CONTEXT *pDC,
1617 PA_STATE &pa,
1618 uint32_t workerId,
1619 typename SIMD_T::Vec4 prim[3],
1620 uint32_t primMask,
1621 typename SIMD_T::Integer const &primID)
1622 {
1623 const API_STATE& state = GetApiState(pDC);
1624 const SWR_RASTSTATE& rastState = state.rastState;
1625 const SWR_FRONTEND_STATE& feState = state.frontendState;
1626
1627 typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1628
1629 typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
1630 if (state.backendState.readViewportArrayIndex)
1631 {
1632 typename SIMD_T::Vec4 vpiAttrib[2];
1633 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1634
1635 // OOB indices => forced to zero.
1636 typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1637 vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
1638 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1639 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1640 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1641 }
1642
1643 if (!feState.vpTransformDisable)
1644 {
1645 // perspective divide
1646 vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1647 vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1648
1649 prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1650 prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1651
1652 prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1653 prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1654
1655 prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1656 prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1657
1658 // viewport transform to screen coords
1659 if (state.backendState.readViewportArrayIndex)
1660 {
1661 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1662 }
1663 else
1664 {
1665 viewportTransform<2>(prim, state.vpMatrices);
1666 }
1667 }
1668
1669 // adjust for pixel center location
1670 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1671
1672 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1673 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1674
1675 prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1676 prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1677
1678 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1679 pDC,
1680 pa,
1681 workerId,
1682 prim,
1683 vRecipW,
1684 primMask,
1685 primID,
1686 viewportIdx);
1687 }
1688
1689 void BinLines(
1690 DRAW_CONTEXT *pDC,
1691 PA_STATE &pa,
1692 uint32_t workerId,
1693 simdvector prim[],
1694 uint32_t primMask,
1695 simdscalari const &primID)
1696 {
1697 BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1698 }
1699
1700 #if USE_SIMD16_FRONTEND
1701 void SIMDCALL BinLines_simd16(
1702 DRAW_CONTEXT *pDC,
1703 PA_STATE &pa,
1704 uint32_t workerId,
1705 simd16vector prim[3],
1706 uint32_t primMask,
1707 simd16scalari const &primID)
1708 {
1709 BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1710 }
1711
1712 #endif