swr/rast: stop using MSFT types in platform independent code
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "binner.h"
30 #include "context.h"
31 #include "frontend.h"
32 #include "conservativeRast.h"
33 #include "pa.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
36 #include "tilemgr.h"
37
38 // Function Prototype
39 void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
40 void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
41
42 #if USE_SIMD16_FRONTEND
43 void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
44 void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
45 #endif
46
47 //////////////////////////////////////////////////////////////////////////
48 /// @brief Processes attributes for the backend based on linkage mask and
49 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
50 /// @param pDC - Draw context
51 /// @param pa - Primitive Assembly state
52 /// @param linkageMask - Specifies which VS outputs are routed to PS.
53 /// @param pLinkageMap - maps VS attribute slot to PS slot
54 /// @param triIndex - Triangle to process attributes for
55 /// @param pBuffer - Output result
56 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
57 INLINE void ProcessAttributes(
58 DRAW_CONTEXT *pDC,
59 PA_STATE&pa,
60 uint32_t triIndex,
61 uint32_t primId,
62 float *pBuffer)
63 {
64 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
65 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
66 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
67 uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
68 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
69 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
70
71 static const float constTable[3][4] = {
72 { 0.0f, 0.0f, 0.0f, 0.0f },
73 { 0.0f, 0.0f, 0.0f, 1.0f },
74 { 1.0f, 1.0f, 1.0f, 1.0f }
75 };
76
77 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
78 {
79 uint32_t inputSlot;
80 if (IsSwizzledT::value)
81 {
82 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
83 inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
84
85 }
86 else
87 {
88 inputSlot = backendState.vertexAttribOffset + i;
89 }
90
91 simd4scalar attrib[3]; // triangle attribs (always 4 wide)
92 float* pAttribStart = pBuffer;
93
94 if (HasConstantInterpT::value || IsDegenerate::value)
95 {
96 if (CheckBit(constantInterpMask, i))
97 {
98 uint32_t vid;
99 uint32_t adjustedTriIndex;
100 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
101 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
102 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
103 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
104 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
105
106 switch (topo) {
107 case TOP_QUAD_LIST:
108 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
109 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
110 break;
111 case TOP_QUAD_STRIP:
112 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
113 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
114 break;
115 case TOP_TRIANGLE_STRIP:
116 adjustedTriIndex = triIndex;
117 vid = (triIndex & 1)
118 ? tristripProvokingVertex[provokingVertex]
119 : provokingVertex;
120 break;
121 default:
122 adjustedTriIndex = triIndex;
123 vid = provokingVertex;
124 break;
125 }
126
127 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
128
129 for (uint32_t i = 0; i < NumVertsT::value; ++i)
130 {
131 SIMD128::store_ps(pBuffer, attrib[vid]);
132 pBuffer += 4;
133 }
134 }
135 else
136 {
137 pa.AssembleSingle(inputSlot, triIndex, attrib);
138
139 for (uint32_t i = 0; i < NumVertsT::value; ++i)
140 {
141 SIMD128::store_ps(pBuffer, attrib[i]);
142 pBuffer += 4;
143 }
144 }
145 }
146 else
147 {
148 pa.AssembleSingle(inputSlot, triIndex, attrib);
149
150 for (uint32_t i = 0; i < NumVertsT::value; ++i)
151 {
152 SIMD128::store_ps(pBuffer, attrib[i]);
153 pBuffer += 4;
154 }
155 }
156
157 // pad out the attrib buffer to 3 verts to ensure the triangle
158 // interpolation code in the pixel shader works correctly for the
159 // 3 topologies - point, line, tri. This effectively zeros out the
160 // effect of the missing vertices in the triangle interpolation.
161 for (uint32_t v = NumVertsT::value; v < 3; ++v)
162 {
163 SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
164 pBuffer += 4;
165 }
166
167 // check for constant source overrides
168 if (IsSwizzledT::value)
169 {
170 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
171 if (mask)
172 {
173 DWORD comp;
174 while (_BitScanForward(&comp, mask))
175 {
176 mask &= ~(1 << comp);
177
178 float constantValue = 0.0f;
179 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
180 {
181 case SWR_CONSTANT_SOURCE_CONST_0000:
182 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
183 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
184 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
185 break;
186 case SWR_CONSTANT_SOURCE_PRIM_ID:
187 constantValue = *(float*)&primId;
188 break;
189 }
190
191 // apply constant value to all 3 vertices
192 for (uint32_t v = 0; v < 3; ++v)
193 {
194 pAttribStart[comp + v * 4] = constantValue;
195 }
196 }
197 }
198 }
199 }
200 }
201
202 //////////////////////////////////////////////////////////////////////////
203 /// @brief Gather scissor rect data based on per-prim viewport indices.
204 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
205 /// @param pViewportIndex - array of per-primitive vewport indexes.
206 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
207 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
208 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
209 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
210 //
211 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
212 template<size_t SimdWidth>
213 struct GatherScissors
214 {
215 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
216 simdscalari &scisXmin, simdscalari &scisYmin,
217 simdscalari &scisXmax, simdscalari &scisYmax)
218 {
219 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
220 }
221 };
222
223 template<>
224 struct GatherScissors<8>
225 {
226 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
227 simdscalari &scisXmin, simdscalari &scisYmin,
228 simdscalari &scisXmax, simdscalari &scisYmax)
229 {
230 scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
231 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
232 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
233 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
234 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
235 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
236 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
237 pScissorsInFixedPoint[pViewportIndex[7]].xmin);
238 scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
239 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
240 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
241 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
242 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
243 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
244 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
245 pScissorsInFixedPoint[pViewportIndex[7]].ymin);
246 scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
247 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
248 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
249 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
250 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
251 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
252 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
253 pScissorsInFixedPoint[pViewportIndex[7]].xmax);
254 scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
255 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
256 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
257 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
258 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
259 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
260 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
261 pScissorsInFixedPoint[pViewportIndex[7]].ymax);
262 }
263 };
264
265 #if USE_SIMD16_FRONTEND
266 template<size_t SimdWidth>
267 struct GatherScissors_simd16
268 {
269 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
270 simd16scalari &scisXmin, simd16scalari &scisYmin,
271 simd16scalari &scisXmax, simd16scalari &scisYmax)
272 {
273 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
274 }
275 };
276
277 template<>
278 struct GatherScissors_simd16<16>
279 {
280 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
281 simd16scalari &scisXmin, simd16scalari &scisYmin,
282 simd16scalari &scisXmax, simd16scalari &scisYmax) {
283 scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
284 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
285 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
286 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
287 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
288 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
289 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
290 pScissorsInFixedPoint[pViewportIndex[7]].xmin,
291 pScissorsInFixedPoint[pViewportIndex[8]].xmin,
292 pScissorsInFixedPoint[pViewportIndex[9]].xmin,
293 pScissorsInFixedPoint[pViewportIndex[10]].xmin,
294 pScissorsInFixedPoint[pViewportIndex[11]].xmin,
295 pScissorsInFixedPoint[pViewportIndex[12]].xmin,
296 pScissorsInFixedPoint[pViewportIndex[13]].xmin,
297 pScissorsInFixedPoint[pViewportIndex[14]].xmin,
298 pScissorsInFixedPoint[pViewportIndex[15]].xmin);
299
300 scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
301 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
302 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
303 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
304 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
305 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
306 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
307 pScissorsInFixedPoint[pViewportIndex[7]].ymin,
308 pScissorsInFixedPoint[pViewportIndex[8]].ymin,
309 pScissorsInFixedPoint[pViewportIndex[9]].ymin,
310 pScissorsInFixedPoint[pViewportIndex[10]].ymin,
311 pScissorsInFixedPoint[pViewportIndex[11]].ymin,
312 pScissorsInFixedPoint[pViewportIndex[12]].ymin,
313 pScissorsInFixedPoint[pViewportIndex[13]].ymin,
314 pScissorsInFixedPoint[pViewportIndex[14]].ymin,
315 pScissorsInFixedPoint[pViewportIndex[15]].ymin);
316
317 scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
318 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
319 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
320 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
321 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
322 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
323 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
324 pScissorsInFixedPoint[pViewportIndex[7]].xmax,
325 pScissorsInFixedPoint[pViewportIndex[8]].xmax,
326 pScissorsInFixedPoint[pViewportIndex[9]].xmax,
327 pScissorsInFixedPoint[pViewportIndex[10]].xmax,
328 pScissorsInFixedPoint[pViewportIndex[11]].xmax,
329 pScissorsInFixedPoint[pViewportIndex[12]].xmax,
330 pScissorsInFixedPoint[pViewportIndex[13]].xmax,
331 pScissorsInFixedPoint[pViewportIndex[14]].xmax,
332 pScissorsInFixedPoint[pViewportIndex[15]].xmax);
333
334 scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
335 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
336 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
337 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
338 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
339 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
340 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
341 pScissorsInFixedPoint[pViewportIndex[7]].ymax,
342 pScissorsInFixedPoint[pViewportIndex[8]].ymax,
343 pScissorsInFixedPoint[pViewportIndex[9]].ymax,
344 pScissorsInFixedPoint[pViewportIndex[10]].ymax,
345 pScissorsInFixedPoint[pViewportIndex[11]].ymax,
346 pScissorsInFixedPoint[pViewportIndex[12]].ymax,
347 pScissorsInFixedPoint[pViewportIndex[13]].ymax,
348 pScissorsInFixedPoint[pViewportIndex[14]].ymax,
349 pScissorsInFixedPoint[pViewportIndex[15]].ymax);
350 }
351 };
352
353 #endif
354 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
355
356 struct ProcessAttributesChooser
357 {
358 typedef PFN_PROCESS_ATTRIBUTES FuncType;
359
360 template <typename... ArgsB>
361 static FuncType GetFunc()
362 {
363 return ProcessAttributes<ArgsB...>;
364 }
365 };
366
367 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
368 {
369 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
370 }
371
372 //////////////////////////////////////////////////////////////////////////
373 /// @brief Processes enabled user clip distances. Loads the active clip
374 /// distances from the PA, sets up barycentric equations, and
375 /// stores the results to the output buffer
376 /// @param pa - Primitive Assembly state
377 /// @param primIndex - primitive index to process
378 /// @param clipDistMask - mask of enabled clip distances
379 /// @param pUserClipBuffer - buffer to store results
380 template<uint32_t NumVerts>
381 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer)
382 {
383 DWORD clipDist;
384 while (_BitScanForward(&clipDist, clipDistMask))
385 {
386 clipDistMask &= ~(1 << clipDist);
387 uint32_t clipSlot = clipDist >> 2;
388 uint32_t clipComp = clipDist & 0x3;
389 uint32_t clipAttribSlot = clipSlot == 0 ?
390 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
391
392 simd4scalar primClipDist[3];
393 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
394
395 float vertClipDist[NumVerts];
396 for (uint32_t e = 0; e < NumVerts; ++e)
397 {
398 OSALIGNSIMD(float) aVertClipDist[4];
399 SIMD128::store_ps(aVertClipDist, primClipDist[e]);
400 vertClipDist[e] = aVertClipDist[clipComp];
401 };
402
403 // setup plane equations for barycentric interpolation in the backend
404 float baryCoeff[NumVerts];
405 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
406 for (uint32_t e = 0; e < NumVerts - 1; ++e)
407 {
408 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
409 }
410 baryCoeff[NumVerts - 1] = last;
411
412 for (uint32_t e = 0; e < NumVerts; ++e)
413 {
414 *(pUserClipBuffer++) = baryCoeff[e];
415 }
416 }
417 }
418
419 //////////////////////////////////////////////////////////////////////////
420 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
421 /// culling, viewport transform, etc.
422 /// @param pDC - pointer to draw context.
423 /// @param pa - The primitive assembly object.
424 /// @param workerId - thread's worker id. Every thread has a unique id.
425 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
426 /// @param triMask - Bit mask of the SIMD lanes that hold triangles to bin.
427 /// @param primID - Primitive ID for each triangle.
428 /// @tparam CT - ConservativeRastFETraits
429 template <typename CT>
430 void BinTriangles(
431 DRAW_CONTEXT *pDC,
432 PA_STATE& pa,
433 uint32_t workerId,
434 simdvector tri[3],
435 uint32_t triMask,
436 simdscalari primID)
437 {
438 SWR_CONTEXT *pContext = pDC->pContext;
439
440 AR_BEGIN(FEBinTriangles, pDC->drawId);
441
442 const API_STATE& state = GetApiState(pDC);
443 const SWR_RASTSTATE& rastState = state.rastState;
444 const SWR_FRONTEND_STATE& feState = state.frontendState;
445 MacroTileMgr *pTileMgr = pDC->pTileMgr;
446
447 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
448 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
449 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
450
451 // Read viewport array index if needed
452 simdscalari viewportIdx = _simd_set1_epi32(0);
453 if (state.backendState.readViewportArrayIndex)
454 {
455 simdvector vpiAttrib[3];
456 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
457
458 // OOB indices => forced to zero.
459 simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
460 vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
461 simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
462 simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
463 viewportIdx = _simd_and_si(vClearMask, vpai);
464 }
465
466 if (feState.vpTransformDisable)
467 {
468 // RHW is passed in directly when VP transform is disabled
469 vRecipW0 = tri[0].v[3];
470 vRecipW1 = tri[1].v[3];
471 vRecipW2 = tri[2].v[3];
472 }
473 else
474 {
475 // Perspective divide
476 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
477 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
478 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
479
480 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
481 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
482 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
483
484 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
485 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
486 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
487
488 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
489 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
490 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
491
492 // Viewport transform to screen space coords
493 if (state.backendState.readViewportArrayIndex)
494 {
495 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
496 }
497 else
498 {
499 viewportTransform<3>(tri, state.vpMatrices);
500 }
501 }
502
503 // Adjust for pixel center location
504 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
505 tri[0].x = _simd_add_ps(tri[0].x, offset);
506 tri[0].y = _simd_add_ps(tri[0].y, offset);
507
508 tri[1].x = _simd_add_ps(tri[1].x, offset);
509 tri[1].y = _simd_add_ps(tri[1].y, offset);
510
511 tri[2].x = _simd_add_ps(tri[2].x, offset);
512 tri[2].y = _simd_add_ps(tri[2].y, offset);
513
514 simdscalari vXi[3], vYi[3];
515 // Set vXi, vYi to required fixed point precision
516 FPToFixedPoint(tri, vXi, vYi);
517
518 // triangle setup
519 simdscalari vAi[3], vBi[3];
520 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
521
522 // determinant
523 simdscalari vDet[2];
524 calcDeterminantIntVertical(vAi, vBi, vDet);
525
526 // cull zero area
527 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
528 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
529
530 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
531
532 uint32_t origTriMask = triMask;
533 // don't cull degenerate triangles if we're conservatively rasterizing
534 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
535 {
536 triMask &= ~cullZeroAreaMask;
537 }
538
539 // determine front winding tris
540 // CW +det
541 // CCW det < 0;
542 // 0 area triangles are marked as backfacing regardless of winding order,
543 // which is required behavior for conservative rast and wireframe rendering
544 uint32_t frontWindingTris;
545 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
546 {
547 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
548 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
549 }
550 else
551 {
552 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
553 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
554 }
555 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
556
557 // cull
558 uint32_t cullTris;
559 switch ((SWR_CULLMODE)rastState.cullMode)
560 {
561 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
562 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
563 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
564 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
565 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
566 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
567 }
568
569 triMask &= ~cullTris;
570
571 if (origTriMask ^ triMask)
572 {
573 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
574 }
575
576 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
577 // compute per tri backface
578 uint32_t frontFaceMask = frontWindingTris;
579 uint32_t *pPrimID = (uint32_t *)&primID;
580 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
581 DWORD triIndex = 0;
582 uint32_t edgeEnable;
583 PFN_WORK_FUNC pfnWork;
584 if (CT::IsConservativeT::value)
585 {
586 // determine which edges of the degenerate tri, if any, are valid to rasterize.
587 // used to call the appropriate templated rasterizer function
588 if (cullZeroAreaMask > 0)
589 {
590 // e0 = v1-v0
591 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
592 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
593 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
594
595 // e1 = v2-v1
596 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
597 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
598 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
599
600 // e2 = v0-v2
601 // if v0 == v1 & v1 == v2, v0 == v2
602 uint32_t e2Mask = e0Mask & e1Mask;
603 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
604
605 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
606 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
607 e0Mask = pdep_u32(e0Mask, 0x00249249);
608 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
609 e1Mask = pdep_u32(e1Mask, 0x00492492);
610 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
611 e2Mask = pdep_u32(e2Mask, 0x00924924);
612
613 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
614 }
615 else
616 {
617 edgeEnable = 0x00FFFFFF;
618 }
619 }
620 else
621 {
622 // degenerate triangles won't be sent to rasterizer; just enable all edges
623 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
624 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
625 }
626
627 simdBBox bbox;
628
629 if (!triMask)
630 {
631 goto endBinTriangles;
632 }
633
634 // Calc bounding box of triangles
635 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
636
637 // determine if triangle falls between pixel centers and discard
638 // only discard for non-MSAA case and when conservative rast is disabled
639 // (xmin + 127) & ~255
640 // (xmax + 128) & ~255
641 if((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
642 (!CT::IsConservativeT::value))
643 {
644 origTriMask = triMask;
645
646 int cullCenterMask;
647 {
648 simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
649 xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
650 simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
651 xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
652
653 simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
654
655 simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
656 ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
657 simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
658 ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
659
660 simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
661 vMaskV = _simd_or_si(vMaskH, vMaskV);
662 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
663 }
664
665 triMask &= ~cullCenterMask;
666
667 if (origTriMask ^ triMask)
668 {
669 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
670 }
671 }
672
673 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
674 // Gather the AOS effective scissor rects based on the per-prim VP index.
675 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
676 {
677 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
678 if (state.backendState.readViewportArrayIndex)
679 {
680 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
681 scisXmin, scisYmin, scisXmax, scisYmax);
682 }
683 else // broadcast fast path for non-VPAI case.
684 {
685 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
686 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
687 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
688 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
689 }
690
691 // Make triangle bbox inclusive
692 bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
693 bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
694
695 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
696 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
697 bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
698 bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
699 }
700
701 if (CT::IsConservativeT::value)
702 {
703 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
704 // some area. Bump the xmax/ymax edges out
705 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
706 bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
707 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
708 bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
709 }
710
711 // Cull tris completely outside scissor
712 {
713 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
714 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
715 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
716 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
717 triMask = triMask & ~maskOutsideScissor;
718 }
719
720 endBinTriangles:
721
722 // Send surviving triangles to the line or point binner based on fill mode
723 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
724 {
725 // Simple non-conformant wireframe mode, useful for debugging.
726 // Construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
727 simdvector line[2];
728 simdscalar recipW[2];
729 line[0] = tri[0];
730 line[1] = tri[1];
731 recipW[0] = vRecipW0;
732 recipW[1] = vRecipW1;
733 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
734
735 line[0] = tri[1];
736 line[1] = tri[2];
737 recipW[0] = vRecipW1;
738 recipW[1] = vRecipW2;
739 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
740
741 line[0] = tri[2];
742 line[1] = tri[0];
743 recipW[0] = vRecipW2;
744 recipW[1] = vRecipW0;
745 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
746
747 AR_END(FEBinTriangles, 1);
748 return;
749 }
750 else if (rastState.fillMode == SWR_FILLMODE_POINT)
751 {
752 // Bin 3 points
753 BinPostSetupPoints(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
754 BinPostSetupPoints(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
755 BinPostSetupPoints(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
756 return;
757 }
758
759 // Convert triangle bbox to macrotile units.
760 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
761 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
762 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
763 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
764
765 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
766 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
767 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
768 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
769 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
770
771 // transpose verts needed for backend
772 /// @todo modify BE to take non-transformed verts
773 simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
774 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
775 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
776 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
777 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
778
779 // store render target array index
780 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
781 if (state.backendState.readRenderTargetArrayIndex)
782 {
783 simdvector vRtai[3];
784 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
785 simdscalari vRtaii;
786 vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
787 _simd_store_si((simdscalari*)aRTAI, vRtaii);
788 }
789 else
790 {
791 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
792 }
793
794 // scan remaining valid triangles and bin each separately
795 while (_BitScanForward(&triIndex, triMask))
796 {
797 uint32_t linkageCount = state.backendState.numAttributes;
798 uint32_t numScalarAttribs = linkageCount * 4;
799
800 BE_WORK work;
801 work.type = DRAW;
802
803 bool isDegenerate;
804 if (CT::IsConservativeT::value)
805 {
806 // only rasterize valid edges if we have a degenerate primitive
807 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
808 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
809 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
810
811 // Degenerate triangles are required to be constant interpolated
812 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
813 }
814 else
815 {
816 isDegenerate = false;
817 work.pfnWork = pfnWork;
818 }
819
820 // Select attribute processor
821 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
822 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
823
824 TRIANGLE_WORK_DESC &desc = work.desc.tri;
825
826 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
827 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
828 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
829
830 auto pArena = pDC->pArena;
831 SWR_ASSERT(pArena != nullptr);
832
833 // store active attribs
834 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
835 desc.pAttribs = pAttribs;
836 desc.numAttribs = linkageCount;
837 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
838
839 // store triangle vertex data
840 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
841
842 SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
843 SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
844 SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
845 SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
846
847 // store user clip distances
848 if (rastState.clipDistanceMask)
849 {
850 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
851 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
852 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
853 }
854
855 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
856 {
857 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
858 {
859 #if KNOB_ENABLE_TOSS_POINTS
860 if (!KNOB_TOSS_SETUP_TRIS)
861 #endif
862 {
863 pTileMgr->enqueue(x, y, &work);
864 }
865 }
866 }
867 triMask &= ~(1 << triIndex);
868 }
869
870 AR_END(FEBinTriangles, 1);
871 }
872
873 #if USE_SIMD16_FRONTEND
874 template <typename CT>
875 void SIMDCALL BinTriangles_simd16(
876 DRAW_CONTEXT *pDC,
877 PA_STATE& pa,
878 uint32_t workerId,
879 simd16vector tri[3],
880 uint32_t triMask,
881 simd16scalari primID)
882 {
883 SWR_CONTEXT *pContext = pDC->pContext;
884
885 AR_BEGIN(FEBinTriangles, pDC->drawId);
886
887 const API_STATE& state = GetApiState(pDC);
888 const SWR_RASTSTATE& rastState = state.rastState;
889 const SWR_FRONTEND_STATE& feState = state.frontendState;
890
891 MacroTileMgr *pTileMgr = pDC->pTileMgr;
892
893 simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
894 simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
895 simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);
896
897 simd16scalari viewportIdx = _simd16_set1_epi32(0);
898 if (state.backendState.readViewportArrayIndex)
899 {
900 simd16vector vpiAttrib[3];
901 pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
902
903 // OOB indices => forced to zero.
904 simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
905 vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
906 simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
907 simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
908 viewportIdx = _simd16_and_si(vClearMask, vpai);
909 }
910
911 if (feState.vpTransformDisable)
912 {
913 // RHW is passed in directly when VP transform is disabled
914 vRecipW0 = tri[0].v[3];
915 vRecipW1 = tri[1].v[3];
916 vRecipW2 = tri[2].v[3];
917 }
918 else
919 {
920 // Perspective divide
921 vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w);
922 vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w);
923 vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w);
924
925 tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0);
926 tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1);
927 tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2);
928
929 tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0);
930 tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1);
931 tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2);
932
933 tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0);
934 tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1);
935 tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);
936
937 // Viewport transform to screen space coords
938 if (state.backendState.readViewportArrayIndex)
939 {
940 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
941 }
942 else
943 {
944 viewportTransform<3>(tri, state.vpMatrices);
945 }
946 }
947
948 // Adjust for pixel center location
949 const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
950
951 tri[0].x = _simd16_add_ps(tri[0].x, offset);
952 tri[0].y = _simd16_add_ps(tri[0].y, offset);
953
954 tri[1].x = _simd16_add_ps(tri[1].x, offset);
955 tri[1].y = _simd16_add_ps(tri[1].y, offset);
956
957 tri[2].x = _simd16_add_ps(tri[2].x, offset);
958 tri[2].y = _simd16_add_ps(tri[2].y, offset);
959
960 simd16scalari vXi[3], vYi[3];
961
962 // Set vXi, vYi to required fixed point precision
963 FPToFixedPoint(tri, vXi, vYi);
964
965 // triangle setup
966 simd16scalari vAi[3], vBi[3];
967 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
968
969 // determinant
970 simd16scalari vDet[2];
971 calcDeterminantIntVertical(vAi, vBi, vDet);
972
973 // cull zero area
974 uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si())));
975 uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si())));
976
977 uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
978
979 // don't cull degenerate triangles if we're conservatively rasterizing
980 uint32_t origTriMask = triMask;
981 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
982 {
983 triMask &= ~cullZeroAreaMask;
984 }
985
986 // determine front winding tris
987 // CW +det
988 // CCW det < 0;
989 // 0 area triangles are marked as backfacing regardless of winding order,
990 // which is required behavior for conservative rast and wireframe rendering
991 uint32_t frontWindingTris;
992 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
993 {
994 maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si())));
995 maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si())));
996 }
997 else
998 {
999 maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0])));
1000 maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1])));
1001 }
1002 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
1003
1004 // cull
1005 uint32_t cullTris;
1006 switch ((SWR_CULLMODE)rastState.cullMode)
1007 {
1008 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1009 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1010 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1011 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1012 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1013 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1014 }
1015
1016 triMask &= ~cullTris;
1017
1018 if (origTriMask ^ triMask)
1019 {
1020 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1021 }
1022
1023 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1024 // compute per tri backface
1025 uint32_t frontFaceMask = frontWindingTris;
1026 uint32_t *pPrimID = (uint32_t *)&primID;
1027 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1028 DWORD triIndex = 0;
1029
1030 uint32_t edgeEnable;
1031 PFN_WORK_FUNC pfnWork;
1032 if (CT::IsConservativeT::value)
1033 {
1034 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1035 // used to call the appropriate templated rasterizer function
1036 if (cullZeroAreaMask > 0)
1037 {
1038 // e0 = v1-v0
1039 const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]);
1040 const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]);
1041
1042 uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask)));
1043
1044 // e1 = v2-v1
1045 const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]);
1046 const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]);
1047
1048 uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask)));
1049
1050 // e2 = v0-v2
1051 // if v0 == v1 & v1 == v2, v0 == v2
1052 uint32_t e2Mask = e0Mask & e1Mask;
1053 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1054
1055 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1056 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1057 e0Mask = pdep_u32(e0Mask, 0x00249249);
1058
1059 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1060 e1Mask = pdep_u32(e1Mask, 0x00492492);
1061
1062 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1063 e2Mask = pdep_u32(e2Mask, 0x00924924);
1064
1065 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1066 }
1067 else
1068 {
1069 edgeEnable = 0x00FFFFFF;
1070 }
1071 }
1072 else
1073 {
1074 // degenerate triangles won't be sent to rasterizer; just enable all edges
1075 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1076 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
1077 }
1078
1079 simd16BBox bbox;
1080
1081 if (!triMask)
1082 {
1083 goto endBinTriangles;
1084 }
1085
1086 // Calc bounding box of triangles
1087 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1088
1089 // determine if triangle falls between pixel centers and discard
1090 // only discard for non-MSAA case and when conservative rast is disabled
1091 // (xmin + 127) & ~255
1092 // (xmax + 128) & ~255
1093 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
1094 (!CT::IsConservativeT::value))
1095 {
1096 origTriMask = triMask;
1097
1098 int cullCenterMask;
1099
1100 {
1101 simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127));
1102 xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255));
1103 simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128));
1104 xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255));
1105
1106 simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax);
1107
1108 simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127));
1109 ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255));
1110 simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128));
1111 ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255));
1112
1113 simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax);
1114
1115 vMaskV = _simd16_or_si(vMaskH, vMaskV);
1116 cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV));
1117 }
1118
1119 triMask &= ~cullCenterMask;
1120
1121 if (origTriMask ^ triMask)
1122 {
1123 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1124 }
1125 }
1126
1127 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1128 // Gather the AOS effective scissor rects based on the per-prim VP index.
1129 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1130 {
1131 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
1132
1133 if (state.backendState.readViewportArrayIndex)
1134 {
1135 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1136 scisXmin, scisYmin, scisXmax, scisYmax);
1137 }
1138 else // broadcast fast path for non-VPAI case.
1139 {
1140 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1141 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1142 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1143 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1144 }
1145
1146 // Make triangle bbox inclusive
1147 bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
1148 bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
1149
1150 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
1151 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
1152 bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
1153 bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
1154 }
1155
1156 if (CT::IsConservativeT::value)
1157 {
1158 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1159 // some area. Bump the xmax/ymax edges out
1160 simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax);
1161 bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom);
1162 simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax);
1163 bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight);
1164 }
1165
1166 // Cull tris completely outside scissor
1167 {
1168 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
1169 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
1170 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
1171 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
1172 triMask = triMask & ~maskOutsideScissor;
1173 }
1174
1175 endBinTriangles:
1176
1177 // Send surviving triangles to the line or point binner based on fill mode
1178 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
1179 {
1180 // Simple non-conformant wireframe mode, useful for debugging
1181 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
1182 simd16vector line[2];
1183 simd16scalar recipW[2];
1184 line[0] = tri[0];
1185 line[1] = tri[1];
1186 recipW[0] = vRecipW0;
1187 recipW[1] = vRecipW1;
1188 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1189
1190 line[0] = tri[1];
1191 line[1] = tri[2];
1192 recipW[0] = vRecipW1;
1193 recipW[1] = vRecipW2;
1194 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1195
1196 line[0] = tri[2];
1197 line[1] = tri[0];
1198 recipW[0] = vRecipW2;
1199 recipW[1] = vRecipW0;
1200 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1201
1202 AR_END(FEBinTriangles, 1);
1203 return;
1204 }
1205 else if (rastState.fillMode == SWR_FILLMODE_POINT)
1206 {
1207 // Bin 3 points
1208 BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
1209 BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
1210 BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
1211 return;
1212 }
1213
1214 // Convert triangle bbox to macrotile units.
1215 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1216 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1217 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1218 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1219
1220 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
1221
1222 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
1223 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
1224 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
1225 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
1226
1227 // transpose verts needed for backend
1228 /// @todo modify BE to take non-transformed verts
1229 simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1230 simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1231 simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1232 simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1233
1234 vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
1235 vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
1236 vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0));
1237 vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0));
1238
1239 vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1));
1240 vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1));
1241 vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1));
1242 vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1));
1243
1244 // store render target array index
1245 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1246 if (state.backendState.readRenderTargetArrayIndex)
1247 {
1248 simd16vector vRtai[3];
1249 pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
1250 simd16scalari vRtaii;
1251 vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1252 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1253 }
1254 else
1255 {
1256 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1257 }
1258
1259
1260 // scan remaining valid triangles and bin each separately
1261 while (_BitScanForward(&triIndex, triMask))
1262 {
1263 uint32_t linkageCount = state.backendState.numAttributes;
1264 uint32_t numScalarAttribs = linkageCount * 4;
1265
1266 BE_WORK work;
1267 work.type = DRAW;
1268
1269 bool isDegenerate;
1270 if (CT::IsConservativeT::value)
1271 {
1272 // only rasterize valid edges if we have a degenerate primitive
1273 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1274 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1275 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
1276
1277 // Degenerate triangles are required to be constant interpolated
1278 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1279 }
1280 else
1281 {
1282 isDegenerate = false;
1283 work.pfnWork = pfnWork;
1284 }
1285
1286 // Select attribute processor
1287 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
1288 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1289
1290 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1291
1292 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1293 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1294 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1295
1296 auto pArena = pDC->pArena;
1297 SWR_ASSERT(pArena != nullptr);
1298
1299 // store active attribs
1300 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1301 desc.pAttribs = pAttribs;
1302 desc.numAttribs = linkageCount;
1303 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1304
1305 // store triangle vertex data
1306 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1307
1308 {
1309 const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
1310 const uint32_t j = triIndex & 7; // triIndex % KNOB_SIMD_WIDTH
1311
1312 _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
1313 _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
1314 _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
1315 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
1316 }
1317
1318 // store user clip distances
1319 if (rastState.clipDistanceMask)
1320 {
1321 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1322 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1323 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1324 }
1325
1326 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1327 {
1328 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1329 {
1330 #if KNOB_ENABLE_TOSS_POINTS
1331 if (!KNOB_TOSS_SETUP_TRIS)
1332 #endif
1333 {
1334 pTileMgr->enqueue(x, y, &work);
1335 }
1336 }
1337 }
1338
1339 triMask &= ~(1 << triIndex);
1340 }
1341
1342 AR_END(FEBinTriangles, 1);
1343 }
1344
1345 #endif
1346 struct FEBinTrianglesChooser
1347 {
1348 typedef PFN_PROCESS_PRIMS FuncType;
1349
1350 template <typename... ArgsB>
1351 static FuncType GetFunc()
1352 {
1353 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1354 }
1355 };
1356
1357 // Selector for correct templated BinTrinagles function
1358 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1359 {
1360 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1361 }
1362
1363 #if USE_SIMD16_FRONTEND
1364 struct FEBinTrianglesChooser_simd16
1365 {
1366 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1367
1368 template <typename... ArgsB>
1369 static FuncType GetFunc()
1370 {
1371 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1372 }
1373 };
1374
1375 // Selector for correct templated BinTrinagles function
1376 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1377 {
1378 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1379 }
1380
1381 #endif
1382
1383 void BinPostSetupPoints(
1384 DRAW_CONTEXT *pDC,
1385 PA_STATE& pa,
1386 uint32_t workerId,
1387 simdvector prim[],
1388 uint32_t primMask,
1389 simdscalari primID,
1390 simdscalari viewportIdx)
1391 {
1392 SWR_CONTEXT *pContext = pDC->pContext;
1393
1394 AR_BEGIN(FEBinPoints, pDC->drawId);
1395
1396 simdvector& primVerts = prim[0];
1397
1398 const API_STATE& state = GetApiState(pDC);
1399 const SWR_RASTSTATE& rastState = state.rastState;
1400 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1401
1402 // Select attribute processor
1403 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1404 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1405
1406 // convert to fixed point
1407 simdscalari vXi, vYi;
1408 vXi = fpToFixedPointVertical(primVerts.x);
1409 vYi = fpToFixedPointVertical(primVerts.y);
1410
1411 if (CanUseSimplePoints(pDC))
1412 {
1413 // adjust for ymin-xmin rule
1414 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
1415 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
1416
1417 // cull points off the ymin-xmin edge of the viewport
1418 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
1419 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
1420
1421 // compute macro tile coordinates
1422 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1423 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1424
1425 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
1426 _simd_store_si((simdscalari*)aMacroX, macroX);
1427 _simd_store_si((simdscalari*)aMacroY, macroY);
1428
1429 // compute raster tile coordinates
1430 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1431 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1432
1433 // compute raster tile relative x,y for coverage mask
1434 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
1435 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
1436
1437 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
1438 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
1439
1440 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
1441 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
1442 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
1443 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
1444
1445 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
1446 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
1447 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
1448 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
1449
1450 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
1451 _simd_store_ps((float*)aZ, primVerts.z);
1452
1453 // store render target array index
1454 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1455 if (state.backendState.readRenderTargetArrayIndex)
1456 {
1457 simdvector vRtai;
1458 pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
1459 simdscalari vRtaii = _simd_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
1460 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1461 }
1462 else
1463 {
1464 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1465 }
1466
1467 uint32_t *pPrimID = (uint32_t *)&primID;
1468 DWORD primIndex = 0;
1469
1470 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1471
1472 // scan remaining valid triangles and bin each separately
1473 while (_BitScanForward(&primIndex, primMask))
1474 {
1475 uint32_t linkageCount = backendState.numAttributes;
1476 uint32_t numScalarAttribs = linkageCount * 4;
1477
1478 BE_WORK work;
1479 work.type = DRAW;
1480
1481 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1482
1483 // points are always front facing
1484 desc.triFlags.frontFacing = 1;
1485 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1486 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1487
1488 work.pfnWork = RasterizeSimplePoint;
1489
1490 auto pArena = pDC->pArena;
1491 SWR_ASSERT(pArena != nullptr);
1492
1493 // store attributes
1494 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1495 desc.pAttribs = pAttribs;
1496 desc.numAttribs = linkageCount;
1497
1498 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1499
1500 // store raster tile aligned x, y, perspective correct z
1501 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1502 desc.pTriBuffer = pTriBuffer;
1503 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1504 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1505 *pTriBuffer = aZ[primIndex];
1506
1507 uint32_t tX = aTileRelativeX[primIndex];
1508 uint32_t tY = aTileRelativeY[primIndex];
1509
1510 // pack the relative x,y into the coverageMask, the rasterizer will
1511 // generate the true coverage mask from it
1512 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1513
1514 // bin it
1515 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1516 #if KNOB_ENABLE_TOSS_POINTS
1517 if (!KNOB_TOSS_SETUP_TRIS)
1518 #endif
1519 {
1520 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1521 }
1522 primMask &= ~(1 << primIndex);
1523 }
1524 }
1525 else
1526 {
1527 // non simple points need to be potentially binned to multiple macro tiles
1528 simdscalar vPointSize;
1529 if (rastState.pointParam)
1530 {
1531 simdvector size[3];
1532 pa.Assemble(VERTEX_SGV_SLOT, size);
1533 vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1534 }
1535 else
1536 {
1537 vPointSize = _simd_set1_ps(rastState.pointSize);
1538 }
1539
1540 // bloat point to bbox
1541 simdBBox bbox;
1542 bbox.xmin = bbox.xmax = vXi;
1543 bbox.ymin = bbox.ymax = vYi;
1544
1545 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
1546 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1547 bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
1548 bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
1549 bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
1550 bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
1551
1552 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1553 // Gather the AOS effective scissor rects based on the per-prim VP index.
1554 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1555 {
1556 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
1557 if (state.backendState.readViewportArrayIndex)
1558 {
1559 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1560 scisXmin, scisYmin, scisXmax, scisYmax);
1561 }
1562 else // broadcast fast path for non-VPAI case.
1563 {
1564 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1565 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1566 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1567 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1568 }
1569
1570 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
1571 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
1572 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
1573 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
1574 }
1575
1576 // Cull bloated points completely outside scissor
1577 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
1578 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
1579 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1580 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1581 primMask = primMask & ~maskOutsideScissor;
1582
1583 // Convert bbox to macrotile units.
1584 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1585 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1586 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1587 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1588
1589 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1590 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
1591 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
1592 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
1593 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
1594
1595 // store render target array index
1596 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1597 if (state.backendState.readRenderTargetArrayIndex)
1598 {
1599 simdvector vRtai[2];
1600 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1601 simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1602 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1603 }
1604 else
1605 {
1606 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1607 }
1608
1609 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
1610 _simd_store_ps((float*)aPointSize, vPointSize);
1611
1612 uint32_t *pPrimID = (uint32_t *)&primID;
1613
1614 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
1615 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
1616 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
1617
1618 _simd_store_ps((float*)aPrimVertsX, primVerts.x);
1619 _simd_store_ps((float*)aPrimVertsY, primVerts.y);
1620 _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
1621
1622 // scan remaining valid prims and bin each separately
1623 const SWR_BACKEND_STATE& backendState = state.backendState;
1624 DWORD primIndex;
1625 while (_BitScanForward(&primIndex, primMask))
1626 {
1627 uint32_t linkageCount = backendState.numAttributes;
1628 uint32_t numScalarAttribs = linkageCount * 4;
1629
1630 BE_WORK work;
1631 work.type = DRAW;
1632
1633 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1634
1635 desc.triFlags.frontFacing = 1;
1636 desc.triFlags.pointSize = aPointSize[primIndex];
1637 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1638 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1639
1640 work.pfnWork = RasterizeTriPoint;
1641
1642 auto pArena = pDC->pArena;
1643 SWR_ASSERT(pArena != nullptr);
1644
1645 // store active attribs
1646 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1647 desc.numAttribs = linkageCount;
1648 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1649
1650 // store point vertex data
1651 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1652 desc.pTriBuffer = pTriBuffer;
1653 *pTriBuffer++ = aPrimVertsX[primIndex];
1654 *pTriBuffer++ = aPrimVertsY[primIndex];
1655 *pTriBuffer = aPrimVertsZ[primIndex];
1656
1657 // store user clip distances
1658 if (rastState.clipDistanceMask)
1659 {
1660 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1661 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1662 float dists[8];
1663 float one = 1.0f;
1664 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
1665 for (uint32_t i = 0; i < numClipDist; i++) {
1666 desc.pUserClipBuffer[3*i + 0] = 0.0f;
1667 desc.pUserClipBuffer[3*i + 1] = 0.0f;
1668 desc.pUserClipBuffer[3*i + 2] = dists[i];
1669 }
1670 }
1671
1672 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1673 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1674 {
1675 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1676 {
1677 #if KNOB_ENABLE_TOSS_POINTS
1678 if (!KNOB_TOSS_SETUP_TRIS)
1679 #endif
1680 {
1681 pTileMgr->enqueue(x, y, &work);
1682 }
1683 }
1684 }
1685
1686 primMask &= ~(1 << primIndex);
1687 }
1688 }
1689
1690 AR_END(FEBinPoints, 1);
1691 }
1692
1693 //////////////////////////////////////////////////////////////////////////
1694 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1695 /// @param pDC - pointer to draw context.
1696 /// @param pa - The primitive assembly object.
1697 /// @param workerId - thread's worker id. Even thread has a unique id.
1698 /// @param tri - Contains point position data for SIMDs worth of points.
1699 /// @param primID - Primitive ID for each point.
1700 void BinPoints(
1701 DRAW_CONTEXT *pDC,
1702 PA_STATE& pa,
1703 uint32_t workerId,
1704 simdvector prim[3],
1705 uint32_t primMask,
1706 simdscalari primID)
1707 {
1708 simdvector& primVerts = prim[0];
1709
1710 const API_STATE& state = GetApiState(pDC);
1711 const SWR_FRONTEND_STATE& feState = state.frontendState;
1712 const SWR_RASTSTATE& rastState = state.rastState;
1713
1714 // Read back viewport index if required
1715 simdscalari viewportIdx = _simd_set1_epi32(0);
1716 if (state.backendState.readViewportArrayIndex)
1717 {
1718 simdvector vpiAttrib[1];
1719 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1720 simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1721
1722 // OOB indices => forced to zero.
1723 vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
1724 simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1725 simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
1726 viewportIdx = _simd_and_si(vClearMask, vpai);
1727 }
1728
1729 if (!feState.vpTransformDisable)
1730 {
1731 // perspective divide
1732 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
1733 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
1734 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
1735 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
1736
1737 // viewport transform to screen coords
1738 if (state.backendState.readViewportArrayIndex)
1739 {
1740 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
1741 }
1742 else
1743 {
1744 viewportTransform<1>(&primVerts, state.vpMatrices);
1745 }
1746 }
1747
1748 // adjust for pixel center location
1749 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1750 primVerts.x = _simd_add_ps(primVerts.x, offset);
1751 primVerts.y = _simd_add_ps(primVerts.y, offset);
1752
1753 BinPostSetupPoints(
1754 pDC,
1755 pa,
1756 workerId,
1757 prim,
1758 primMask,
1759 primID,
1760 viewportIdx);
1761 }
1762
1763 #if USE_SIMD16_FRONTEND
1764 void BinPostSetupPoints_simd16(
1765 DRAW_CONTEXT *pDC,
1766 PA_STATE& pa,
1767 uint32_t workerId,
1768 simd16vector prim[],
1769 uint32_t primMask,
1770 simd16scalari primID,
1771 simd16scalari viewportIdx)
1772 {
1773 SWR_CONTEXT *pContext = pDC->pContext;
1774
1775 AR_BEGIN(FEBinPoints, pDC->drawId);
1776
1777 simd16vector& primVerts = prim[0];
1778
1779 const API_STATE& state = GetApiState(pDC);
1780 const SWR_RASTSTATE& rastState = state.rastState;
1781 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1782
1783 // Select attribute processor
1784 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1785 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1786
1787 // convert to fixed point
1788 simd16scalari vXi, vYi;
1789
1790 vXi = fpToFixedPointVertical(primVerts.x);
1791 vYi = fpToFixedPointVertical(primVerts.y);
1792
1793 if (CanUseSimplePoints(pDC))
1794 {
1795 // adjust for ymin-xmin rule
1796 vXi = _simd16_sub_epi32(vXi, _simd16_set1_epi32(1));
1797 vYi = _simd16_sub_epi32(vYi, _simd16_set1_epi32(1));
1798
1799 // cull points off the ymin-xmin edge of the viewport
1800 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi));
1801 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi));
1802
1803 // compute macro tile coordinates
1804 simd16scalari macroX = _simd16_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1805 simd16scalari macroY = _simd16_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1806
1807 OSALIGNSIMD16(uint32_t) aMacroX[KNOB_SIMD16_WIDTH], aMacroY[KNOB_SIMD16_WIDTH];
1808
1809 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroX), macroX);
1810 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroY), macroY);
1811
1812 // compute raster tile coordinates
1813 simd16scalari rasterX = _simd16_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1814 simd16scalari rasterY = _simd16_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1815
1816 // compute raster tile relative x,y for coverage mask
1817 simd16scalari tileAlignedX = _simd16_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
1818 simd16scalari tileAlignedY = _simd16_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
1819
1820 simd16scalari tileRelativeX = _simd16_sub_epi32(_simd16_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
1821 simd16scalari tileRelativeY = _simd16_sub_epi32(_simd16_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
1822
1823 OSALIGNSIMD16(uint32_t) aTileRelativeX[KNOB_SIMD16_WIDTH];
1824 OSALIGNSIMD16(uint32_t) aTileRelativeY[KNOB_SIMD16_WIDTH];
1825
1826 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeX), tileRelativeX);
1827 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeY), tileRelativeY);
1828
1829 OSALIGNSIMD16(uint32_t) aTileAlignedX[KNOB_SIMD16_WIDTH];
1830 OSALIGNSIMD16(uint32_t) aTileAlignedY[KNOB_SIMD16_WIDTH];
1831
1832 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedX), tileAlignedX);
1833 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedY), tileAlignedY);
1834
1835 OSALIGNSIMD16(float) aZ[KNOB_SIMD16_WIDTH];
1836 _simd16_store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1837
1838 // store render target array index
1839 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1840 if (state.backendState.readRenderTargetArrayIndex)
1841 {
1842 simd16vector vRtai;
1843 pa.Assemble_simd16(VERTEX_SGV_SLOT, &vRtai);
1844 simd16scalari vRtaii = _simd16_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
1845 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1846 }
1847 else
1848 {
1849 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1850 }
1851
1852 uint32_t *pPrimID = (uint32_t *)&primID;
1853 DWORD primIndex = 0;
1854
1855 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1856
1857 // scan remaining valid triangles and bin each separately
1858 while (_BitScanForward(&primIndex, primMask))
1859 {
1860 uint32_t linkageCount = backendState.numAttributes;
1861 uint32_t numScalarAttribs = linkageCount * 4;
1862
1863 BE_WORK work;
1864 work.type = DRAW;
1865
1866 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1867
1868 // points are always front facing
1869 desc.triFlags.frontFacing = 1;
1870 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1871 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1872
1873 work.pfnWork = RasterizeSimplePoint;
1874
1875 auto pArena = pDC->pArena;
1876 SWR_ASSERT(pArena != nullptr);
1877
1878 // store attributes
1879 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1880 desc.pAttribs = pAttribs;
1881 desc.numAttribs = linkageCount;
1882
1883 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1884
1885 // store raster tile aligned x, y, perspective correct z
1886 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1887 desc.pTriBuffer = pTriBuffer;
1888 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1889 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1890 *pTriBuffer = aZ[primIndex];
1891
1892 uint32_t tX = aTileRelativeX[primIndex];
1893 uint32_t tY = aTileRelativeY[primIndex];
1894
1895 // pack the relative x,y into the coverageMask, the rasterizer will
1896 // generate the true coverage mask from it
1897 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1898
1899 // bin it
1900 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1901 #if KNOB_ENABLE_TOSS_POINTS
1902 if (!KNOB_TOSS_SETUP_TRIS)
1903 #endif
1904 {
1905 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1906 }
1907
1908 primMask &= ~(1 << primIndex);
1909 }
1910 }
1911 else
1912 {
1913 // non simple points need to be potentially binned to multiple macro tiles
1914 simd16scalar vPointSize;
1915
1916 if (rastState.pointParam)
1917 {
1918 simd16vector size[3];
1919 pa.Assemble_simd16(VERTEX_SGV_SLOT, size);
1920 vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1921 }
1922 else
1923 {
1924 vPointSize = _simd16_set1_ps(rastState.pointSize);
1925 }
1926
1927 // bloat point to bbox
1928 simd16BBox bbox;
1929
1930 bbox.xmin = bbox.xmax = vXi;
1931 bbox.ymin = bbox.ymax = vYi;
1932
1933 simd16scalar vHalfWidth = _simd16_mul_ps(vPointSize, _simd16_set1_ps(0.5f));
1934 simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1935
1936 bbox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
1937 bbox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
1938 bbox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
1939 bbox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
1940
1941 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1942 // Gather the AOS effective scissor rects based on the per-prim VP index.
1943 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1944 {
1945 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
1946 if (state.backendState.readViewportArrayIndex)
1947 {
1948 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1949 scisXmin, scisYmin, scisXmax, scisYmax);
1950 }
1951 else // broadcast fast path for non-VPAI case.
1952 {
1953 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1954 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1955 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1956 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1957 }
1958
1959 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
1960 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
1961 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
1962 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
1963 }
1964
1965 // Cull bloated points completely outside scissor
1966 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
1967 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
1968 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
1969 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
1970 primMask = primMask & ~maskOutsideScissor;
1971
1972 // Convert bbox to macrotile units.
1973 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1974 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1975 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1976 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1977
1978 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
1979
1980 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
1981 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
1982 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
1983 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
1984
1985 // store render target array index
1986 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1987 if (state.backendState.readRenderTargetArrayIndex)
1988 {
1989 simd16vector vRtai[2];
1990 pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
1991 simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1992 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1993 }
1994 else
1995 {
1996 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1997 }
1998
1999 OSALIGNSIMD16(float) aPointSize[KNOB_SIMD16_WIDTH];
2000 _simd16_store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
2001
2002 uint32_t *pPrimID = (uint32_t *)&primID;
2003
2004 OSALIGNSIMD16(float) aPrimVertsX[KNOB_SIMD16_WIDTH];
2005 OSALIGNSIMD16(float) aPrimVertsY[KNOB_SIMD16_WIDTH];
2006 OSALIGNSIMD16(float) aPrimVertsZ[KNOB_SIMD16_WIDTH];
2007
2008 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
2009 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
2010 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
2011
2012 // scan remaining valid prims and bin each separately
2013 const SWR_BACKEND_STATE& backendState = state.backendState;
2014 DWORD primIndex;
2015 while (_BitScanForward(&primIndex, primMask))
2016 {
2017 uint32_t linkageCount = backendState.numAttributes;
2018 uint32_t numScalarAttribs = linkageCount * 4;
2019
2020 BE_WORK work;
2021 work.type = DRAW;
2022
2023 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2024
2025 desc.triFlags.frontFacing = 1;
2026 desc.triFlags.pointSize = aPointSize[primIndex];
2027 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2028 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2029
2030 work.pfnWork = RasterizeTriPoint;
2031
2032 auto pArena = pDC->pArena;
2033 SWR_ASSERT(pArena != nullptr);
2034
2035 // store active attribs
2036 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2037 desc.numAttribs = linkageCount;
2038 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2039
2040 // store point vertex data
2041 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2042 desc.pTriBuffer = pTriBuffer;
2043 *pTriBuffer++ = aPrimVertsX[primIndex];
2044 *pTriBuffer++ = aPrimVertsY[primIndex];
2045 *pTriBuffer = aPrimVertsZ[primIndex];
2046
2047 // store user clip distances
2048 if (rastState.clipDistanceMask)
2049 {
2050 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2051 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2052 float dists[8];
2053 float one = 1.0f;
2054 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
2055 for (uint32_t i = 0; i < numClipDist; i++) {
2056 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
2057 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
2058 desc.pUserClipBuffer[3 * i + 2] = dists[i];
2059 }
2060 }
2061
2062 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2063 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2064 {
2065 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2066 {
2067 #if KNOB_ENABLE_TOSS_POINTS
2068 if (!KNOB_TOSS_SETUP_TRIS)
2069 #endif
2070 {
2071 pTileMgr->enqueue(x, y, &work);
2072 }
2073 }
2074 }
2075
2076 primMask &= ~(1 << primIndex);
2077 }
2078 }
2079
2080 AR_END(FEBinPoints, 1);
2081 }
2082
2083 void SIMDCALL BinPoints_simd16(
2084 DRAW_CONTEXT *pDC,
2085 PA_STATE& pa,
2086 uint32_t workerId,
2087 simd16vector prim[3],
2088 uint32_t primMask,
2089 simd16scalari primID)
2090 {
2091 simd16vector& primVerts = prim[0];
2092
2093 const API_STATE& state = GetApiState(pDC);
2094 const SWR_FRONTEND_STATE& feState = state.frontendState;
2095 const SWR_RASTSTATE& rastState = state.rastState;
2096
2097 // Read back viewport index if required
2098 simd16scalari viewportIdx = _simd16_set1_epi32(0);
2099 if (state.backendState.readViewportArrayIndex)
2100 {
2101 simd16vector vpiAttrib[1];
2102 pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
2103
2104 // OOB indices => forced to zero.
2105 simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
2106 vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
2107 simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2108 simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
2109 viewportIdx = _simd16_and_si(vClearMask, vpai);
2110 }
2111
2112 if (!feState.vpTransformDisable)
2113 {
2114 // perspective divide
2115 simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w);
2116
2117 primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0);
2118 primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0);
2119 primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);
2120
2121 // viewport transform to screen coords
2122 if (state.backendState.readViewportArrayIndex)
2123 {
2124 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
2125 }
2126 else
2127 {
2128 viewportTransform<1>(&primVerts, state.vpMatrices);
2129 }
2130 }
2131
2132 const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
2133
2134 primVerts.x = _simd16_add_ps(primVerts.x, offset);
2135 primVerts.y = _simd16_add_ps(primVerts.y, offset);
2136
2137 BinPostSetupPoints_simd16(
2138 pDC,
2139 pa,
2140 workerId,
2141 prim,
2142 primMask,
2143 primID,
2144 viewportIdx);
2145 }
2146
2147 #endif
2148 //////////////////////////////////////////////////////////////////////////
2149 /// @brief Bin SIMD lines to the backend.
2150 /// @param pDC - pointer to draw context.
2151 /// @param pa - The primitive assembly object.
2152 /// @param workerId - thread's worker id. Even thread has a unique id.
2153 /// @param tri - Contains line position data for SIMDs worth of points.
2154 /// @param primID - Primitive ID for each line.
2155 /// @param viewportIdx - Viewport Array Index for each line.
2156 void BinPostSetupLines(
2157 DRAW_CONTEXT *pDC,
2158 PA_STATE& pa,
2159 uint32_t workerId,
2160 simdvector prim[],
2161 simdscalar recipW[],
2162 uint32_t primMask,
2163 simdscalari primID,
2164 simdscalari viewportIdx)
2165 {
2166 SWR_CONTEXT *pContext = pDC->pContext;
2167
2168 AR_BEGIN(FEBinLines, pDC->drawId);
2169
2170 const API_STATE& state = GetApiState(pDC);
2171 const SWR_RASTSTATE& rastState = state.rastState;
2172
2173 // Select attribute processor
2174 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2175 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2176
2177 simdscalar& vRecipW0 = recipW[0];
2178 simdscalar& vRecipW1 = recipW[1];
2179
2180 simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2181
2182 // convert to fixed point
2183 simdscalari vXi[2], vYi[2];
2184 vXi[0] = fpToFixedPointVertical(prim[0].x);
2185 vYi[0] = fpToFixedPointVertical(prim[0].y);
2186 vXi[1] = fpToFixedPointVertical(prim[1].x);
2187 vYi[1] = fpToFixedPointVertical(prim[1].y);
2188
2189 // compute x-major vs y-major mask
2190 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2191 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2192 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2193 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2194
2195 // cull zero-length lines
2196 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2197 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2198
2199 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2200
2201 uint32_t *pPrimID = (uint32_t *)&primID;
2202 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2203
2204 simdscalar vUnused = _simd_setzero_ps();
2205
2206 // Calc bounding box of lines
2207 simdBBox bbox;
2208 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2209 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2210 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2211 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2212
2213 // bloat bbox by line width along minor axis
2214 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2215 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2216 simdBBox bloatBox;
2217 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2218 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2219 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2220 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2221
2222 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2223 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2224 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2225 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2226
2227 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2228 {
2229 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
2230 if (state.backendState.readViewportArrayIndex)
2231 {
2232 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2233 scisXmin, scisYmin, scisXmax, scisYmax);
2234 }
2235 else // broadcast fast path for non-VPAI case.
2236 {
2237 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2238 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2239 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2240 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2241 }
2242
2243 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2244 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2245 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2246 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2247 }
2248
2249 // Cull prims completely outside scissor
2250 {
2251 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2252 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2253 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2254 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2255 primMask = primMask & ~maskOutsideScissor;
2256 }
2257
2258 if (!primMask)
2259 {
2260 goto endBinLines;
2261 }
2262
2263 // Convert triangle bbox to macrotile units.
2264 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2265 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2266 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2267 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2268
2269 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2270 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2271 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2272 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2273 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2274
2275 // transpose verts needed for backend
2276 /// @todo modify BE to take non-transformed verts
2277 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2278 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2279 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2280 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2281
2282 // store render target array index
2283 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2284 if (state.backendState.readRenderTargetArrayIndex)
2285 {
2286 simdvector vRtai[2];
2287 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
2288 simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
2289 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2290 }
2291 else
2292 {
2293 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2294 }
2295
2296 // scan remaining valid prims and bin each separately
2297 DWORD primIndex;
2298 while (_BitScanForward(&primIndex, primMask))
2299 {
2300 uint32_t linkageCount = state.backendState.numAttributes;
2301 uint32_t numScalarAttribs = linkageCount * 4;
2302
2303 BE_WORK work;
2304 work.type = DRAW;
2305
2306 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2307
2308 desc.triFlags.frontFacing = 1;
2309 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2310 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2311 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2312
2313 work.pfnWork = RasterizeLine;
2314
2315 auto pArena = pDC->pArena;
2316 SWR_ASSERT(pArena != nullptr);
2317
2318 // store active attribs
2319 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2320 desc.numAttribs = linkageCount;
2321 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2322
2323 // store line vertex data
2324 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2325 SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2326 SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2327 SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2328 SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2329
2330 // store user clip distances
2331 if (rastState.clipDistanceMask)
2332 {
2333 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2334 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2335 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
2336 }
2337
2338 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2339 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2340 {
2341 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2342 {
2343 #if KNOB_ENABLE_TOSS_POINTS
2344 if (!KNOB_TOSS_SETUP_TRIS)
2345 #endif
2346 {
2347 pTileMgr->enqueue(x, y, &work);
2348 }
2349 }
2350 }
2351
2352 primMask &= ~(1 << primIndex);
2353 }
2354
2355 endBinLines:
2356
2357 AR_END(FEBinLines, 1);
2358 }
2359
2360 #if USE_SIMD16_FRONTEND
2361 void BinPostSetupLines_simd16(
2362 DRAW_CONTEXT *pDC,
2363 PA_STATE& pa,
2364 uint32_t workerId,
2365 simd16vector prim[],
2366 simd16scalar recipW[],
2367 uint32_t primMask,
2368 simd16scalari primID,
2369 simd16scalari viewportIdx)
2370 {
2371 SWR_CONTEXT *pContext = pDC->pContext;
2372
2373 AR_BEGIN(FEBinLines, pDC->drawId);
2374
2375 const API_STATE& state = GetApiState(pDC);
2376 const SWR_RASTSTATE& rastState = state.rastState;
2377
2378 // Select attribute processor
2379 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2380 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2381
2382 simd16scalar& vRecipW0 = recipW[0];
2383 simd16scalar& vRecipW1 = recipW[1];
2384
2385 // convert to fixed point
2386 simd16scalari vXi[2], vYi[2];
2387
2388 vXi[0] = fpToFixedPointVertical(prim[0].x);
2389 vYi[0] = fpToFixedPointVertical(prim[0].y);
2390 vXi[1] = fpToFixedPointVertical(prim[1].x);
2391 vYi[1] = fpToFixedPointVertical(prim[1].y);
2392
2393 // compute x-major vs y-major mask
2394 simd16scalari xLength = _simd16_abs_epi32(_simd16_sub_epi32(vXi[0], vXi[1]));
2395 simd16scalari yLength = _simd16_abs_epi32(_simd16_sub_epi32(vYi[0], vYi[1]));
2396 simd16scalar vYmajorMask = _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength, xLength));
2397 uint32_t yMajorMask = _simd16_movemask_ps(vYmajorMask);
2398
2399 // cull zero-length lines
2400 simd16scalari vZeroLengthMask = _simd16_cmpeq_epi32(xLength, _simd16_setzero_si());
2401 vZeroLengthMask = _simd16_and_si(vZeroLengthMask, _simd16_cmpeq_epi32(yLength, _simd16_setzero_si()));
2402
2403 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask));
2404
2405 uint32_t *pPrimID = (uint32_t *)&primID;
2406 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2407
2408 // Calc bounding box of lines
2409 simd16BBox bbox;
2410 bbox.xmin = _simd16_min_epi32(vXi[0], vXi[1]);
2411 bbox.xmax = _simd16_max_epi32(vXi[0], vXi[1]);
2412 bbox.ymin = _simd16_min_epi32(vYi[0], vYi[1]);
2413 bbox.ymax = _simd16_max_epi32(vYi[0], vYi[1]);
2414
2415 // bloat bbox by line width along minor axis
2416 simd16scalar vHalfWidth = _simd16_set1_ps(rastState.lineWidth / 2.0f);
2417 simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2418
2419 simd16BBox bloatBox;
2420
2421 bloatBox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
2422 bloatBox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
2423 bloatBox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
2424 bloatBox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
2425
2426 bbox.xmin = _simd16_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2427 bbox.xmax = _simd16_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2428 bbox.ymin = _simd16_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2429 bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2430
2431 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2432 {
2433 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
2434
2435 if (state.backendState.readViewportArrayIndex)
2436 {
2437 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2438 scisXmin, scisYmin, scisXmax, scisYmax);
2439 }
2440 else // broadcast fast path for non-VPAI case.
2441 {
2442 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2443 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2444 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2445 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2446 }
2447
2448 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
2449 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
2450 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
2451 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
2452 }
2453
2454 // Cull prims completely outside scissor
2455 {
2456 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
2457 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
2458 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
2459 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
2460 primMask = primMask & ~maskOutsideScissor;
2461 }
2462
2463 const simdscalar unused = _simd_setzero_ps();
2464
2465 // transpose verts needed for backend
2466 /// @todo modify BE to take non-transformed verts
2467 simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2468 simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2469 simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2470 simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2471
2472 if (!primMask)
2473 {
2474 goto endBinLines;
2475 }
2476
2477 // Convert triangle bbox to macrotile units.
2478 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2479 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2480 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2481 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2482
2483 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
2484
2485 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
2486 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
2487 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
2488 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
2489
2490 vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
2491 vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
2492 vTranspose3x8(vHorizZ[0], _simd16_extract_ps(prim[0].z, 0), _simd16_extract_ps(prim[1].z, 0), unused);
2493 vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), unused);
2494
2495 vTranspose3x8(vHorizX[1], _simd16_extract_ps(prim[0].x, 1), _simd16_extract_ps(prim[1].x, 1), unused);
2496 vTranspose3x8(vHorizY[1], _simd16_extract_ps(prim[0].y, 1), _simd16_extract_ps(prim[1].y, 1), unused);
2497 vTranspose3x8(vHorizZ[1], _simd16_extract_ps(prim[0].z, 1), _simd16_extract_ps(prim[1].z, 1), unused);
2498 vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), unused);
2499
2500 // store render target array index
2501 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
2502 if (state.backendState.readRenderTargetArrayIndex)
2503 {
2504 simd16vector vRtai[2];
2505 pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
2506 simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
2507 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
2508 }
2509 else
2510 {
2511 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
2512 }
2513
2514 // scan remaining valid prims and bin each separately
2515 DWORD primIndex;
2516 while (_BitScanForward(&primIndex, primMask))
2517 {
2518 uint32_t linkageCount = state.backendState.numAttributes;
2519 uint32_t numScalarAttribs = linkageCount * 4;
2520
2521 BE_WORK work;
2522 work.type = DRAW;
2523
2524 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2525
2526 desc.triFlags.frontFacing = 1;
2527 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2528 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2529 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2530
2531 work.pfnWork = RasterizeLine;
2532
2533 auto pArena = pDC->pArena;
2534 SWR_ASSERT(pArena != nullptr);
2535
2536 // store active attribs
2537 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2538 desc.numAttribs = linkageCount;
2539 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2540
2541 // store line vertex data
2542 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2543
2544 {
2545 const uint32_t i = primIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
2546 const uint32_t j = primIndex & 7; // triIndex % KNOB_SIMD_WIDTH
2547
2548 _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
2549 _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
2550 _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
2551 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
2552 }
2553
2554 // store user clip distances
2555 if (rastState.clipDistanceMask)
2556 {
2557 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2558 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2559 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
2560 }
2561
2562 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2563 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2564 {
2565 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2566 {
2567 #if KNOB_ENABLE_TOSS_POINTS
2568 if (!KNOB_TOSS_SETUP_TRIS)
2569 #endif
2570 {
2571 pTileMgr->enqueue(x, y, &work);
2572 }
2573 }
2574 }
2575
2576 primMask &= ~(1 << primIndex);
2577 }
2578
2579 endBinLines:
2580
2581 AR_END(FEBinLines, 1);
2582 }
2583
2584 #endif
2585 //////////////////////////////////////////////////////////////////////////
2586 /// @brief Bin SIMD lines to the backend.
2587 /// @param pDC - pointer to draw context.
2588 /// @param pa - The primitive assembly object.
2589 /// @param workerId - thread's worker id. Even thread has a unique id.
2590 /// @param tri - Contains line position data for SIMDs worth of points.
2591 /// @param primID - Primitive ID for each line.
2592 /// @param viewportIdx - Viewport Array Index for each line.
2593 void BinLines(
2594 DRAW_CONTEXT *pDC,
2595 PA_STATE& pa,
2596 uint32_t workerId,
2597 simdvector prim[],
2598 uint32_t primMask,
2599 simdscalari primID)
2600 {
2601 const API_STATE& state = GetApiState(pDC);
2602 const SWR_RASTSTATE& rastState = state.rastState;
2603 const SWR_FRONTEND_STATE& feState = state.frontendState;
2604
2605 simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
2606
2607 simdscalari viewportIdx = _simd_set1_epi32(0);
2608 if (state.backendState.readViewportArrayIndex)
2609 {
2610 simdvector vpiAttrib[2];
2611 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
2612 simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
2613 vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
2614
2615 // OOB indices => forced to zero.
2616 simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2617 simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
2618 viewportIdx = _simd_and_si(vClearMask, vpai);
2619 }
2620
2621 if (!feState.vpTransformDisable)
2622 {
2623 // perspective divide
2624 vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2625 vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2626
2627 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]);
2628 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]);
2629
2630 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]);
2631 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]);
2632
2633 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]);
2634 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
2635
2636 // viewport transform to screen coords
2637 if (state.backendState.readViewportArrayIndex)
2638 {
2639 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2640 }
2641 else
2642 {
2643 viewportTransform<2>(prim, state.vpMatrices);
2644 }
2645 }
2646
2647 // adjust for pixel center location
2648 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2649 prim[0].x = _simd_add_ps(prim[0].x, offset);
2650 prim[0].y = _simd_add_ps(prim[0].y, offset);
2651
2652 prim[1].x = _simd_add_ps(prim[1].x, offset);
2653 prim[1].y = _simd_add_ps(prim[1].y, offset);
2654
2655 BinPostSetupLines(
2656 pDC,
2657 pa,
2658 workerId,
2659 prim,
2660 vRecipW,
2661 primMask,
2662 primID,
2663 viewportIdx);
2664 }
2665
2666 #if USE_SIMD16_FRONTEND
2667 void SIMDCALL BinLines_simd16(
2668 DRAW_CONTEXT *pDC,
2669 PA_STATE& pa,
2670 uint32_t workerId,
2671 simd16vector prim[3],
2672 uint32_t primMask,
2673 simd16scalari primID)
2674 {
2675 const API_STATE& state = GetApiState(pDC);
2676 const SWR_RASTSTATE& rastState = state.rastState;
2677 const SWR_FRONTEND_STATE& feState = state.frontendState;
2678
2679 simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) };
2680
2681 simd16scalari viewportIdx = _simd16_set1_epi32(0);
2682 if (state.backendState.readViewportArrayIndex)
2683 {
2684 simd16vector vpiAttrib[2];
2685 pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
2686
2687 // OOB indices => forced to zero.
2688 simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
2689 vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
2690 simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
2691 simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
2692 viewportIdx = _simd16_and_si(vClearMask, vpai);
2693 }
2694
2695 if (!feState.vpTransformDisable)
2696 {
2697 // perspective divide
2698 vRecipW[0] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[0].w);
2699 vRecipW[1] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[1].w);
2700
2701 prim[0].v[0] = _simd16_mul_ps(prim[0].v[0], vRecipW[0]);
2702 prim[1].v[0] = _simd16_mul_ps(prim[1].v[0], vRecipW[1]);
2703
2704 prim[0].v[1] = _simd16_mul_ps(prim[0].v[1], vRecipW[0]);
2705 prim[1].v[1] = _simd16_mul_ps(prim[1].v[1], vRecipW[1]);
2706
2707 prim[0].v[2] = _simd16_mul_ps(prim[0].v[2], vRecipW[0]);
2708 prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]);
2709
2710 // viewport transform to screen coords
2711 if (state.backendState.readViewportArrayIndex)
2712 {
2713 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2714 }
2715 else
2716 {
2717 viewportTransform<2>(prim, state.vpMatrices);
2718 }
2719 }
2720
2721 // adjust for pixel center location
2722 simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
2723
2724 prim[0].x = _simd16_add_ps(prim[0].x, offset);
2725 prim[0].y = _simd16_add_ps(prim[0].y, offset);
2726
2727 prim[1].x = _simd16_add_ps(prim[1].x, offset);
2728 prim[1].y = _simd16_add_ps(prim[1].y, offset);
2729
2730 BinPostSetupLines_simd16(
2731 pDC,
2732 pa,
2733 workerId,
2734 prim,
2735 vRecipW,
2736 primMask,
2737 primID,
2738 viewportIdx);
2739 }
2740
2741 #endif