swr/rast: Binner fixes for viewport index offset handling
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "binner.h"
30 #include "context.h"
31 #include "frontend.h"
32 #include "conservativeRast.h"
33 #include "pa.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
36 #include "tilemgr.h"
37
38 // Function Prototype
39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
40 void BinPostSetupLinesImpl(
41 DRAW_CONTEXT *pDC,
42 PA_STATE &pa,
43 uint32_t workerId,
44 typename SIMD_T::Vec4 prim[],
45 typename SIMD_T::Float recipW[],
46 uint32_t primMask,
47 typename SIMD_T::Integer const &primID,
48 typename SIMD_T::Integer const &viewportIdx);
49
50 template <typename SIMD_T, uint32_t SIMD_WIDTH>
51 void BinPostSetupPointsImpl(
52 DRAW_CONTEXT *pDC,
53 PA_STATE &pa,
54 uint32_t workerId,
55 typename SIMD_T::Vec4 prim[],
56 uint32_t primMask,
57 typename SIMD_T::Integer const &primID,
58 typename SIMD_T::Integer const &viewportIdx);
59
60 //////////////////////////////////////////////////////////////////////////
61 /// @brief Processes attributes for the backend based on linkage mask and
62 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
63 /// @param pDC - Draw context
64 /// @param pa - Primitive Assembly state
65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
66 /// @param pLinkageMap - maps VS attribute slot to PS slot
67 /// @param triIndex - Triangle to process attributes for
68 /// @param pBuffer - Output result
69 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
70 INLINE void ProcessAttributes(
71 DRAW_CONTEXT *pDC,
72 PA_STATE&pa,
73 uint32_t triIndex,
74 uint32_t primId,
75 float *pBuffer)
76 {
77 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
78 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
79 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
80 uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
81 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
82 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
83
84 static const float constTable[3][4] = {
85 { 0.0f, 0.0f, 0.0f, 0.0f },
86 { 0.0f, 0.0f, 0.0f, 1.0f },
87 { 1.0f, 1.0f, 1.0f, 1.0f }
88 };
89
90 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
91 {
92 uint32_t inputSlot;
93 if (IsSwizzledT::value)
94 {
95 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
96 inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
97
98 }
99 else
100 {
101 inputSlot = backendState.vertexAttribOffset + i;
102 }
103
104 simd4scalar attrib[3]; // triangle attribs (always 4 wide)
105 float* pAttribStart = pBuffer;
106
107 if (HasConstantInterpT::value || IsDegenerate::value)
108 {
109 if (CheckBit(constantInterpMask, i))
110 {
111 uint32_t vid;
112 uint32_t adjustedTriIndex;
113 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
114 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
115 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
116 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
117 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
118
119 switch (topo) {
120 case TOP_QUAD_LIST:
121 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
122 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
123 break;
124 case TOP_QUAD_STRIP:
125 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
126 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
127 break;
128 case TOP_TRIANGLE_STRIP:
129 adjustedTriIndex = triIndex;
130 vid = (triIndex & 1)
131 ? tristripProvokingVertex[provokingVertex]
132 : provokingVertex;
133 break;
134 default:
135 adjustedTriIndex = triIndex;
136 vid = provokingVertex;
137 break;
138 }
139
140 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
141
142 for (uint32_t i = 0; i < NumVertsT::value; ++i)
143 {
144 SIMD128::store_ps(pBuffer, attrib[vid]);
145 pBuffer += 4;
146 }
147 }
148 else
149 {
150 pa.AssembleSingle(inputSlot, triIndex, attrib);
151
152 for (uint32_t i = 0; i < NumVertsT::value; ++i)
153 {
154 SIMD128::store_ps(pBuffer, attrib[i]);
155 pBuffer += 4;
156 }
157 }
158 }
159 else
160 {
161 pa.AssembleSingle(inputSlot, triIndex, attrib);
162
163 for (uint32_t i = 0; i < NumVertsT::value; ++i)
164 {
165 SIMD128::store_ps(pBuffer, attrib[i]);
166 pBuffer += 4;
167 }
168 }
169
170 // pad out the attrib buffer to 3 verts to ensure the triangle
171 // interpolation code in the pixel shader works correctly for the
172 // 3 topologies - point, line, tri. This effectively zeros out the
173 // effect of the missing vertices in the triangle interpolation.
174 for (uint32_t v = NumVertsT::value; v < 3; ++v)
175 {
176 SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
177 pBuffer += 4;
178 }
179
180 // check for constant source overrides
181 if (IsSwizzledT::value)
182 {
183 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
184 if (mask)
185 {
186 DWORD comp;
187 while (_BitScanForward(&comp, mask))
188 {
189 mask &= ~(1 << comp);
190
191 float constantValue = 0.0f;
192 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
193 {
194 case SWR_CONSTANT_SOURCE_CONST_0000:
195 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
196 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
197 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
198 break;
199 case SWR_CONSTANT_SOURCE_PRIM_ID:
200 constantValue = *(float*)&primId;
201 break;
202 }
203
204 // apply constant value to all 3 vertices
205 for (uint32_t v = 0; v < 3; ++v)
206 {
207 pAttribStart[comp + v * 4] = constantValue;
208 }
209 }
210 }
211 }
212 }
213 }
214
215 //////////////////////////////////////////////////////////////////////////
216 /// @brief Gather scissor rect data based on per-prim viewport indices.
217 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
218 /// @param pViewportIndex - array of per-primitive vewport indexes.
219 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
220 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
221 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
222 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
223 //
224 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
225 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
226 simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax)
227 {
228 scisXmin = _simd_set_epi32(
229 pScissorsInFixedPoint[pViewportIndex[0]].xmin,
230 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
231 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
232 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
233 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
234 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
235 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
236 pScissorsInFixedPoint[pViewportIndex[7]].xmin);
237 scisYmin = _simd_set_epi32(
238 pScissorsInFixedPoint[pViewportIndex[0]].ymin,
239 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
240 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
241 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
242 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
243 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
244 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
245 pScissorsInFixedPoint[pViewportIndex[7]].ymin);
246 scisXmax = _simd_set_epi32(
247 pScissorsInFixedPoint[pViewportIndex[0]].xmax,
248 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
249 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
250 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
251 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
252 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
253 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
254 pScissorsInFixedPoint[pViewportIndex[7]].xmax);
255 scisYmax = _simd_set_epi32(
256 pScissorsInFixedPoint[pViewportIndex[0]].ymax,
257 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
258 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
259 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
260 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
261 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
262 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
263 pScissorsInFixedPoint[pViewportIndex[7]].ymax);
264 }
265
266 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
267 simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax)
268 {
269 scisXmin = _simd16_set_epi32(
270 pScissorsInFixedPoint[pViewportIndex[0]].xmin,
271 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
272 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
273 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
274 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
275 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
276 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
277 pScissorsInFixedPoint[pViewportIndex[7]].xmin,
278 pScissorsInFixedPoint[pViewportIndex[8]].xmin,
279 pScissorsInFixedPoint[pViewportIndex[9]].xmin,
280 pScissorsInFixedPoint[pViewportIndex[10]].xmin,
281 pScissorsInFixedPoint[pViewportIndex[11]].xmin,
282 pScissorsInFixedPoint[pViewportIndex[12]].xmin,
283 pScissorsInFixedPoint[pViewportIndex[13]].xmin,
284 pScissorsInFixedPoint[pViewportIndex[14]].xmin,
285 pScissorsInFixedPoint[pViewportIndex[15]].xmin);
286
287 scisYmin = _simd16_set_epi32(
288 pScissorsInFixedPoint[pViewportIndex[0]].ymin,
289 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
290 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
291 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
292 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
293 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
294 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
295 pScissorsInFixedPoint[pViewportIndex[7]].ymin,
296 pScissorsInFixedPoint[pViewportIndex[8]].ymin,
297 pScissorsInFixedPoint[pViewportIndex[9]].ymin,
298 pScissorsInFixedPoint[pViewportIndex[10]].ymin,
299 pScissorsInFixedPoint[pViewportIndex[11]].ymin,
300 pScissorsInFixedPoint[pViewportIndex[12]].ymin,
301 pScissorsInFixedPoint[pViewportIndex[13]].ymin,
302 pScissorsInFixedPoint[pViewportIndex[14]].ymin,
303 pScissorsInFixedPoint[pViewportIndex[15]].ymin);
304
305 scisXmax = _simd16_set_epi32(
306 pScissorsInFixedPoint[pViewportIndex[0]].xmax,
307 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
308 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
309 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
310 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
311 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
312 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
313 pScissorsInFixedPoint[pViewportIndex[7]].xmax,
314 pScissorsInFixedPoint[pViewportIndex[8]].xmax,
315 pScissorsInFixedPoint[pViewportIndex[9]].xmax,
316 pScissorsInFixedPoint[pViewportIndex[10]].xmax,
317 pScissorsInFixedPoint[pViewportIndex[11]].xmax,
318 pScissorsInFixedPoint[pViewportIndex[12]].xmax,
319 pScissorsInFixedPoint[pViewportIndex[13]].xmax,
320 pScissorsInFixedPoint[pViewportIndex[14]].xmax,
321 pScissorsInFixedPoint[pViewportIndex[15]].xmax);
322
323 scisYmax = _simd16_set_epi32(
324 pScissorsInFixedPoint[pViewportIndex[0]].ymax,
325 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
326 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
327 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
328 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
329 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
330 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
331 pScissorsInFixedPoint[pViewportIndex[7]].ymax,
332 pScissorsInFixedPoint[pViewportIndex[8]].ymax,
333 pScissorsInFixedPoint[pViewportIndex[9]].ymax,
334 pScissorsInFixedPoint[pViewportIndex[10]].ymax,
335 pScissorsInFixedPoint[pViewportIndex[11]].ymax,
336 pScissorsInFixedPoint[pViewportIndex[12]].ymax,
337 pScissorsInFixedPoint[pViewportIndex[13]].ymax,
338 pScissorsInFixedPoint[pViewportIndex[14]].ymax,
339 pScissorsInFixedPoint[pViewportIndex[15]].ymax);
340 }
341
342 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
343
344 struct ProcessAttributesChooser
345 {
346 typedef PFN_PROCESS_ATTRIBUTES FuncType;
347
348 template <typename... ArgsB>
349 static FuncType GetFunc()
350 {
351 return ProcessAttributes<ArgsB...>;
352 }
353 };
354
355 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
356 {
357 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
358 }
359
360 //////////////////////////////////////////////////////////////////////////
361 /// @brief Processes enabled user clip distances. Loads the active clip
362 /// distances from the PA, sets up barycentric equations, and
363 /// stores the results to the output buffer
364 /// @param pa - Primitive Assembly state
365 /// @param primIndex - primitive index to process
366 /// @param clipDistMask - mask of enabled clip distances
367 /// @param pUserClipBuffer - buffer to store results
368 template<uint32_t NumVerts>
369 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
370 {
371 DWORD clipDist;
372 uint32_t clipDistMask = state.clipDistanceMask;
373 while (_BitScanForward(&clipDist, clipDistMask))
374 {
375 clipDistMask &= ~(1 << clipDist);
376 uint32_t clipSlot = clipDist >> 2;
377 uint32_t clipComp = clipDist & 0x3;
378 uint32_t clipAttribSlot = clipSlot == 0 ?
379 state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
380
381 simd4scalar primClipDist[3];
382 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
383
384 float vertClipDist[NumVerts];
385 for (uint32_t e = 0; e < NumVerts; ++e)
386 {
387 OSALIGNSIMD(float) aVertClipDist[4];
388 SIMD128::store_ps(aVertClipDist, primClipDist[e]);
389 vertClipDist[e] = aVertClipDist[clipComp];
390 };
391
392 // setup plane equations for barycentric interpolation in the backend
393 float baryCoeff[NumVerts];
394 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
395 for (uint32_t e = 0; e < NumVerts - 1; ++e)
396 {
397 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
398 }
399 baryCoeff[NumVerts - 1] = last;
400
401 for (uint32_t e = 0; e < NumVerts; ++e)
402 {
403 *(pUserClipBuffer++) = baryCoeff[e];
404 }
405 }
406 }
407
408 INLINE
409 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
410 {
411 vTranspose3x8(dst, src0, src1, src2);
412 }
413
414 INLINE
415 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
416 {
417 vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
418 }
419
420 //////////////////////////////////////////////////////////////////////////
421 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
422 /// culling, viewport transform, etc.
423 /// @param pDC - pointer to draw context.
424 /// @param pa - The primitive assembly object.
425 /// @param workerId - thread's worker id. Each thread has a unique id.
426 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
427 /// @param primID - Primitive ID for each triangle.
428 /// @param viewportIdx - viewport array index for each triangle.
429 /// @tparam CT - ConservativeRastFETraits
430 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
431 void SIMDCALL BinTrianglesImpl(
432 DRAW_CONTEXT *pDC,
433 PA_STATE &pa,
434 uint32_t workerId,
435 typename SIMD_T::Vec4 tri[3],
436 uint32_t triMask,
437 typename SIMD_T::Integer const &primID)
438 {
439 SWR_CONTEXT *pContext = pDC->pContext;
440
441 AR_BEGIN(FEBinTriangles, pDC->drawId);
442
443 const API_STATE& state = GetApiState(pDC);
444 const SWR_RASTSTATE& rastState = state.rastState;
445 const SWR_FRONTEND_STATE& feState = state.frontendState;
446
447 MacroTileMgr *pTileMgr = pDC->pTileMgr;
448
449 typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
450 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
451 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
452
453 typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
454 typename SIMD_T::Vec4 vpiAttrib[3];
455 typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
456
457 if (state.backendState.readViewportArrayIndex)
458 {
459 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
460
461 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
462 }
463
464
465 if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
466 {
467 // OOB indices => forced to zero.
468 vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
469 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
470 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
471 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
472 }
473 else
474 {
475 viewportIdx = vpai;
476 }
477
478 if (feState.vpTransformDisable)
479 {
480 // RHW is passed in directly when VP transform is disabled
481 vRecipW0 = tri[0].v[3];
482 vRecipW1 = tri[1].v[3];
483 vRecipW2 = tri[2].v[3];
484 }
485 else
486 {
487 // Perspective divide
488 vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
489 vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
490 vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
491
492 tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
493 tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
494 tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
495
496 tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
497 tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
498 tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
499
500 tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
501 tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
502 tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
503
504 // Viewport transform to screen space coords
505 if (state.backendState.readViewportArrayIndex)
506 {
507 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
508 }
509 else
510 {
511 viewportTransform<3>(tri, state.vpMatrices);
512 }
513 }
514
515 // Adjust for pixel center location
516 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
517
518 tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
519 tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
520
521 tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
522 tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
523
524 tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
525 tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
526
527 // Set vXi, vYi to required fixed point precision
528 typename SIMD_T::Integer vXi[3], vYi[3];
529 FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
530
531 // triangle setup
532 typename SIMD_T::Integer vAi[3], vBi[3];
533 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
534
535 // determinant
536 typename SIMD_T::Integer vDet[2];
537 calcDeterminantIntVertical(vAi, vBi, vDet);
538
539 // cull zero area
540 uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
541 uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
542
543 uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
544
545 // don't cull degenerate triangles if we're conservatively rasterizing
546 uint32_t origTriMask = triMask;
547 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
548 {
549 triMask &= ~cullZeroAreaMask;
550 }
551
552 // determine front winding tris
553 // CW +det
554 // CCW det < 0;
555 // 0 area triangles are marked as backfacing regardless of winding order,
556 // which is required behavior for conservative rast and wireframe rendering
557 uint32_t frontWindingTris;
558 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
559 {
560 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
561 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
562 }
563 else
564 {
565 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
566 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
567 }
568 frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
569
570 // cull
571 uint32_t cullTris;
572 switch ((SWR_CULLMODE)rastState.cullMode)
573 {
574 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
575 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
576 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
577 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
578 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
579 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
580 }
581
582 triMask &= ~cullTris;
583
584 if (origTriMask ^ triMask)
585 {
586 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
587 }
588
589 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
590 // compute per tri backface
591 uint32_t frontFaceMask = frontWindingTris;
592 uint32_t *pPrimID = (uint32_t *)&primID;
593 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
594 DWORD triIndex = 0;
595
596 uint32_t edgeEnable;
597 PFN_WORK_FUNC pfnWork;
598 if (CT::IsConservativeT::value)
599 {
600 // determine which edges of the degenerate tri, if any, are valid to rasterize.
601 // used to call the appropriate templated rasterizer function
602 if (cullZeroAreaMask > 0)
603 {
604 // e0 = v1-v0
605 const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
606 const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
607
608 uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
609
610 // e1 = v2-v1
611 const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
612 const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
613
614 uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
615
616 // e2 = v0-v2
617 // if v0 == v1 & v1 == v2, v0 == v2
618 uint32_t e2Mask = e0Mask & e1Mask;
619 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
620
621 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
622 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
623 e0Mask = pdep_u32(e0Mask, 0x00249249);
624
625 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
626 e1Mask = pdep_u32(e1Mask, 0x00492492);
627
628 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
629 e2Mask = pdep_u32(e2Mask, 0x00924924);
630
631 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
632 }
633 else
634 {
635 edgeEnable = 0x00FFFFFF;
636 }
637 }
638 else
639 {
640 // degenerate triangles won't be sent to rasterizer; just enable all edges
641 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
642 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
643 }
644
645 SIMDBBOX_T<SIMD_T> bbox;
646
647 if (!triMask)
648 {
649 goto endBinTriangles;
650 }
651
652 // Calc bounding box of triangles
653 calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
654
655 // determine if triangle falls between pixel centers and discard
656 // only discard for non-MSAA case and when conservative rast is disabled
657 // (xmin + 127) & ~255
658 // (xmax + 128) & ~255
659 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
660 (!CT::IsConservativeT::value))
661 {
662 origTriMask = triMask;
663
664 int cullCenterMask;
665
666 {
667 typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
668 xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
669 typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
670 xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
671
672 typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
673
674 typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
675 ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
676 typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
677 ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
678
679 typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
680
681 vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
682 cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
683 }
684
685 triMask &= ~cullCenterMask;
686
687 if (origTriMask ^ triMask)
688 {
689 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
690 }
691 }
692
693 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
694 // Gather the AOS effective scissor rects based on the per-prim VP index.
695 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
696 {
697 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
698
699 if (state.backendState.readViewportArrayIndex)
700 {
701 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
702 }
703 else // broadcast fast path for non-VPAI case.
704 {
705 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
706 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
707 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
708 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
709 }
710
711 // Make triangle bbox inclusive
712 bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
713 bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
714
715 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
716 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
717 bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
718 bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
719 }
720
721 if (CT::IsConservativeT::value)
722 {
723 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
724 // some area. Bump the xmax/ymax edges out
725
726 typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
727 bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
728
729 typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
730 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
731 }
732
733 // Cull tris completely outside scissor
734 {
735 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
736 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
737 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
738 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
739 triMask = triMask & ~maskOutsideScissor;
740 }
741
742 endBinTriangles:
743
744
745 // Send surviving triangles to the line or point binner based on fill mode
746 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
747 {
748 // Simple non-conformant wireframe mode, useful for debugging
749 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
750 typename SIMD_T::Vec4 line[2];
751 typename SIMD_T::Float recipW[2];
752
753 line[0] = tri[0];
754 line[1] = tri[1];
755 recipW[0] = vRecipW0;
756 recipW[1] = vRecipW1;
757
758 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
759
760 line[0] = tri[1];
761 line[1] = tri[2];
762 recipW[0] = vRecipW1;
763 recipW[1] = vRecipW2;
764
765 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
766
767 line[0] = tri[2];
768 line[1] = tri[0];
769 recipW[0] = vRecipW2;
770 recipW[1] = vRecipW0;
771
772 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
773
774 AR_END(FEBinTriangles, 1);
775 return;
776 }
777 else if (rastState.fillMode == SWR_FILLMODE_POINT)
778 {
779 // Bin 3 points
780 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
781 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
782 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
783
784 AR_END(FEBinTriangles, 1);
785 return;
786 }
787
788 // Convert triangle bbox to macrotile units.
789 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
790 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
791 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
792 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
793
794 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
795
796 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
797 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
798 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
799 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
800
801 // transpose verts needed for backend
802 /// @todo modify BE to take non-transformed verts
803 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
804 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
805 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
806 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
807
808 TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
809 TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
810 TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
811 TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
812
813 // store render target array index
814 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
815 if (state.backendState.readRenderTargetArrayIndex)
816 {
817 typename SIMD_T::Vec4 vRtai[3];
818 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
819 typename SIMD_T::Integer vRtaii;
820 vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
821 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
822 }
823 else
824 {
825 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
826 }
827
828
829 // scan remaining valid triangles and bin each separately
830 while (_BitScanForward(&triIndex, triMask))
831 {
832 uint32_t linkageCount = state.backendState.numAttributes;
833 uint32_t numScalarAttribs = linkageCount * 4;
834
835 BE_WORK work;
836 work.type = DRAW;
837
838 bool isDegenerate;
839 if (CT::IsConservativeT::value)
840 {
841 // only rasterize valid edges if we have a degenerate primitive
842 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
843 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
844 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
845
846 // Degenerate triangles are required to be constant interpolated
847 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
848 }
849 else
850 {
851 isDegenerate = false;
852 work.pfnWork = pfnWork;
853 }
854
855 // Select attribute processor
856 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
857 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
858
859 TRIANGLE_WORK_DESC &desc = work.desc.tri;
860
861 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
862 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
863 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
864
865 auto pArena = pDC->pArena;
866 SWR_ASSERT(pArena != nullptr);
867
868 // store active attribs
869 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
870 desc.pAttribs = pAttribs;
871 desc.numAttribs = linkageCount;
872 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
873
874 // store triangle vertex data
875 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
876
877 SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
878 SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
879 SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
880 SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
881
882 // store user clip distances
883 if (state.backendState.clipDistanceMask)
884 {
885 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
886 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
887 ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
888 }
889
890 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
891 {
892 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
893 {
894 #if KNOB_ENABLE_TOSS_POINTS
895 if (!KNOB_TOSS_SETUP_TRIS)
896 #endif
897 {
898 pTileMgr->enqueue(x, y, &work);
899 }
900 }
901 }
902
903 triMask &= ~(1 << triIndex);
904 }
905
906 AR_END(FEBinTriangles, 1);
907 }
908
909 template <typename CT>
910 void BinTriangles(
911 DRAW_CONTEXT *pDC,
912 PA_STATE &pa,
913 uint32_t workerId,
914 simdvector tri[3],
915 uint32_t triMask,
916 simdscalari const &primID)
917 {
918 BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
919 }
920
921 #if USE_SIMD16_FRONTEND
922 template <typename CT>
923 void SIMDCALL BinTriangles_simd16(
924 DRAW_CONTEXT *pDC,
925 PA_STATE &pa,
926 uint32_t workerId,
927 simd16vector tri[3],
928 uint32_t triMask,
929 simd16scalari const &primID)
930 {
931 BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
932 }
933
934 #endif
935 struct FEBinTrianglesChooser
936 {
937 typedef PFN_PROCESS_PRIMS FuncType;
938
939 template <typename... ArgsB>
940 static FuncType GetFunc()
941 {
942 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
943 }
944 };
945
946 // Selector for correct templated BinTrinagles function
947 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
948 {
949 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
950 }
951
952 #if USE_SIMD16_FRONTEND
953 struct FEBinTrianglesChooser_simd16
954 {
955 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
956
957 template <typename... ArgsB>
958 static FuncType GetFunc()
959 {
960 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
961 }
962 };
963
964 // Selector for correct templated BinTrinagles function
965 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
966 {
967 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
968 }
969
970 #endif
971
//////////////////////////////////////////////////////////////////////////
/// @brief Bin a SIMD batch of points to macrotiles once their positions
///        have been set up by the caller. Positions are converted to fixed
///        point here. When CanUseSimplePoints(pDC) holds, each point is
///        enqueued to exactly one macrotile with RasterizeSimplePoint;
///        otherwise each point is bloated to a box of its point size,
///        intersected with the scissor, and enqueued to every macrotile the
///        box touches with RasterizeTriPoint.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object; used to assemble the SGV slot
///        and passed on to the attribute / clip distance processors.
/// @param workerId - thread's worker id. Not referenced by name in this
///        function. NOTE(review): the AR_* macros may consume it - confirm.
/// @param prim - point position data; only prim[0] is read.
/// @param primMask - one bit per SIMD lane; set bits are the points still to bin.
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - Viewport Array Index for each point.
972 template <typename SIMD_T, uint32_t SIMD_WIDTH>
973 void BinPostSetupPointsImpl(
974 DRAW_CONTEXT *pDC,
975 PA_STATE &pa,
976 uint32_t workerId,
977 typename SIMD_T::Vec4 prim[],
978 uint32_t primMask,
979 typename SIMD_T::Integer const &primID,
980 typename SIMD_T::Integer const &viewportIdx)
981 {
// NOTE(review): pContext is not used by name below; presumably the
// AR_BEGIN / AR_END macros reference it - confirm before removing.
982 SWR_CONTEXT *pContext = pDC->pContext;
983
984 AR_BEGIN(FEBinPoints, pDC->drawId);
985
986 typename SIMD_T::Vec4 &primVerts = prim[0];
987
988 const API_STATE& state = GetApiState(pDC);
989 const SWR_RASTSTATE& rastState = state.rastState;
// View the SIMD viewport-index register as an array of per-lane scalars
// so the scalar binning loops can index it by lane.
990 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
991
992 // Select attribute processor
993 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
994 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
995
996 // convert to fixed point
997 typename SIMD_T::Integer vXi, vYi;
998
999 vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
1000 vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
1001
1002 if (CanUseSimplePoints(pDC))
1003 {
1004 // adjust for ymin-xmin rule
1005 vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
1006 vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
1007
1008 // cull points off the ymin-xmin edge of the viewport
// (assuming movemask_ps gathers the per-lane sign bits, as the x86
// intrinsic of the same name does, lanes whose coordinate is negative
// are removed from primMask)
1009 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
1010 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1011
1012 // compute macro tile coordinates
1013 typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1014 typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1015
1016 OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1017
1018 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
1019 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
1020
1021 // compute raster tile coordinates
1022 typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1023 typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1024
1025 // compute raster tile relative x,y for coverage mask
1026 typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1027 typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1028
1029 typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1030 typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1031
// Spill the SIMD results to per-lane arrays; the loop below reads them
// one point at a time.
1032 OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1033 OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1034
1035 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
1036 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
1037
1038 OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1039 OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1040
1041 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
1042 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
1043
1044 OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1045 SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1046
1047 // store render target array index
// (all zeros when the backend state does not read it)
1048 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1049 if (state.backendState.readRenderTargetArrayIndex)
1050 {
1051 typename SIMD_T::Vec4 vRtai;
1052 pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
1053 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
1054 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1055 }
1056 else
1057 {
1058 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1059 }
1060
1061 uint32_t *pPrimID = (uint32_t *)&primID;
1062 DWORD primIndex = 0;
1063
1064 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1065
1066 // scan remaining valid points and bin each separately
1067 while (_BitScanForward(&primIndex, primMask))
1068 {
1069 uint32_t linkageCount = backendState.numAttributes;
1070 uint32_t numScalarAttribs = linkageCount * 4;
1071
1072 BE_WORK work;
1073 work.type = DRAW;
1074
1075 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1076
1077 // points are always front facing
1078 desc.triFlags.frontFacing = 1;
1079 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1080 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1081
1082 work.pfnWork = RasterizeSimplePoint;
1083
1084 auto pArena = pDC->pArena;
1085 SWR_ASSERT(pArena != nullptr);
1086
1087 // store attributes
1088 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1089 desc.pAttribs = pAttribs;
1090 desc.numAttribs = linkageCount;
1091
1092 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1093
1094 // store raster tile aligned x, y, perspective correct z
// x and y are written as raw uint32_t bit patterns into the float
// buffer; only the first three of the four allocated floats are set.
1095 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1096 desc.pTriBuffer = pTriBuffer;
1097 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1098 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1099 *pTriBuffer = aZ[primIndex];
1100
1101 uint32_t tX = aTileRelativeX[primIndex];
1102 uint32_t tY = aTileRelativeY[primIndex];
1103
1104 // pack the relative x,y into the coverageMask, the rasterizer will
1105 // generate the true coverage mask from it
1106 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1107
1108 // bin it
1109 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1110 #if KNOB_ENABLE_TOSS_POINTS
1111 if (!KNOB_TOSS_SETUP_TRIS)
1112 #endif
1113 {
1114 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1115 }
1116
// clear this lane's bit so the next _BitScanForward moves on
1117 primMask &= ~(1 << primIndex);
1118 }
1119 }
1120 else
1121 {
1122 // non simple points need to be potentially binned to multiple macro tiles
1123 typename SIMD_T::Float vPointSize;
1124
// Per-point size comes from the SGV slot when pointParam is set,
// otherwise the single rasterizer-state point size is broadcast.
1125 if (rastState.pointParam)
1126 {
1127 typename SIMD_T::Vec4 size[3];
1128 pa.Assemble(VERTEX_SGV_SLOT, size);
1129 vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1130 }
1131 else
1132 {
1133 vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1134 }
1135
1136 // bloat point to bbox
1137 SIMDBBOX_T<SIMD_T> bbox;
1138
1139 bbox.xmin = bbox.xmax = vXi;
1140 bbox.ymin = bbox.ymax = vYi;
1141
1142 typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1143 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1144
1145 bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1146 bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1147 bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1148 bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1149
1150 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1151 // Gather the AOS effective scissor rects based on the per-prim VP index.
1152 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1153 {
1154 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1155
1156 if (state.backendState.readViewportArrayIndex)
1157 {
1158 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1159 }
1160 else // broadcast fast path for non-VPAI case.
1161 {
1162 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1163 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1164 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1165 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1166 }
1167
1168 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1169 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1170 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1171 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1172 }
1173
1174 // Cull bloated points completely outside scissor
// (after the intersection an empty box shows up as min > max)
1175 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1176 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1177 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1178 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1179 primMask = primMask & ~maskOutsideScissor;
1180
1181 // Convert bbox to macrotile units.
1182 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1183 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1184 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1185 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1186
1187 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1188
1189 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1190 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1191 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1192 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1193
1194 // store render target array index
// (all zeros when the backend state does not read it)
1195 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1196 if (state.backendState.readRenderTargetArrayIndex)
1197 {
1198 typename SIMD_T::Vec4 vRtai[2];
1199 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1200 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1201 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1202 }
1203 else
1204 {
1205 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1206 }
1207
1208 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1209 SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1210
1211 uint32_t *pPrimID = (uint32_t *)&primID;
1212
1213 OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1214 OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1215 OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1216
1217 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1218 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1219 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1220
1221 // scan remaining valid prims and bin each separately
1222 const SWR_BACKEND_STATE& backendState = state.backendState;
1223 DWORD primIndex;
1224 while (_BitScanForward(&primIndex, primMask))
1225 {
1226 uint32_t linkageCount = backendState.numAttributes;
1227 uint32_t numScalarAttribs = linkageCount * 4;
1228
1229 BE_WORK work;
1230 work.type = DRAW;
1231
1232 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1233
1234 desc.triFlags.frontFacing = 1;
1235 desc.triFlags.pointSize = aPointSize[primIndex];
1236 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1237 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1238
1239 work.pfnWork = RasterizeTriPoint;
1240
1241 auto pArena = pDC->pArena;
1242 SWR_ASSERT(pArena != nullptr);
1243
1244 // store active attribs
1245 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1246 desc.numAttribs = linkageCount;
1247 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1248
1249 // store point vertex data
// (floating-point x, y, z; the fourth allocated float is left unset)
1250 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1251 desc.pTriBuffer = pTriBuffer;
1252 *pTriBuffer++ = aPrimVertsX[primIndex];
1253 *pTriBuffer++ = aPrimVertsY[primIndex];
1254 *pTriBuffer = aPrimVertsZ[primIndex];
1255
1256 // store user clip distances
1257 if (backendState.clipDistanceMask)
1258 {
1259 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1260 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1261 float dists[8];
1262 float one = 1.0f;
1263 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
// Each clip distance takes three floats in the buffer: two zeros
// followed by the point's own distance.
// NOTE(review): presumably this mirrors the three-coefficient layout
// used for triangles - confirm against the backend.
1264 for (uint32_t i = 0; i < numClipDist; i++) {
1265 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1266 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1267 desc.pUserClipBuffer[3 * i + 2] = dists[i];
1268 }
1269 }
1270
// enqueue the same work item to every macrotile covered by the
// clipped bounding box (inclusive on both ends)
1271 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1272 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1273 {
1274 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1275 {
1276 #if KNOB_ENABLE_TOSS_POINTS
1277 if (!KNOB_TOSS_SETUP_TRIS)
1278 #endif
1279 {
1280 pTileMgr->enqueue(x, y, &work);
1281 }
1282 }
1283 }
1284
// clear this lane's bit so the next _BitScanForward moves on
1285 primMask &= ~(1 << primIndex);
1286 }
1287 }
1288
1289 AR_END(FEBinPoints, 1);
1290 }
1291
1292 //////////////////////////////////////////////////////////////////////////
1293 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1294 /// @param pDC - pointer to draw context.
1295 /// @param pa - The primitive assembly object.
1296 /// @param workerId - thread's worker id. Even thread has a unique id.
1297 /// @param tri - Contains point position data for SIMDs worth of points.
1298 /// @param primID - Primitive ID for each point.
1299 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1300 void BinPointsImpl(
1301 DRAW_CONTEXT *pDC,
1302 PA_STATE &pa,
1303 uint32_t workerId,
1304 typename SIMD_T::Vec4 prim[3],
1305 uint32_t primMask,
1306 typename SIMD_T::Integer const &primID)
1307 {
1308 const API_STATE& state = GetApiState(pDC);
1309 const SWR_FRONTEND_STATE& feState = state.frontendState;
1310 const SWR_RASTSTATE& rastState = state.rastState;
1311
1312 // Read back viewport index if required
1313 typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
1314 typename SIMD_T::Vec4 vpiAttrib[1];
1315 typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
1316
1317 if (state.backendState.readViewportArrayIndex)
1318 {
1319 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1320
1321 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1322 }
1323
1324
1325 if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1326 {
1327 // OOB indices => forced to zero.
1328 vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
1329 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1330 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1331 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1332 }
1333 else
1334 {
1335 viewportIdx = vpai;
1336 }
1337
1338 if (!feState.vpTransformDisable)
1339 {
1340 // perspective divide
1341 typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1342
1343 prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1344 prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1345 prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1346
1347 // viewport transform to screen coords
1348 if (state.backendState.readViewportArrayIndex)
1349 {
1350 viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1351 }
1352 else
1353 {
1354 viewportTransform<1>(prim, state.vpMatrices);
1355 }
1356 }
1357
1358 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1359
1360 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1361 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1362
1363 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1364 pDC,
1365 pa,
1366 workerId,
1367 prim,
1368 primMask,
1369 primID,
1370 viewportIdx);
1371 }
1372
1373 void BinPoints(
1374 DRAW_CONTEXT *pDC,
1375 PA_STATE &pa,
1376 uint32_t workerId,
1377 simdvector prim[3],
1378 uint32_t primMask,
1379 simdscalari const &primID)
1380 {
1381 BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1382 pDC,
1383 pa,
1384 workerId,
1385 prim,
1386 primMask,
1387 primID);
1388 }
1389
1390 #if USE_SIMD16_FRONTEND
1391 void SIMDCALL BinPoints_simd16(
1392 DRAW_CONTEXT *pDC,
1393 PA_STATE &pa,
1394 uint32_t workerId,
1395 simd16vector prim[3],
1396 uint32_t primMask,
1397 simd16scalari const &primID)
1398 {
1399 BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1400 pDC,
1401 pa,
1402 workerId,
1403 prim,
1404 primMask,
1405 primID);
1406 }
1407
1408 #endif
1409 //////////////////////////////////////////////////////////////////////////
1410 /// @brief Bin SIMD lines to the backend. Converts the endpoints to fixed
///        point, culls zero-length lines, builds a per-line bounding box
///        bloated by the line width, intersects it with the scissor, and
///        enqueues a RasterizeLine work item to every macrotile the box covers.
1411 /// @param pDC - pointer to draw context.
1412 /// @param pa - The primitive assembly object.
1413 /// @param workerId - thread's worker id. Every thread has a unique id.
///        Not referenced by name in this function.
1414 /// @param prim - Contains line position data for SIMDs worth of lines; prim[0] and prim[1] are the two endpoints.
/// @param recipW - recipW[0] and recipW[1] are stored alongside the endpoint
///        positions in the triangle buffer and passed to the clip distance setup.
/// @param primMask - one bit per SIMD lane; set bits are the lines still to bin.
1415 /// @param primID - Primitive ID for each line.
1416 /// @param viewportIdx - Viewport Array Index for each line.
1417 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1418 void BinPostSetupLinesImpl(
1419 DRAW_CONTEXT *pDC,
1420 PA_STATE &pa,
1421 uint32_t workerId,
1422 typename SIMD_T::Vec4 prim[],
1423 typename SIMD_T::Float recipW[],
1424 uint32_t primMask,
1425 typename SIMD_T::Integer const &primID,
1426 typename SIMD_T::Integer const &viewportIdx)
1427 {
// NOTE(review): pContext is not used by name below; presumably the
// AR_BEGIN / AR_END macros reference it - confirm before removing.
1428 SWR_CONTEXT *pContext = pDC->pContext;
1429
1430 AR_BEGIN(FEBinLines, pDC->drawId);
1431
1432 const API_STATE &state = GetApiState(pDC);
1433 const SWR_RASTSTATE &rastState = state.rastState;
1434
1435 // Select attribute processor
1436 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1437 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1438
1439 typename SIMD_T::Float &vRecipW0 = recipW[0];
1440 typename SIMD_T::Float &vRecipW1 = recipW[1];
1441
1442 // convert to fixed point
1443 typename SIMD_T::Integer vXi[2], vYi[2];
1444
1445 vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1446 vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1447 vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1448 vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1449
1450 // compute x-major vs y-major mask
// (a line is y-major when its y extent is strictly greater than its x extent)
1451 typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1452 typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1453 typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1454 uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1455
1456 // cull zero-length lines
// (both the x and the y extent are zero in fixed point)
1457 typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1458 vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1459
1460 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1461
// View the SIMD registers as arrays of per-lane scalars so the scalar
// binning loop can index them by lane.
1462 uint32_t *pPrimID = (uint32_t *)&primID;
1463 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1464
1465 // Calc bounding box of lines
1466 SIMDBBOX_T<SIMD_T> bbox;
1467 bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1468 bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1469 bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1470 bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1471
1472 // bloat bbox by line width along minor axis
1473 typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1474 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1475
1476 SIMDBBOX_T<SIMD_T> bloatBox;
1477
1478 bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1479 bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1480 bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1481 bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1482
// Note the swapped operand order between the x and y blends.
// NOTE(review): assuming blendv_epi32 picks its second operand where the
// mask is set, y-major lines get the bloated x range and x-major lines get
// the bloated y range - confirm against the SIMD library.
1483 bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1484 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1485 bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1486 bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1487
1488 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1489 {
1490 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1491
1492 if (state.backendState.readViewportArrayIndex)
1493 {
1494 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1495 }
1496 else // broadcast fast path for non-VPAI case.
1497 {
1498 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1499 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1500 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1501 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1502 }
1503
1504 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1505 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1506 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1507 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1508 }
1509
1510 // Cull prims completely outside scissor
// (after the intersection an empty box shows up as min > max)
1511 {
1512 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1513 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1514 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1515 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1516 primMask = primMask & ~maskOutsideScissor;
1517 }
1518
1519 // transpose verts needed for backend
1520 /// @todo modify BE to take non-transformed verts
// NOTE(review): these arrays are declared ahead of the early-out below,
// presumably so that the goto does not jump across their declarations.
1521 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1522 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1523 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1524 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1525
// nothing left to bin: skip straight to AR_END
1526 if (!primMask)
1527 {
1528 goto endBinLines;
1529 }
1530
1531 // Convert line bbox to macrotile units.
1532 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1533 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1534 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1535 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1536
1537 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1538
1539 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1540 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1541 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1542 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1543
// A line has only two vertices, so zeros are passed for the third input.
1544 TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1545 TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1546 TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1547 TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
1548
1549 // store render target array index
// (all zeros when the backend state does not read it)
1550 OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1551 if (state.backendState.readRenderTargetArrayIndex)
1552 {
1553 typename SIMD_T::Vec4 vRtai[2];
1554 pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1555 typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1556 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1557 }
1558 else
1559 {
1560 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1561 }
1562
1563 // scan remaining valid prims and bin each separately
1564 DWORD primIndex;
1565 while (_BitScanForward(&primIndex, primMask))
1566 {
1567 uint32_t linkageCount = state.backendState.numAttributes;
1568 uint32_t numScalarAttribs = linkageCount * 4;
1569
1570 BE_WORK work;
1571 work.type = DRAW;
1572
1573 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1574
1575 desc.triFlags.frontFacing = 1;
1576 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1577 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1578 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1579
1580 work.pfnWork = RasterizeLine;
1581
1582 auto pArena = pDC->pArena;
1583 SWR_ASSERT(pArena != nullptr);
1584
1585 // store active attribs
1586 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1587 desc.numAttribs = linkageCount;
1588 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1589
1590 // store line vertex data
// (four rows of four floats: x, y, z and recipW for this line)
1591 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1592
1593 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1594 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1595 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1596 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1597
1598 // store user clip distances
// (two floats are reserved per enabled clip distance; the recipW row of
// the triangle buffer is handed to ProcessUserClipDist)
1599 if (state.backendState.clipDistanceMask)
1600 {
1601 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1602 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1603 ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1604 }
1605
// enqueue the same work item to every macrotile covered by the
// clipped bounding box (inclusive on both ends)
1606 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1607 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1608 {
1609 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1610 {
1611 #if KNOB_ENABLE_TOSS_POINTS
1612 if (!KNOB_TOSS_SETUP_TRIS)
1613 #endif
1614 {
1615 pTileMgr->enqueue(x, y, &work);
1616 }
1617 }
1618 }
1619
// clear this lane's bit so the next _BitScanForward moves on
1620 primMask &= ~(1 << primIndex);
1621 }
1622
1623 endBinLines:
1624
1625 AR_END(FEBinLines, 1);
1626 }
1627
1628 //////////////////////////////////////////////////////////////////////////
1629 /// @brief Bin SIMD lines to the backend.
1630 /// @param pDC - pointer to draw context.
1631 /// @param pa - The primitive assembly object.
1632 /// @param workerId - thread's worker id. Every thread has a unique id.
1633 /// @param prim - Contains line position data for a SIMD's worth of lines.
1634 /// @param primID - Primitive ID for each line.
1635 /// @param viewportIdx - Viewport Array Index for each line.
1636 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1637 void SIMDCALL BinLinesImpl(
1638 DRAW_CONTEXT *pDC,
1639 PA_STATE &pa,
1640 uint32_t workerId,
1641 typename SIMD_T::Vec4 prim[3],
1642 uint32_t primMask,
1643 typename SIMD_T::Integer const &primID)
1644 {
1645 const API_STATE& state = GetApiState(pDC);
1646 const SWR_RASTSTATE& rastState = state.rastState;
1647 const SWR_FRONTEND_STATE& feState = state.frontendState;
1648
1649 typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1650
1651 typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
1652 typename SIMD_T::Vec4 vpiAttrib[2];
1653 typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
1654
1655 if (state.backendState.readViewportArrayIndex)
1656 {
1657 pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1658 vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1659 }
1660
1661
1662 if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1663 {
1664 // OOB indices => forced to zero.
1665 vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
1666 typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1667 typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1668 viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1669 }
1670
1671 if (!feState.vpTransformDisable)
1672 {
1673 // perspective divide
1674 vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1675 vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1676
1677 prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1678 prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1679
1680 prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1681 prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1682
1683 prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1684 prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1685
1686 // viewport transform to screen coords
1687 if (state.backendState.readViewportArrayIndex)
1688 {
1689 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1690 }
1691 else
1692 {
1693 viewportTransform<2>(prim, state.vpMatrices);
1694 }
1695 }
1696
1697 // adjust for pixel center location
1698 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1699
1700 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1701 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1702
1703 prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1704 prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1705
1706 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1707 pDC,
1708 pa,
1709 workerId,
1710 prim,
1711 vRecipW,
1712 primMask,
1713 primID,
1714 viewportIdx);
1715 }
1716
1717 void BinLines(
1718 DRAW_CONTEXT *pDC,
1719 PA_STATE &pa,
1720 uint32_t workerId,
1721 simdvector prim[],
1722 uint32_t primMask,
1723 simdscalari const &primID)
1724 {
1725 BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1726 }
1727
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Bin a SIMD16 batch of lines; non-template entry point.
///
/// Only compiled when USE_SIMD16_FRONTEND is enabled. Forwards every
/// argument unchanged to the BinLinesImpl instantiation for SIMD512 with a
/// width of KNOB_SIMD16_WIDTH.
///
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id.
/// @param prim - Line position data for the batch.
/// @param primMask - Primitive mask, forwarded as-is.
/// @param primID - Primitive ID for each line.
void SIMDCALL BinLines_simd16(
    DRAW_CONTEXT *pDC,
    PA_STATE &pa,
    uint32_t workerId,
    simd16vector prim[3],
    uint32_t primMask,
    simd16scalari const &primID)
{
    BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(
        pDC,
        pa,
        workerId,
        prim,
        primMask,
        primID);
}

#endif