swr/rast: More precise user clip distance interpolation
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "binner.h"
30 #include "context.h"
31 #include "frontend.h"
32 #include "conservativeRast.h"
33 #include "pa.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
36 #include "tilemgr.h"
37
38 // Function Prototype
39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
40 void BinPostSetupLinesImpl(
41 DRAW_CONTEXT *pDC,
42 PA_STATE &pa,
43 uint32_t workerId,
44 typename SIMD_T::Vec4 prim[],
45 typename SIMD_T::Float recipW[],
46 uint32_t primMask,
47 typename SIMD_T::Integer const &primID,
48 typename SIMD_T::Integer const &viewportIdx,
49 typename SIMD_T::Integer const &rtIdx);
50
51 template <typename SIMD_T, uint32_t SIMD_WIDTH>
52 void BinPostSetupPointsImpl(
53 DRAW_CONTEXT *pDC,
54 PA_STATE &pa,
55 uint32_t workerId,
56 typename SIMD_T::Vec4 prim[],
57 uint32_t primMask,
58 typename SIMD_T::Integer const &primID,
59 typename SIMD_T::Integer const &viewportIdx,
60 typename SIMD_T::Integer const &rtIdx);
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief Processes attributes for the backend based on linkage mask and
64 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
65 /// @param pDC - Draw context
66 /// @param pa - Primitive Assembly state
67 /// @param linkageMask - Specifies which VS outputs are routed to PS.
68 /// @param pLinkageMap - maps VS attribute slot to PS slot
69 /// @param triIndex - Triangle to process attributes for
70 /// @param pBuffer - Output result
71 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
72 INLINE void ProcessAttributes(
73 DRAW_CONTEXT *pDC,
74 PA_STATE&pa,
75 uint32_t triIndex,
76 uint32_t primId,
77 float *pBuffer)
78 {
79 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
80 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
81 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
82 uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
83 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
84 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
85
86 static const float constTable[3][4] = {
87 { 0.0f, 0.0f, 0.0f, 0.0f },
88 { 0.0f, 0.0f, 0.0f, 1.0f },
89 { 1.0f, 1.0f, 1.0f, 1.0f }
90 };
91
92 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
93 {
94 uint32_t inputSlot;
95 if (IsSwizzledT::value)
96 {
97 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
98 inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
99
100 }
101 else
102 {
103 inputSlot = backendState.vertexAttribOffset + i;
104 }
105
106 simd4scalar attrib[3]; // triangle attribs (always 4 wide)
107 float* pAttribStart = pBuffer;
108
109 if (HasConstantInterpT::value || IsDegenerate::value)
110 {
111 if (CheckBit(constantInterpMask, i))
112 {
113 uint32_t vid;
114 uint32_t adjustedTriIndex;
115 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
116 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
117 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
118 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
119 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
120
121 switch (topo) {
122 case TOP_QUAD_LIST:
123 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
124 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
125 break;
126 case TOP_QUAD_STRIP:
127 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
128 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
129 break;
130 case TOP_TRIANGLE_STRIP:
131 adjustedTriIndex = triIndex;
132 vid = (triIndex & 1)
133 ? tristripProvokingVertex[provokingVertex]
134 : provokingVertex;
135 break;
136 default:
137 adjustedTriIndex = triIndex;
138 vid = provokingVertex;
139 break;
140 }
141
142 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
143
144 for (uint32_t i = 0; i < NumVertsT::value; ++i)
145 {
146 SIMD128::store_ps(pBuffer, attrib[vid]);
147 pBuffer += 4;
148 }
149 }
150 else
151 {
152 pa.AssembleSingle(inputSlot, triIndex, attrib);
153
154 for (uint32_t i = 0; i < NumVertsT::value; ++i)
155 {
156 SIMD128::store_ps(pBuffer, attrib[i]);
157 pBuffer += 4;
158 }
159 }
160 }
161 else
162 {
163 pa.AssembleSingle(inputSlot, triIndex, attrib);
164
165 for (uint32_t i = 0; i < NumVertsT::value; ++i)
166 {
167 SIMD128::store_ps(pBuffer, attrib[i]);
168 pBuffer += 4;
169 }
170 }
171
172 // pad out the attrib buffer to 3 verts to ensure the triangle
173 // interpolation code in the pixel shader works correctly for the
174 // 3 topologies - point, line, tri. This effectively zeros out the
175 // effect of the missing vertices in the triangle interpolation.
176 for (uint32_t v = NumVertsT::value; v < 3; ++v)
177 {
178 SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
179 pBuffer += 4;
180 }
181
182 // check for constant source overrides
183 if (IsSwizzledT::value)
184 {
185 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
186 if (mask)
187 {
188 DWORD comp;
189 while (_BitScanForward(&comp, mask))
190 {
191 mask &= ~(1 << comp);
192
193 float constantValue = 0.0f;
194 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
195 {
196 case SWR_CONSTANT_SOURCE_CONST_0000:
197 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
198 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
199 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
200 break;
201 case SWR_CONSTANT_SOURCE_PRIM_ID:
202 constantValue = *(float*)&primId;
203 break;
204 }
205
206 // apply constant value to all 3 vertices
207 for (uint32_t v = 0; v < 3; ++v)
208 {
209 pAttribStart[comp + v * 4] = constantValue;
210 }
211 }
212 }
213 }
214 }
215 }
216
217 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
218
219 struct ProcessAttributesChooser
220 {
221 typedef PFN_PROCESS_ATTRIBUTES FuncType;
222
223 template <typename... ArgsB>
224 static FuncType GetFunc()
225 {
226 return ProcessAttributes<ArgsB...>;
227 }
228 };
229
230 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
231 {
232 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
233 }
234
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Processes enabled user clip distances. Loads the active clip
237 /// distances from the PA, sets up barycentric equations, and
238 /// stores the results to the output buffer
239 /// @param pa - Primitive Assembly state
240 /// @param primIndex - primitive index to process
241 /// @param clipDistMask - mask of enabled clip distances
242 /// @param pUserClipBuffer - buffer to store results
243 template<uint32_t NumVerts>
244 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
245 {
246 DWORD clipDist;
247 uint32_t clipDistMask = state.clipDistanceMask;
248 while (_BitScanForward(&clipDist, clipDistMask))
249 {
250 clipDistMask &= ~(1 << clipDist);
251 uint32_t clipSlot = clipDist >> 2;
252 uint32_t clipComp = clipDist & 0x3;
253 uint32_t clipAttribSlot = clipSlot == 0 ?
254 state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
255
256 simd4scalar primClipDist[3];
257 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
258
259 for (uint32_t e = 0; e < NumVerts; ++e)
260 {
261 OSALIGNSIMD(float) aVertClipDist[4];
262 SIMD128::store_ps(aVertClipDist, primClipDist[e]);
263 *(pUserClipBuffer++) = aVertClipDist[clipComp];
264 };
265 }
266 }
267
268 INLINE
269 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
270 {
271 vTranspose3x8(dst, src0, src1, src2);
272 }
273
274 INLINE
275 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
276 {
277 vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
278 }
279
280
281 #if KNOB_ENABLE_EARLY_RAST
282
// Early-rasterizer tile dimensions, derived from the tile shifts
#define ER_SIMD_TILE_X_DIM (1 << (ER_SIMD_TILE_X_SHIFT))
#define ER_SIMD_TILE_Y_DIM (1 << (ER_SIMD_TILE_Y_SHIFT))


/// Primary template is intentionally empty; each supported SIMD type
/// provides InitShiftCntrl() through an explicit specialization.
template <class SIMD_T>
struct EarlyRastHelper
{
};
291
292 template<>
293 struct EarlyRastHelper<SIMD256>
294 {
295 static SIMD256::Integer InitShiftCntrl()
296 {
297 return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
298 }
299 };
300
#if USE_SIMD16_FRONTEND
/// SIMD512 specialization: supplies the sixteen shift counts (16..31) that
/// EarlyRasterizer feeds to sllv_epi32.
template<>
struct EarlyRastHelper<SIMD512>
{
    static SIMD512::Integer InitShiftCntrl()
    {
        const SIMD512::Integer vShiftCounts =
            SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
        return vShiftCounts;
    }
};

#endif
312 //////////////////////////////////////////////////////////////////////////
313 /// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
314 /// (ER tile) can be rasterized as early as in binner to check if
315 /// they cover any pixels. If not - the triangles can be
316 /// culled in binner.
317 ///
318 /// @param er_bbox - coordinates of ER tile for each triangle
319 /// @param vAi - A coefficients of triangle edges
320 /// @param vBi - B coefficients of triangle edges
321 /// @param vXi - X coordinates of triangle vertices
322 /// @param vYi - Y coordinates of triangle vertices
323 /// @param frontWindingTris - mask indicating CCW/CW triangles
324 /// @param triMask - mask for valid SIMD lanes (triangles)
325 /// @param oneTileMask - defines triangles for ER to work on
326 /// (tris that fit into ER tile)
327 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
328 uint32_t SIMDCALL EarlyRasterizer(
329 SIMDBBOX_T<SIMD_T> &er_bbox,
330 typename SIMD_T::Integer (&vAi)[3],
331 typename SIMD_T::Integer (&vBi)[3],
332 typename SIMD_T::Integer (&vXi)[3],
333 typename SIMD_T::Integer (&vYi)[3],
334 uint32_t cwTrisMask,
335 uint32_t triMask,
336 uint32_t oneTileMask)
337 {
338 // step to pixel center of top-left pixel of the triangle bbox
339 typename SIMD_T::Integer vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
340 vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
341
342 typename SIMD_T::Integer vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
343 vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
344
345 // negate A and B for CW tris
346 typename SIMD_T::Integer vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
347 typename SIMD_T::Integer vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
348 typename SIMD_T::Integer vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
349 typename SIMD_T::Integer vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
350 typename SIMD_T::Integer vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
351 typename SIMD_T::Integer vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
352
353 RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);
354
355 typename SIMD_T::Integer vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
356 typename SIMD_T::Integer vCwTris = SIMD_T::set1_epi32(cwTrisMask);
357 typename SIMD_T::Integer vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
358
359 vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
360 vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
361 vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
362 vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
363 vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
364 vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
365
366 // evaluate edge equations at top-left pixel
367 typename SIMD_T::Integer vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
368 typename SIMD_T::Integer vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
369 typename SIMD_T::Integer vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
370
371 typename SIMD_T::Integer vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
372 typename SIMD_T::Integer vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
373 typename SIMD_T::Integer vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
374
375 typename SIMD_T::Integer vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
376 typename SIMD_T::Integer vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
377 typename SIMD_T::Integer vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
378
379 typename SIMD_T::Integer vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
380 typename SIMD_T::Integer vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
381 typename SIMD_T::Integer vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
382
383 typename SIMD_T::Integer vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
384 typename SIMD_T::Integer vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
385 typename SIMD_T::Integer vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
386
387 vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
388 vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
389 vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
390
391 // top left rule
392 typename SIMD_T::Integer vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
393 typename SIMD_T::Integer vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
394 typename SIMD_T::Integer vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
395
396 // vA < 0
397 vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
398 vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
399 vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
400
401 // vA == 0 && vB < 0
402 typename SIMD_T::Integer vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
403 typename SIMD_T::Integer vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
404 typename SIMD_T::Integer vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
405
406 vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
407 vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
408 vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);
409
410 vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
411 vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
412 vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
413
414
415 #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
416 // Go down
417 // coverage pixel 0
418 typename SIMD_T::Integer vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
419 vMask0 = SIMD_T::and_si(vMask0, vEdge2);
420
421 // coverage pixel 1
422 typename SIMD_T::Integer vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
423 typename SIMD_T::Integer vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
424 typename SIMD_T::Integer vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
425 typename SIMD_T::Integer vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
426 vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
427
428 // coverage pixel 2
429 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
430 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
431 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
432 typename SIMD_T::Integer vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
433 vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
434
435 // coverage pixel 3
436 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
437 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
438 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
439 typename SIMD_T::Integer vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
440 vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
441
442 // One step to the right and then up
443
444 // coverage pixel 4
445 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
446 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
447 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
448 typename SIMD_T::Integer vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
449 vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
450
451 // coverage pixel 5
452 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
453 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
454 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
455 typename SIMD_T::Integer vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
456 vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
457
458 // coverage pixel 6
459 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
460 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
461 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
462 typename SIMD_T::Integer vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
463 vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
464
465 // coverage pixel 7
466 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
467 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
468 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
469 typename SIMD_T::Integer vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
470 vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
471
472 typename SIMD_T::Integer vLit1 = SIMD_T::or_si(vMask0, vMask1);
473 vLit1 = SIMD_T::or_si(vLit1, vMask2);
474 vLit1 = SIMD_T::or_si(vLit1, vMask3);
475 vLit1 = SIMD_T::or_si(vLit1, vMask4);
476 vLit1 = SIMD_T::or_si(vLit1, vMask5);
477 vLit1 = SIMD_T::or_si(vLit1, vMask6);
478 vLit1 = SIMD_T::or_si(vLit1, vMask7);
479
480 // Step to the right and go down again
481
482 // coverage pixel 0
483 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
484 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
485 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
486 vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N);
487 vMask0 = SIMD_T::and_si(vMask0, vEdge2N);
488
489 // coverage pixel 1
490 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
491 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
492 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
493 vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
494 vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
495
496 // coverage pixel 2
497 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
498 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
499 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
500 vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
501 vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
502
503 // coverage pixel 3
504 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
505 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
506 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
507 vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
508 vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
509
510 // And for the last time - to the right and up
511
512 // coverage pixel 4
513 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
514 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
515 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
516 vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
517 vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
518
519 // coverage pixel 5
520 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
521 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
522 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
523 vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
524 vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
525
526 // coverage pixel 6
527 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
528 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
529 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
530 vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
531 vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
532
533 // coverage pixel 7
534 vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
535 vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
536 vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
537 vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
538 vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
539
540 typename SIMD_T::Integer vLit2 = SIMD_T::or_si(vMask0, vMask1);
541 vLit2 = SIMD_T::or_si(vLit2, vMask2);
542 vLit2 = SIMD_T::or_si(vLit2, vMask3);
543 vLit2 = SIMD_T::or_si(vLit2, vMask4);
544 vLit2 = SIMD_T::or_si(vLit2, vMask5);
545 vLit2 = SIMD_T::or_si(vLit2, vMask6);
546 vLit2 = SIMD_T::or_si(vLit2, vMask7);
547
548 typename SIMD_T::Integer vLit = SIMD_T::or_si(vLit1, vLit2);
549
550 #else
551 // Generic algorithm sweeping in row by row order
552 typename SIMD_T::Integer vRowMask[ER_SIMD_TILE_Y_DIM];
553
554 typename SIMD_T::Integer vEdge0N = vEdge0;
555 typename SIMD_T::Integer vEdge1N = vEdge1;
556 typename SIMD_T::Integer vEdge2N = vEdge2;
557
558 for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
559 {
560 // Store edge values at the beginning of the row
561 typename SIMD_T::Integer vRowEdge0 = vEdge0N;
562 typename SIMD_T::Integer vRowEdge1 = vEdge1N;
563 typename SIMD_T::Integer vRowEdge2 = vEdge2N;
564
565 typename SIMD_T::Integer vColMask[ER_SIMD_TILE_X_DIM];
566
567 for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
568 {
569 vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
570 vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);
571
572 vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
573 vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
574 vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
575 }
576 vRowMask[row] = vColMask[0];
577 for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
578 {
579 vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);
580 }
581 // Restore values and go to the next row
582 vEdge0N = vRowEdge0;
583 vEdge1N = vRowEdge1;
584 vEdge2N = vRowEdge2;
585
586 vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
587 vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
588 vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
589 }
590
591 // compress all masks
592 typename SIMD_T::Integer vLit = vRowMask[0];
593 for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
594 {
595 vLit = SIMD_T::or_si(vLit, vRowMask[row]);
596 }
597
598 #endif
599 // Check which triangles has any pixel lit
600 uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
601 uint32_t maskUnlit = ~maskLit & oneTileMask;
602
603 uint32_t oldTriMask = triMask;
604 triMask &= ~maskUnlit;
605
606 if (triMask ^ oldTriMask)
607 {
608 RDTSC_EVENT(FEEarlyRastExit, _mm_popcnt_u32(triMask & oneTileMask), 0);
609 }
610 return triMask;
611 }
612
613 #endif // Early rasterizer
614
615 //////////////////////////////////////////////////////////////////////////
616 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
617 /// culling, viewport transform, etc.
618 /// @param pDC - pointer to draw context.
619 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
621 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
622 /// @param primID - Primitive ID for each triangle.
623 /// @param viewportIdx - viewport array index for each triangle.
624 /// @tparam CT - ConservativeRastFETraits
625 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
626 void SIMDCALL BinTrianglesImpl(
627 DRAW_CONTEXT *pDC,
628 PA_STATE &pa,
629 uint32_t workerId,
630 typename SIMD_T::Vec4 tri[3],
631 uint32_t triMask,
632 typename SIMD_T::Integer const &primID,
633 typename SIMD_T::Integer const &viewportIdx,
634 typename SIMD_T::Integer const &rtIdx)
635 {
636 const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
637
638 RDTSC_BEGIN(FEBinTriangles, pDC->drawId);
639
640 const API_STATE& state = GetApiState(pDC);
641 const SWR_RASTSTATE& rastState = state.rastState;
642 const SWR_FRONTEND_STATE& feState = state.frontendState;
643
644 MacroTileMgr *pTileMgr = pDC->pTileMgr;
645
646 typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
647 typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
648 typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
649
650 if (feState.vpTransformDisable)
651 {
652 // RHW is passed in directly when VP transform is disabled
653 vRecipW0 = tri[0].v[3];
654 vRecipW1 = tri[1].v[3];
655 vRecipW2 = tri[2].v[3];
656 }
657 else
658 {
659 // Perspective divide
660 vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
661 vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
662 vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
663
664 tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
665 tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
666 tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
667
668 tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
669 tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
670 tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
671
672 tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
673 tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
674 tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
675
676 // Viewport transform to screen space coords
677 if (pa.viewportArrayActive)
678 {
679 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
680 }
681 else
682 {
683 viewportTransform<3>(tri, state.vpMatrices);
684 }
685 }
686
687 // Adjust for pixel center location
688 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
689
690 tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
691 tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
692
693 tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
694 tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
695
696 tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
697 tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
698
699 // Set vXi, vYi to required fixed point precision
700 typename SIMD_T::Integer vXi[3], vYi[3];
701 FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
702
703 // triangle setup
704 typename SIMD_T::Integer vAi[3], vBi[3];
705 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
706
707 // determinant
708 typename SIMD_T::Integer vDet[2];
709 calcDeterminantIntVertical(vAi, vBi, vDet);
710
711 // cull zero area
712 uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
713 uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
714
715 uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
716
717 // don't cull degenerate triangles if we're conservatively rasterizing
718 uint32_t origTriMask = triMask;
719 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
720 {
721 triMask &= ~cullZeroAreaMask;
722 }
723
724 // determine front winding tris
725 // CW +det
726 // CCW det < 0;
727 // 0 area triangles are marked as backfacing regardless of winding order,
728 // which is required behavior for conservative rast and wireframe rendering
729 uint32_t frontWindingTris;
730 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
731 {
732 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
733 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
734 }
735 else
736 {
737 maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
738 maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
739 }
740 frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
741
742 // cull
743 uint32_t cullTris;
744 switch ((SWR_CULLMODE)rastState.cullMode)
745 {
746 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
747 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
748 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
749 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
750 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
751 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
752 }
753
754 triMask &= ~cullTris;
755
756 if (origTriMask ^ triMask)
757 {
758 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
759 }
760
761 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
762 // compute per tri backface
763 uint32_t frontFaceMask = frontWindingTris;
764 uint32_t *pPrimID = (uint32_t *)&primID;
765 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
766 DWORD triIndex = 0;
767
768 uint32_t edgeEnable;
769 PFN_WORK_FUNC pfnWork;
770 if (CT::IsConservativeT::value)
771 {
772 // determine which edges of the degenerate tri, if any, are valid to rasterize.
773 // used to call the appropriate templated rasterizer function
774 if (cullZeroAreaMask > 0)
775 {
776 // e0 = v1-v0
777 const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
778 const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
779
780 uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
781
782 // e1 = v2-v1
783 const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
784 const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
785
786 uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
787
788 // e2 = v0-v2
789 // if v0 == v1 & v1 == v2, v0 == v2
790 uint32_t e2Mask = e0Mask & e1Mask;
791 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
792
793 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
794 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
795 e0Mask = pdep_u32(e0Mask, 0x00249249);
796
797 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
798 e1Mask = pdep_u32(e1Mask, 0x00492492);
799
800 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
801 e2Mask = pdep_u32(e2Mask, 0x00924924);
802
803 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
804 }
805 else
806 {
807 edgeEnable = 0x00FFFFFF;
808 }
809 }
810 else
811 {
812 // degenerate triangles won't be sent to rasterizer; just enable all edges
813 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
814 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
815 }
816
817 SIMDBBOX_T<SIMD_T> bbox;
818
819 if (!triMask)
820 {
821 goto endBinTriangles;
822 }
823
824 // Calc bounding box of triangles
825 calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
826
827 // determine if triangle falls between pixel centers and discard
828 // only discard for non-MSAA case and when conservative rast is disabled
829 // (xmin + 127) & ~255
830 // (xmax + 128) & ~255
831 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
832 (!CT::IsConservativeT::value))
833 {
834 origTriMask = triMask;
835
836 int cullCenterMask;
837
838 {
839 typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
840 xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
841 typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
842 xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
843
844 typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
845
846 typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
847 ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
848 typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
849 ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
850
851 typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
852
853 vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
854 cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
855 }
856
857 triMask &= ~cullCenterMask;
858
859 if (origTriMask ^ triMask)
860 {
861 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
862 }
863 }
864
865 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
866 // Gather the AOS effective scissor rects based on the per-prim VP index.
867 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
868 {
869 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
870 if (pa.viewportArrayActive)
871
872 {
873 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
874 }
875 else // broadcast fast path for non-VPAI case.
876 {
877 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
878 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
879 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
880 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
881 }
882
883 // Make triangle bbox inclusive
884 bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
885 bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
886
887 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
888 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
889 bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
890 bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
891 }
892
893 if (CT::IsConservativeT::value)
894 {
895 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
896 // some area. Bump the xmax/ymax edges out
897
898 typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
899 bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
900
901 typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
902 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
903 }
904
905 // Cull tris completely outside scissor
906 {
907 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
908 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
909 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
910 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
911 triMask = triMask & ~maskOutsideScissor;
912 }
913
914 #if KNOB_ENABLE_EARLY_RAST
915 if (rastState.sampleCount == SWR_MULTISAMPLE_1X && !CT::IsConservativeT::value)
916 {
917 // Try early rasterization - culling small triangles which do not cover any pixels
918
919 // convert to ER tiles
920 SIMDBBOX_T<SIMD_T> er_bbox;
921
922 er_bbox.xmin = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
923 er_bbox.xmax = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
924 er_bbox.ymin = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
925 er_bbox.ymax = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
926
927 typename SIMD_T::Integer vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
928 typename SIMD_T::Integer vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
929
930 // Take only triangles that fit into ER tile
931 uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
932
933 if (oneTileMask)
934 {
935 // determine CW tris (det > 0)
936 uint32_t maskCwLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
937 uint32_t maskCwHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
938 uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2));
939
940 // Try early rasterization
941 triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
942
943 if (!triMask)
944 {
945 RDTSC_END(FEBinTriangles, 1);
946 return;
947 }
948 }
949
950 }
951 #endif
952
953 endBinTriangles:
954
955
956 // Send surviving triangles to the line or point binner based on fill mode
957 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
958 {
959 // Simple non-conformant wireframe mode, useful for debugging
960 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
961 typename SIMD_T::Vec4 line[2];
962 typename SIMD_T::Float recipW[2];
963
964 line[0] = tri[0];
965 line[1] = tri[1];
966 recipW[0] = vRecipW0;
967 recipW[1] = vRecipW1;
968
969 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
970
971 line[0] = tri[1];
972 line[1] = tri[2];
973 recipW[0] = vRecipW1;
974 recipW[1] = vRecipW2;
975
976 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
977
978 line[0] = tri[2];
979 line[1] = tri[0];
980 recipW[0] = vRecipW2;
981 recipW[1] = vRecipW0;
982
983 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
984
985 RDTSC_END(FEBinTriangles, 1);
986 return;
987 }
988 else if (rastState.fillMode == SWR_FILLMODE_POINT)
989 {
990 // Bin 3 points
991 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
992 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
993 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
994
995 RDTSC_END(FEBinTriangles, 1);
996 return;
997 }
998
999 // Convert triangle bbox to macrotile units.
1000 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1001 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1002 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1003 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1004
1005 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1006
1007 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1008 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1009 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1010 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1011
1012 // transpose verts needed for backend
1013 /// @todo modify BE to take non-transformed verts
1014 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1015 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1016 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1017 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1018
1019 TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1020 TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1021 TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1022 TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1023
1024 // scan remaining valid triangles and bin each separately
1025 while (_BitScanForward(&triIndex, triMask))
1026 {
1027 uint32_t linkageCount = state.backendState.numAttributes;
1028 uint32_t numScalarAttribs = linkageCount * 4;
1029
1030 BE_WORK work;
1031 work.type = DRAW;
1032
1033 bool isDegenerate;
1034 if (CT::IsConservativeT::value)
1035 {
1036 // only rasterize valid edges if we have a degenerate primitive
1037 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1038 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1039 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
1040
1041 // Degenerate triangles are required to be constant interpolated
1042 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1043 }
1044 else
1045 {
1046 isDegenerate = false;
1047 work.pfnWork = pfnWork;
1048 }
1049
1050 // Select attribute processor
1051 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
1052 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1053
1054 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1055
1056 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1057 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1058 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1059
1060 auto pArena = pDC->pArena;
1061 SWR_ASSERT(pArena != nullptr);
1062
1063 // store active attribs
1064 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1065 desc.pAttribs = pAttribs;
1066 desc.numAttribs = linkageCount;
1067 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1068
1069 // store triangle vertex data
1070 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1071
1072 SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
1073 SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
1074 SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
1075 SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1076
1077 // store user clip distances
1078 if (state.backendState.clipDistanceMask)
1079 {
1080 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1081 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1082 ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1083 }
1084
1085 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1086 {
1087 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1088 {
1089 #if KNOB_ENABLE_TOSS_POINTS
1090 if (!KNOB_TOSS_SETUP_TRIS)
1091 #endif
1092 {
1093 pTileMgr->enqueue(x, y, &work);
1094 }
1095 }
1096 }
1097
1098 triMask &= ~(1 << triIndex);
1099 }
1100
1101 RDTSC_END(FEBinTriangles, 1);
1102 }
1103
1104 template <typename CT>
1105 void BinTriangles(
1106 DRAW_CONTEXT *pDC,
1107 PA_STATE &pa,
1108 uint32_t workerId,
1109 simdvector tri[3],
1110 uint32_t triMask,
1111 simdscalari const &primID,
1112 simdscalari const &viewportIdx,
1113 simdscalari const &rtIdx)
1114 {
1115 BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1116 }
1117
1118 #if USE_SIMD16_FRONTEND
1119 template <typename CT>
1120 void SIMDCALL BinTriangles_simd16(
1121 DRAW_CONTEXT *pDC,
1122 PA_STATE &pa,
1123 uint32_t workerId,
1124 simd16vector tri[3],
1125 uint32_t triMask,
1126 simd16scalari const &primID,
1127 simd16scalari const &viewportIdx,
1128 simd16scalari const &rtIdx)
1129 {
1130 BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1131 }
1132
1133 #endif
1134 struct FEBinTrianglesChooser
1135 {
1136 typedef PFN_PROCESS_PRIMS FuncType;
1137
1138 template <typename... ArgsB>
1139 static FuncType GetFunc()
1140 {
1141 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1142 }
1143 };
1144
1145 // Selector for correct templated BinTrinagles function
1146 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1147 {
1148 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1149 }
1150
1151 #if USE_SIMD16_FRONTEND
1152 struct FEBinTrianglesChooser_simd16
1153 {
1154 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1155
1156 template <typename... ArgsB>
1157 static FuncType GetFunc()
1158 {
1159 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1160 }
1161 };
1162
1163 // Selector for correct templated BinTrinagles function
1164 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1165 {
1166 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1167 }
1168
1169 #endif
1170
1171 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1172 void BinPostSetupPointsImpl(
1173 DRAW_CONTEXT *pDC,
1174 PA_STATE &pa,
1175 uint32_t workerId,
1176 typename SIMD_T::Vec4 prim[],
1177 uint32_t primMask,
1178 typename SIMD_T::Integer const &primID,
1179 typename SIMD_T::Integer const &viewportIdx,
1180 typename SIMD_T::Integer const &rtIdx)
1181 {
1182 RDTSC_BEGIN(FEBinPoints, pDC->drawId);
1183
1184 typename SIMD_T::Vec4 &primVerts = prim[0];
1185
1186 const API_STATE& state = GetApiState(pDC);
1187 const SWR_RASTSTATE& rastState = state.rastState;
1188 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1189
1190 // Select attribute processor
1191 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1192 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1193
1194 // convert to fixed point
1195 typename SIMD_T::Integer vXi, vYi;
1196
1197 vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
1198 vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
1199
1200 if (CanUseSimplePoints(pDC))
1201 {
1202 // adjust for ymin-xmin rule
1203 vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
1204 vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
1205
1206 // cull points off the ymin-xmin edge of the viewport
1207 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
1208 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1209
1210 // compute macro tile coordinates
1211 typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1212 typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1213
1214 OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1215
1216 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
1217 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
1218
1219 // compute raster tile coordinates
1220 typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1221 typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1222
1223 // compute raster tile relative x,y for coverage mask
1224 typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1225 typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1226
1227 typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1228 typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1229
1230 OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1231 OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1232
1233 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
1234 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
1235
1236 OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1237 OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1238
1239 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
1240 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
1241
1242 OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1243 SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1244
1245 // store render target array index
1246 const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1247
1248 uint32_t *pPrimID = (uint32_t *)&primID;
1249 DWORD primIndex = 0;
1250
1251 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1252
1253 // scan remaining valid triangles and bin each separately
1254 while (_BitScanForward(&primIndex, primMask))
1255 {
1256 uint32_t linkageCount = backendState.numAttributes;
1257 uint32_t numScalarAttribs = linkageCount * 4;
1258
1259 BE_WORK work;
1260 work.type = DRAW;
1261
1262 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1263
1264 // points are always front facing
1265 desc.triFlags.frontFacing = 1;
1266 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1267 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1268
1269 work.pfnWork = RasterizeSimplePoint;
1270
1271 auto pArena = pDC->pArena;
1272 SWR_ASSERT(pArena != nullptr);
1273
1274 // store attributes
1275 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1276 desc.pAttribs = pAttribs;
1277 desc.numAttribs = linkageCount;
1278
1279 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1280
1281 // store raster tile aligned x, y, perspective correct z
1282 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1283 desc.pTriBuffer = pTriBuffer;
1284 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1285 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1286 *pTriBuffer = aZ[primIndex];
1287
1288 uint32_t tX = aTileRelativeX[primIndex];
1289 uint32_t tY = aTileRelativeY[primIndex];
1290
1291 // pack the relative x,y into the coverageMask, the rasterizer will
1292 // generate the true coverage mask from it
1293 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1294
1295 // bin it
1296 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1297 #if KNOB_ENABLE_TOSS_POINTS
1298 if (!KNOB_TOSS_SETUP_TRIS)
1299 #endif
1300 {
1301 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1302 }
1303
1304 primMask &= ~(1 << primIndex);
1305 }
1306 }
1307 else
1308 {
1309 // non simple points need to be potentially binned to multiple macro tiles
1310 typename SIMD_T::Float vPointSize;
1311
1312 if (rastState.pointParam)
1313 {
1314 typename SIMD_T::Vec4 size[3];
1315 pa.Assemble(VERTEX_SGV_SLOT, size);
1316 vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1317 }
1318 else
1319 {
1320 vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1321 }
1322
1323 // bloat point to bbox
1324 SIMDBBOX_T<SIMD_T> bbox;
1325
1326 bbox.xmin = bbox.xmax = vXi;
1327 bbox.ymin = bbox.ymax = vYi;
1328
1329 typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1330 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1331
1332 bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1333 bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1334 bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1335 bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1336
1337 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1338 // Gather the AOS effective scissor rects based on the per-prim VP index.
1339 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1340 {
1341 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1342
1343 if (pa.viewportArrayActive)
1344 {
1345 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1346 }
1347 else // broadcast fast path for non-VPAI case.
1348 {
1349 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1350 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1351 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1352 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1353 }
1354
1355 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1356 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1357 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1358 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1359 }
1360
1361 // Cull bloated points completely outside scissor
1362 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1363 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1364 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1365 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1366 primMask = primMask & ~maskOutsideScissor;
1367
1368 // Convert bbox to macrotile units.
1369 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1370 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1371 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1372 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1373
1374 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1375
1376 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1377 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1378 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1379 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1380
1381 // store render target array index
1382 const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1383
1384 OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1385 SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1386
1387 uint32_t *pPrimID = (uint32_t *)&primID;
1388
1389 OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1390 OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1391 OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1392
1393 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1394 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1395 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1396
1397 // scan remaining valid prims and bin each separately
1398 const SWR_BACKEND_STATE& backendState = state.backendState;
1399 DWORD primIndex;
1400 while (_BitScanForward(&primIndex, primMask))
1401 {
1402 uint32_t linkageCount = backendState.numAttributes;
1403 uint32_t numScalarAttribs = linkageCount * 4;
1404
1405 BE_WORK work;
1406 work.type = DRAW;
1407
1408 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1409
1410 desc.triFlags.frontFacing = 1;
1411 desc.triFlags.pointSize = aPointSize[primIndex];
1412 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1413 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1414
1415 work.pfnWork = RasterizeTriPoint;
1416
1417 auto pArena = pDC->pArena;
1418 SWR_ASSERT(pArena != nullptr);
1419
1420 // store active attribs
1421 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1422 desc.numAttribs = linkageCount;
1423 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1424
1425 // store point vertex data
1426 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1427 desc.pTriBuffer = pTriBuffer;
1428 *pTriBuffer++ = aPrimVertsX[primIndex];
1429 *pTriBuffer++ = aPrimVertsY[primIndex];
1430 *pTriBuffer = aPrimVertsZ[primIndex];
1431
1432 // store user clip distances
1433 if (backendState.clipDistanceMask)
1434 {
1435 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1436 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1437 float dists[8];
1438 float one = 1.0f;
1439 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1440 for (uint32_t i = 0; i < numClipDist; i++) {
1441 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1442 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1443 desc.pUserClipBuffer[3 * i + 2] = dists[i];
1444 }
1445 }
1446
1447 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1448 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1449 {
1450 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1451 {
1452 #if KNOB_ENABLE_TOSS_POINTS
1453 if (!KNOB_TOSS_SETUP_TRIS)
1454 #endif
1455 {
1456 pTileMgr->enqueue(x, y, &work);
1457 }
1458 }
1459 }
1460
1461 primMask &= ~(1 << primIndex);
1462 }
1463 }
1464
1465 RDTSC_END(FEBinPoints, 1);
1466 }
1467
1468 //////////////////////////////////////////////////////////////////////////
1469 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1470 /// @param pDC - pointer to draw context.
1471 /// @param pa - The primitive assembly object.
1472 /// @param workerId - thread's worker id. Even thread has a unique id.
1473 /// @param tri - Contains point position data for SIMDs worth of points.
1474 /// @param primID - Primitive ID for each point.
1475 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1476 void BinPointsImpl(
1477 DRAW_CONTEXT *pDC,
1478 PA_STATE &pa,
1479 uint32_t workerId,
1480 typename SIMD_T::Vec4 prim[3],
1481 uint32_t primMask,
1482 typename SIMD_T::Integer const &primID,
1483 typename SIMD_T::Integer const &viewportIdx,
1484 typename SIMD_T::Integer const &rtIdx)
1485 {
1486 const API_STATE& state = GetApiState(pDC);
1487 const SWR_FRONTEND_STATE& feState = state.frontendState;
1488 const SWR_RASTSTATE& rastState = state.rastState;
1489
1490 if (!feState.vpTransformDisable)
1491 {
1492 // perspective divide
1493 typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1494
1495 prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1496 prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1497 prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1498
1499 // viewport transform to screen coords
1500 if (pa.viewportArrayActive)
1501 {
1502 viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1503 }
1504 else
1505 {
1506 viewportTransform<1>(prim, state.vpMatrices);
1507 }
1508 }
1509
1510 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1511
1512 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1513 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1514
1515 BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1516 pDC,
1517 pa,
1518 workerId,
1519 prim,
1520 primMask,
1521 primID,
1522 viewportIdx,
1523 rtIdx);
1524 }
1525
1526 void BinPoints(
1527 DRAW_CONTEXT *pDC,
1528 PA_STATE &pa,
1529 uint32_t workerId,
1530 simdvector prim[3],
1531 uint32_t primMask,
1532 simdscalari const &primID,
1533 simdscalari const &viewportIdx,
1534 simdscalari const &rtIdx)
1535 {
1536 BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1537 pDC,
1538 pa,
1539 workerId,
1540 prim,
1541 primMask,
1542 primID,
1543 viewportIdx,
1544 rtIdx);
1545 }
1546
1547 #if USE_SIMD16_FRONTEND
1548 void SIMDCALL BinPoints_simd16(
1549 DRAW_CONTEXT *pDC,
1550 PA_STATE &pa,
1551 uint32_t workerId,
1552 simd16vector prim[3],
1553 uint32_t primMask,
1554 simd16scalari const &primID,
1555 simd16scalari const &viewportIdx,
1556 simd16scalari const & rtIdx)
1557 {
1558 BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1559 pDC,
1560 pa,
1561 workerId,
1562 prim,
1563 primMask,
1564 primID,
1565 viewportIdx,
1566 rtIdx);
1567 }
1568
1569 #endif
1570 //////////////////////////////////////////////////////////////////////////
1571 /// @brief Bin SIMD lines to the backend.
1572 /// @param pDC - pointer to draw context.
1573 /// @param pa - The primitive assembly object.
1574 /// @param workerId - thread's worker id. Even thread has a unique id.
1575 /// @param tri - Contains line position data for SIMDs worth of points.
1576 /// @param primID - Primitive ID for each line.
1577 /// @param viewportIdx - Viewport Array Index for each line.
1578 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1579 void BinPostSetupLinesImpl(
1580 DRAW_CONTEXT *pDC,
1581 PA_STATE &pa,
1582 uint32_t workerId,
1583 typename SIMD_T::Vec4 prim[],
1584 typename SIMD_T::Float recipW[],
1585 uint32_t primMask,
1586 typename SIMD_T::Integer const &primID,
1587 typename SIMD_T::Integer const &viewportIdx,
1588 typename SIMD_T::Integer const &rtIdx)
1589 {
1590 const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1591
1592 RDTSC_BEGIN(FEBinLines, pDC->drawId);
1593
1594 const API_STATE &state = GetApiState(pDC);
1595 const SWR_RASTSTATE &rastState = state.rastState;
1596
1597 // Select attribute processor
1598 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1599 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1600
1601 typename SIMD_T::Float &vRecipW0 = recipW[0];
1602 typename SIMD_T::Float &vRecipW1 = recipW[1];
1603
1604 // convert to fixed point
1605 typename SIMD_T::Integer vXi[2], vYi[2];
1606
1607 vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1608 vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1609 vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1610 vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1611
1612 // compute x-major vs y-major mask
1613 typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1614 typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1615 typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1616 uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1617
1618 // cull zero-length lines
1619 typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1620 vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1621
1622 primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1623
1624 uint32_t *pPrimID = (uint32_t *)&primID;
1625 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1626
1627 // Calc bounding box of lines
1628 SIMDBBOX_T<SIMD_T> bbox;
1629 bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1630 bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1631 bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1632 bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1633
1634 // bloat bbox by line width along minor axis
1635 typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1636 typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1637
1638 SIMDBBOX_T<SIMD_T> bloatBox;
1639
1640 bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1641 bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1642 bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1643 bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1644
1645 bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1646 bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1647 bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1648 bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1649
1650 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1651 {
1652 typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1653
1654 if (pa.viewportArrayActive)
1655 {
1656 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1657 }
1658 else // broadcast fast path for non-VPAI case.
1659 {
1660 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1661 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1662 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1663 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1664 }
1665
1666 bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1667 bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1668 bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1669 bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1670 }
1671
1672 // Cull prims completely outside scissor
1673 {
1674 typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1675 typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1676 typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1677 uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1678 primMask = primMask & ~maskOutsideScissor;
1679 }
1680
1681 // transpose verts needed for backend
1682 /// @todo modify BE to take non-transformed verts
1683 OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1684 OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1685 OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1686 OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1687
1688 if (!primMask)
1689 {
1690 goto endBinLines;
1691 }
1692
1693 // Convert triangle bbox to macrotile units.
1694 bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1695 bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1696 bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1697 bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1698
1699 OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1700
1701 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
1702 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
1703 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
1704 SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1705
1706 TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1707 TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1708 TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1709 TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
1710
1711 // scan remaining valid prims and bin each separately
1712 DWORD primIndex;
1713 while (_BitScanForward(&primIndex, primMask))
1714 {
1715 uint32_t linkageCount = state.backendState.numAttributes;
1716 uint32_t numScalarAttribs = linkageCount * 4;
1717
1718 BE_WORK work;
1719 work.type = DRAW;
1720
1721 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1722
1723 desc.triFlags.frontFacing = 1;
1724 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1725 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1726 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1727
1728 work.pfnWork = RasterizeLine;
1729
1730 auto pArena = pDC->pArena;
1731 SWR_ASSERT(pArena != nullptr);
1732
1733 // store active attribs
1734 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1735 desc.numAttribs = linkageCount;
1736 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1737
1738 // store line vertex data
1739 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1740
1741 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1742 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1743 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1744 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1745
1746 // store user clip distances
1747 if (state.backendState.clipDistanceMask)
1748 {
1749 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1750 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1751 ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1752 }
1753
1754 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1755 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1756 {
1757 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1758 {
1759 #if KNOB_ENABLE_TOSS_POINTS
1760 if (!KNOB_TOSS_SETUP_TRIS)
1761 #endif
1762 {
1763 pTileMgr->enqueue(x, y, &work);
1764 }
1765 }
1766 }
1767
1768 primMask &= ~(1 << primIndex);
1769 }
1770
1771 endBinLines:
1772
1773 RDTSC_END(FEBinLines, 1);
1774 }
1775
1776 //////////////////////////////////////////////////////////////////////////
1777 /// @brief Bin SIMD lines to the backend.
1778 /// @param pDC - pointer to draw context.
1779 /// @param pa - The primitive assembly object.
1780 /// @param workerId - thread's worker id. Even thread has a unique id.
1781 /// @param tri - Contains line position data for SIMDs worth of points.
1782 /// @param primID - Primitive ID for each line.
1783 /// @param viewportIdx - Viewport Array Index for each line.
1784 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1785 void SIMDCALL BinLinesImpl(
1786 DRAW_CONTEXT *pDC,
1787 PA_STATE &pa,
1788 uint32_t workerId,
1789 typename SIMD_T::Vec4 prim[3],
1790 uint32_t primMask,
1791 typename SIMD_T::Integer const &primID,
1792 typename SIMD_T::Integer const &viewportIdx,
1793 typename SIMD_T::Integer const & rtIdx)
1794 {
1795 const API_STATE& state = GetApiState(pDC);
1796 const SWR_RASTSTATE& rastState = state.rastState;
1797 const SWR_FRONTEND_STATE& feState = state.frontendState;
1798
1799 typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1800
1801 if (!feState.vpTransformDisable)
1802 {
1803 // perspective divide
1804 vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1805 vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1806
1807 prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1808 prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1809
1810 prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1811 prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1812
1813 prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1814 prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1815
1816 // viewport transform to screen coords
1817 if (pa.viewportArrayActive)
1818 {
1819 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1820 }
1821 else
1822 {
1823 viewportTransform<2>(prim, state.vpMatrices);
1824 }
1825 }
1826
1827 // adjust for pixel center location
1828 typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1829
1830 prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1831 prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1832
1833 prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1834 prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1835
1836 BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1837 pDC,
1838 pa,
1839 workerId,
1840 prim,
1841 vRecipW,
1842 primMask,
1843 primID,
1844 viewportIdx,
1845 rtIdx);
1846 }
1847
1848 void BinLines(
1849 DRAW_CONTEXT *pDC,
1850 PA_STATE &pa,
1851 uint32_t workerId,
1852 simdvector prim[],
1853 uint32_t primMask,
1854 simdscalari const &primID,
1855 simdscalari const &viewportIdx,
1856 simdscalari const &rtIdx)
1857 {
1858 BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1859 }
1860
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Bin lines to the backend via the SIMD512 instantiation of
///        BinLinesImpl (KNOB_SIMD16_WIDTH lanes).
/// @details Only compiled when USE_SIMD16_FRONTEND is set. Pure forwarding
///          wrapper: every argument is passed through to BinLinesImpl
///          unchanged.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id.
/// @param prim - Line endpoint position data.
/// @param primMask - Mask of lines to bin.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
/// @param rtIdx - Forwarded unchanged to BinLinesImpl.
void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC,
                              PA_STATE &pa,
                              uint32_t workerId,
                              simd16vector prim[3],
                              uint32_t primMask,
                              simd16scalari const &primID,
                              simd16scalari const &viewportIdx,
                              simd16scalari const &rtIdx)
{
    BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(
        pDC,
        pa,
        workerId,
        prim,
        primMask,
        primID,
        viewportIdx,
        rtIdx);
}

#endif