swr/rast: move construction of const above goto
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "context.h"
30 #include "frontend.h"
31 #include "conservativeRast.h"
32 #include "pa.h"
33 #include "rasterizer.h"
34 #include "rdtsc_core.h"
35 #include "tilemgr.h"
36
37 // Function Prototype
38 void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
39
40 #if USE_SIMD16_FRONTEND
41 void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
42 #endif
43
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Offsets added to post-viewport vertex positions based on
46 /// raster state.
47 static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
48 {
49 _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
50 _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
51 };
52
53 #if USE_SIMD16_FRONTEND
54 static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] =
55 {
56 _simd16_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
57 _simd16_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
58 };
59
60 #endif
61 //////////////////////////////////////////////////////////////////////////
62 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
63 /// Point precision from FP32.
64 template <typename PT = FixedPointTraits<Fixed_16_8>>
65 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
66 {
67 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
68 return _simd_cvtps_epi32(vFixed);
69 }
70
71 #if USE_SIMD16_FRONTEND
72 template <typename PT = FixedPointTraits<Fixed_16_8>>
73 INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn)
74 {
75 simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value));
76 return _simd16_cvtps_epi32(vFixed);
77 }
78
79 #endif
80 //////////////////////////////////////////////////////////////////////////
81 /// @brief Helper function to set the X,Y coords of a triangle to the
82 /// requested Fixed Point precision from FP32.
83 /// @param tri: simdvector[3] of FP triangle verts
84 /// @param vXi: fixed point X coords of tri verts
85 /// @param vYi: fixed point Y coords of tri verts
86 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3])
87 {
88 vXi[0] = fpToFixedPointVertical(tri[0].x);
89 vYi[0] = fpToFixedPointVertical(tri[0].y);
90 vXi[1] = fpToFixedPointVertical(tri[1].x);
91 vYi[1] = fpToFixedPointVertical(tri[1].y);
92 vXi[2] = fpToFixedPointVertical(tri[2].x);
93 vYi[2] = fpToFixedPointVertical(tri[2].y);
94 }
95
96 #if USE_SIMD16_FRONTEND
97 INLINE static void FPToFixedPoint(const simd16vector * const tri, simd16scalari(&vXi)[3], simd16scalari(&vYi)[3])
98 {
99 vXi[0] = fpToFixedPointVertical(tri[0].x);
100 vYi[0] = fpToFixedPointVertical(tri[0].y);
101 vXi[1] = fpToFixedPointVertical(tri[1].x);
102 vYi[1] = fpToFixedPointVertical(tri[1].y);
103 vXi[2] = fpToFixedPointVertical(tri[2].x);
104 vYi[2] = fpToFixedPointVertical(tri[2].y);
105 }
106
107 #endif
108 //////////////////////////////////////////////////////////////////////////
109 /// @brief Calculate bounding box for current triangle
110 /// @tparam CT: ConservativeRastFETraits type
111 /// @param vX: fixed point X position for triangle verts
112 /// @param vY: fixed point Y position for triangle verts
113 /// @param bbox: fixed point bbox
114 /// *Note*: expects vX, vY to be in the correct precision for the type
115 /// of rasterization. This avoids unnecessary FP->fixed conversions.
116 template <typename CT>
117 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
118 {
119 simdscalari vMinX = vX[0];
120 vMinX = _simd_min_epi32(vMinX, vX[1]);
121 vMinX = _simd_min_epi32(vMinX, vX[2]);
122
123 simdscalari vMaxX = vX[0];
124 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
125 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
126
127 simdscalari vMinY = vY[0];
128 vMinY = _simd_min_epi32(vMinY, vY[1]);
129 vMinY = _simd_min_epi32(vMinY, vY[2]);
130
131 simdscalari vMaxY = vY[0];
132 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
133 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
134
135 bbox.xmin = vMinX;
136 bbox.xmax = vMaxX;
137 bbox.ymin = vMinY;
138 bbox.ymax = vMaxY;
139 }
140
141 #if USE_SIMD16_FRONTEND
142 template <typename CT>
143 INLINE void calcBoundingBoxIntVertical(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox)
144 {
145 simd16scalari vMinX = vX[0];
146
147 vMinX = _simd16_min_epi32(vMinX, vX[1]);
148 vMinX = _simd16_min_epi32(vMinX, vX[2]);
149
150 simd16scalari vMaxX = vX[0];
151
152 vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
153 vMaxX = _simd16_max_epi32(vMaxX, vX[2]);
154
155 simd16scalari vMinY = vY[0];
156
157 vMinY = _simd16_min_epi32(vMinY, vY[1]);
158 vMinY = _simd16_min_epi32(vMinY, vY[2]);
159
160 simd16scalari vMaxY = vY[0];
161
162 vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
163 vMaxY = _simd16_max_epi32(vMaxY, vY[2]);
164
165 bbox.xmin = vMinX;
166 bbox.xmax = vMaxX;
167 bbox.ymin = vMinY;
168 bbox.ymax = vMaxY;
169 }
170
171 #endif
172 //////////////////////////////////////////////////////////////////////////
173 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
174 /// Offsets BBox for conservative rast
175 template <>
176 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
177 {
178 // FE conservative rast traits
179 typedef FEConservativeRastT CT;
180
181 simdscalari vMinX = vX[0];
182 vMinX = _simd_min_epi32(vMinX, vX[1]);
183 vMinX = _simd_min_epi32(vMinX, vX[2]);
184
185 simdscalari vMaxX = vX[0];
186 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
187 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
188
189 simdscalari vMinY = vY[0];
190 vMinY = _simd_min_epi32(vMinY, vY[1]);
191 vMinY = _simd_min_epi32(vMinY, vY[2]);
192
193 simdscalari vMaxY = vY[0];
194 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
195 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
196
197 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
198 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
199 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
200 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
201 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
202 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
203 }
204
205 //////////////////////////////////////////////////////////////////////////
206 /// @brief Processes attributes for the backend based on linkage mask and
207 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
208 /// @param pDC - Draw context
209 /// @param pa - Primitive Assembly state
210 /// @param linkageMask - Specifies which VS outputs are routed to PS.
211 /// @param pLinkageMap - maps VS attribute slot to PS slot
212 /// @param triIndex - Triangle to process attributes for
213 /// @param pBuffer - Output result
214 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
215 INLINE void ProcessAttributes(
216 DRAW_CONTEXT *pDC,
217 PA_STATE&pa,
218 uint32_t triIndex,
219 uint32_t primId,
220 float *pBuffer)
221 {
222 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
223 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
224 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
225 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
226 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
227 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
228
229 static const float constTable[3][4] = {
230 { 0.0f, 0.0f, 0.0f, 0.0f },
231 { 0.0f, 0.0f, 0.0f, 1.0f },
232 { 1.0f, 1.0f, 1.0f, 1.0f }
233 };
234
235 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
236 {
237 uint32_t inputSlot;
238 if (IsSwizzledT::value)
239 {
240 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
241 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
242
243 }
244 else
245 {
246 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
247 }
248
249 __m128 attrib[3]; // triangle attribs (always 4 wide)
250 float* pAttribStart = pBuffer;
251
252 if (HasConstantInterpT::value || IsDegenerate::value)
253 {
254 if (_bittest(&constantInterpMask, i))
255 {
256 uint32_t vid;
257 uint32_t adjustedTriIndex;
258 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
259 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
260 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
261 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
262 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
263
264 switch (topo) {
265 case TOP_QUAD_LIST:
266 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
267 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
268 break;
269 case TOP_QUAD_STRIP:
270 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
271 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
272 break;
273 case TOP_TRIANGLE_STRIP:
274 adjustedTriIndex = triIndex;
275 vid = (triIndex & 1)
276 ? tristripProvokingVertex[provokingVertex]
277 : provokingVertex;
278 break;
279 default:
280 adjustedTriIndex = triIndex;
281 vid = provokingVertex;
282 break;
283 }
284
285 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
286
287 for (uint32_t i = 0; i < NumVertsT::value; ++i)
288 {
289 _mm_store_ps(pBuffer, attrib[vid]);
290 pBuffer += 4;
291 }
292 }
293 else
294 {
295 pa.AssembleSingle(inputSlot, triIndex, attrib);
296
297 for (uint32_t i = 0; i < NumVertsT::value; ++i)
298 {
299 _mm_store_ps(pBuffer, attrib[i]);
300 pBuffer += 4;
301 }
302 }
303 }
304 else
305 {
306 pa.AssembleSingle(inputSlot, triIndex, attrib);
307
308 for (uint32_t i = 0; i < NumVertsT::value; ++i)
309 {
310 _mm_store_ps(pBuffer, attrib[i]);
311 pBuffer += 4;
312 }
313 }
314
315 // pad out the attrib buffer to 3 verts to ensure the triangle
316 // interpolation code in the pixel shader works correctly for the
317 // 3 topologies - point, line, tri. This effectively zeros out the
318 // effect of the missing vertices in the triangle interpolation.
319 for (uint32_t v = NumVertsT::value; v < 3; ++v)
320 {
321 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
322 pBuffer += 4;
323 }
324
325 // check for constant source overrides
326 if (IsSwizzledT::value)
327 {
328 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
329 if (mask)
330 {
331 DWORD comp;
332 while (_BitScanForward(&comp, mask))
333 {
334 mask &= ~(1 << comp);
335
336 float constantValue = 0.0f;
337 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
338 {
339 case SWR_CONSTANT_SOURCE_CONST_0000:
340 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
341 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
342 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
343 break;
344 case SWR_CONSTANT_SOURCE_PRIM_ID:
345 constantValue = *(float*)&primId;
346 break;
347 }
348
349 // apply constant value to all 3 vertices
350 for (uint32_t v = 0; v < 3; ++v)
351 {
352 pAttribStart[comp + v * 4] = constantValue;
353 }
354 }
355 }
356 }
357 }
358 }
359
360 //////////////////////////////////////////////////////////////////////////
361 /// @brief Gather scissor rect data based on per-prim viewport indices.
362 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
363 /// @param pViewportIndex - array of per-primitive vewport indexes.
364 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
365 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
366 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
367 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
368 //
369 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
370 template<size_t SimdWidth>
371 struct GatherScissors
372 {
373 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
374 simdscalari &scisXmin, simdscalari &scisYmin,
375 simdscalari &scisXmax, simdscalari &scisYmax)
376 {
377 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
378 }
379 };
380
381 template<>
382 struct GatherScissors<8>
383 {
384 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
385 simdscalari &scisXmin, simdscalari &scisYmin,
386 simdscalari &scisXmax, simdscalari &scisYmax)
387 {
388 scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
389 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
390 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
391 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
392 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
393 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
394 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
395 pScissorsInFixedPoint[pViewportIndex[7]].xmin);
396 scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
397 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
398 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
399 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
400 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
401 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
402 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
403 pScissorsInFixedPoint[pViewportIndex[7]].ymin);
404 scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
405 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
406 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
407 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
408 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
409 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
410 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
411 pScissorsInFixedPoint[pViewportIndex[7]].xmax);
412 scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
413 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
414 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
415 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
416 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
417 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
418 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
419 pScissorsInFixedPoint[pViewportIndex[7]].ymax);
420 }
421 };
422
423 #if USE_SIMD16_FRONTEND
424 template<size_t SimdWidth>
425 struct GatherScissors_simd16
426 {
427 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
428 simd16scalari &scisXmin, simd16scalari &scisYmin,
429 simd16scalari &scisXmax, simd16scalari &scisYmax)
430 {
431 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
432 }
433 };
434
435 template<>
436 struct GatherScissors_simd16<16>
437 {
438 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
439 simd16scalari &scisXmin, simd16scalari &scisYmin,
440 simd16scalari &scisXmax, simd16scalari &scisYmax)
441 {
442 scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
443 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
444 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
445 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
446 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
447 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
448 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
449 pScissorsInFixedPoint[pViewportIndex[7]].xmin,
450 pScissorsInFixedPoint[pViewportIndex[8]].xmin,
451 pScissorsInFixedPoint[pViewportIndex[9]].xmin,
452 pScissorsInFixedPoint[pViewportIndex[10]].xmin,
453 pScissorsInFixedPoint[pViewportIndex[11]].xmin,
454 pScissorsInFixedPoint[pViewportIndex[12]].xmin,
455 pScissorsInFixedPoint[pViewportIndex[13]].xmin,
456 pScissorsInFixedPoint[pViewportIndex[14]].xmin,
457 pScissorsInFixedPoint[pViewportIndex[15]].xmin);
458
459 scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
460 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
461 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
462 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
463 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
464 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
465 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
466 pScissorsInFixedPoint[pViewportIndex[7]].ymin,
467 pScissorsInFixedPoint[pViewportIndex[8]].ymin,
468 pScissorsInFixedPoint[pViewportIndex[9]].ymin,
469 pScissorsInFixedPoint[pViewportIndex[10]].ymin,
470 pScissorsInFixedPoint[pViewportIndex[11]].ymin,
471 pScissorsInFixedPoint[pViewportIndex[12]].ymin,
472 pScissorsInFixedPoint[pViewportIndex[13]].ymin,
473 pScissorsInFixedPoint[pViewportIndex[14]].ymin,
474 pScissorsInFixedPoint[pViewportIndex[15]].ymin);
475
476 scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
477 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
478 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
479 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
480 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
481 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
482 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
483 pScissorsInFixedPoint[pViewportIndex[7]].xmax,
484 pScissorsInFixedPoint[pViewportIndex[8]].xmax,
485 pScissorsInFixedPoint[pViewportIndex[9]].xmax,
486 pScissorsInFixedPoint[pViewportIndex[10]].xmax,
487 pScissorsInFixedPoint[pViewportIndex[11]].xmax,
488 pScissorsInFixedPoint[pViewportIndex[12]].xmax,
489 pScissorsInFixedPoint[pViewportIndex[13]].xmax,
490 pScissorsInFixedPoint[pViewportIndex[14]].xmax,
491 pScissorsInFixedPoint[pViewportIndex[15]].xmax);
492
493 scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
494 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
495 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
496 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
497 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
498 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
499 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
500 pScissorsInFixedPoint[pViewportIndex[7]].ymax,
501 pScissorsInFixedPoint[pViewportIndex[8]].ymax,
502 pScissorsInFixedPoint[pViewportIndex[9]].ymax,
503 pScissorsInFixedPoint[pViewportIndex[10]].ymax,
504 pScissorsInFixedPoint[pViewportIndex[11]].ymax,
505 pScissorsInFixedPoint[pViewportIndex[12]].ymax,
506 pScissorsInFixedPoint[pViewportIndex[13]].ymax,
507 pScissorsInFixedPoint[pViewportIndex[14]].ymax,
508 pScissorsInFixedPoint[pViewportIndex[15]].ymax);
509 }
510 };
511
512 #endif
513 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
514
515 struct ProcessAttributesChooser
516 {
517 typedef PFN_PROCESS_ATTRIBUTES FuncType;
518
519 template <typename... ArgsB>
520 static FuncType GetFunc()
521 {
522 return ProcessAttributes<ArgsB...>;
523 }
524 };
525
526 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
527 {
528 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
529 }
530
531 //////////////////////////////////////////////////////////////////////////
532 /// @brief Processes enabled user clip distances. Loads the active clip
533 /// distances from the PA, sets up barycentric equations, and
534 /// stores the results to the output buffer
535 /// @param pa - Primitive Assembly state
536 /// @param primIndex - primitive index to process
537 /// @param clipDistMask - mask of enabled clip distances
538 /// @param pUserClipBuffer - buffer to store results
539 template<uint32_t NumVerts>
540 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer)
541 {
542 DWORD clipDist;
543 while (_BitScanForward(&clipDist, clipDistMask))
544 {
545 clipDistMask &= ~(1 << clipDist);
546 uint32_t clipSlot = clipDist >> 2;
547 uint32_t clipComp = clipDist & 0x3;
548 uint32_t clipAttribSlot = clipSlot == 0 ?
549 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
550
551 __m128 primClipDist[3];
552 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
553
554 float vertClipDist[NumVerts];
555 for (uint32_t e = 0; e < NumVerts; ++e)
556 {
557 OSALIGNSIMD(float) aVertClipDist[4];
558 _mm_store_ps(aVertClipDist, primClipDist[e]);
559 vertClipDist[e] = aVertClipDist[clipComp];
560 };
561
562 // setup plane equations for barycentric interpolation in the backend
563 float baryCoeff[NumVerts];
564 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
565 for (uint32_t e = 0; e < NumVerts - 1; ++e)
566 {
567 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
568 }
569 baryCoeff[NumVerts - 1] = last;
570
571 for (uint32_t e = 0; e < NumVerts; ++e)
572 {
573 *(pUserClipBuffer++) = baryCoeff[e];
574 }
575 }
576 }
577
578 //////////////////////////////////////////////////////////////////////////
579 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
580 /// culling, viewport transform, etc.
581 /// @param pDC - pointer to draw context.
582 /// @param pa - The primitive assembly object.
583 /// @param workerId - thread's worker id. Every thread has a unique id.
584 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
585 /// @param primID - Primitive ID for each triangle.
586 /// @param viewportIdx - viewport array index for each triangle.
587 /// @tparam CT - ConservativeRastFETraits
588 template <typename CT>
589 void BinTriangles(
590 DRAW_CONTEXT *pDC,
591 PA_STATE& pa,
592 uint32_t workerId,
593 simdvector tri[3],
594 uint32_t triMask,
595 simdscalari primID,
596 simdscalari viewportIdx)
597 {
598 SWR_CONTEXT *pContext = pDC->pContext;
599
600 AR_BEGIN(FEBinTriangles, pDC->drawId);
601
602 const API_STATE& state = GetApiState(pDC);
603 const SWR_RASTSTATE& rastState = state.rastState;
604 const SWR_FRONTEND_STATE& feState = state.frontendState;
605 const SWR_GS_STATE& gsState = state.gsState;
606 MacroTileMgr *pTileMgr = pDC->pTileMgr;
607
608 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
609 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
610 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
611
612 if (feState.vpTransformDisable)
613 {
614 // RHW is passed in directly when VP transform is disabled
615 vRecipW0 = tri[0].v[3];
616 vRecipW1 = tri[1].v[3];
617 vRecipW2 = tri[2].v[3];
618 }
619 else
620 {
621 // Perspective divide
622 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
623 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
624 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
625
626 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
627 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
628 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
629
630 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
631 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
632 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
633
634 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
635 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
636 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
637
638 // Viewport transform to screen space coords
639 if (state.gsState.emitsViewportArrayIndex)
640 {
641 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
642 }
643 else
644 {
645 viewportTransform<3>(tri, state.vpMatrices);
646 }
647 }
648
649 // Adjust for pixel center location
650 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
651 tri[0].x = _simd_add_ps(tri[0].x, offset);
652 tri[0].y = _simd_add_ps(tri[0].y, offset);
653
654 tri[1].x = _simd_add_ps(tri[1].x, offset);
655 tri[1].y = _simd_add_ps(tri[1].y, offset);
656
657 tri[2].x = _simd_add_ps(tri[2].x, offset);
658 tri[2].y = _simd_add_ps(tri[2].y, offset);
659
660 simdscalari vXi[3], vYi[3];
661 // Set vXi, vYi to required fixed point precision
662 FPToFixedPoint(tri, vXi, vYi);
663
664 // triangle setup
665 simdscalari vAi[3], vBi[3];
666 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
667
668 // determinant
669 simdscalari vDet[2];
670 calcDeterminantIntVertical(vAi, vBi, vDet);
671
672 // cull zero area
673 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
674 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
675
676 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
677
678 uint32_t origTriMask = triMask;
679 // don't cull degenerate triangles if we're conservatively rasterizing
680 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
681 {
682 triMask &= ~cullZeroAreaMask;
683 }
684
685 // determine front winding tris
686 // CW +det
687 // CCW det < 0;
688 // 0 area triangles are marked as backfacing regardless of winding order,
689 // which is required behavior for conservative rast and wireframe rendering
690 uint32_t frontWindingTris;
691 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
692 {
693 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
694 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
695 }
696 else
697 {
698 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
699 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
700 }
701 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
702
703 // cull
704 uint32_t cullTris;
705 switch ((SWR_CULLMODE)rastState.cullMode)
706 {
707 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
708 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
709 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
710 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
711 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
712 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
713 }
714
715 triMask &= ~cullTris;
716
717 if (origTriMask ^ triMask)
718 {
719 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
720 }
721
722 // Simple non-conformant wireframe mode, useful for debugging
723 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
724 {
725 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
726 simdvector line[2];
727 simdscalar recipW[2];
728 line[0] = tri[0];
729 line[1] = tri[1];
730 recipW[0] = vRecipW0;
731 recipW[1] = vRecipW1;
732 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
733
734 line[0] = tri[1];
735 line[1] = tri[2];
736 recipW[0] = vRecipW1;
737 recipW[1] = vRecipW2;
738 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
739
740 line[0] = tri[2];
741 line[1] = tri[0];
742 recipW[0] = vRecipW2;
743 recipW[1] = vRecipW0;
744 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
745
746 AR_END(FEBinTriangles, 1);
747 return;
748 }
749
750 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
751 // compute per tri backface
752 uint32_t frontFaceMask = frontWindingTris;
753 uint32_t *pPrimID = (uint32_t *)&primID;
754 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
755 DWORD triIndex = 0;
756 uint32_t edgeEnable;
757 PFN_WORK_FUNC pfnWork;
758 if (CT::IsConservativeT::value)
759 {
760 // determine which edges of the degenerate tri, if any, are valid to rasterize.
761 // used to call the appropriate templated rasterizer function
762 if (cullZeroAreaMask > 0)
763 {
764 // e0 = v1-v0
765 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
766 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
767 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
768
769 // e1 = v2-v1
770 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
771 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
772 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
773
774 // e2 = v0-v2
775 // if v0 == v1 & v1 == v2, v0 == v2
776 uint32_t e2Mask = e0Mask & e1Mask;
777 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
778
779 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
780 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
781 e0Mask = pdep_u32(e0Mask, 0x00249249);
782 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
783 e1Mask = pdep_u32(e1Mask, 0x00492492);
784 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
785 e2Mask = pdep_u32(e2Mask, 0x00924924);
786
787 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
788 }
789 else
790 {
791 edgeEnable = 0x00FFFFFF;
792 }
793 }
794 else
795 {
796 // degenerate triangles won't be sent to rasterizer; just enable all edges
797 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
798 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
799 }
800
801 if (!triMask)
802 {
803 goto endBinTriangles;
804 }
805
806 // Calc bounding box of triangles
807 simdBBox bbox;
808 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
809
810 // determine if triangle falls between pixel centers and discard
811 // only discard for non-MSAA case and when conservative rast is disabled
812 // (xmin + 127) & ~255
813 // (xmax + 128) & ~255
814 if((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
815 (!CT::IsConservativeT::value))
816 {
817 origTriMask = triMask;
818
819 int cullCenterMask;
820 {
821 simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
822 xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
823 simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
824 xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
825
826 simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
827
828 simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
829 ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
830 simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
831 ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
832
833 simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
834 vMaskV = _simd_or_si(vMaskH, vMaskV);
835 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
836 }
837
838 triMask &= ~cullCenterMask;
839
840 if (origTriMask ^ triMask)
841 {
842 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
843 }
844 }
845
846 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
847 // Gather the AOS effective scissor rects based on the per-prim VP index.
848 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
849 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
850 if (state.gsState.emitsViewportArrayIndex)
851 {
852 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
853 scisXmin, scisYmin, scisXmax, scisYmax);
854 }
855 else // broadcast fast path for non-VPAI case.
856 {
857 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
858 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
859 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
860 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
861 }
862
863 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
864 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
865 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
866 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
867
868 if (CT::IsConservativeT::value)
869 {
870 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
871 // some area. Bump the xmax/ymax edges out
872 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
873 bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
874 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
875 bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
876 }
877
878 // Cull tris completely outside scissor
879 {
880 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
881 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
882 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
883 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
884 triMask = triMask & ~maskOutsideScissor;
885 }
886
887 if (!triMask)
888 {
889 goto endBinTriangles;
890 }
891
892 // Convert triangle bbox to macrotile units.
893 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
894 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
895 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
896 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
897
898 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
899 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
900 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
901 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
902 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
903
904 // transpose verts needed for backend
905 /// @todo modify BE to take non-transformed verts
906 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
907 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
908 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
909 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
910 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
911
912 // store render target array index
913 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
914 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
915 {
916 simdvector vRtai[3];
917 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
918 simdscalari vRtaii;
919 vRtaii = _simd_castps_si(vRtai[0].x);
920 _simd_store_si((simdscalari*)aRTAI, vRtaii);
921 }
922 else
923 {
924 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
925 }
926
927 endBinTriangles:
928
929 // scan remaining valid triangles and bin each separately
930 while (_BitScanForward(&triIndex, triMask))
931 {
932 uint32_t linkageCount = state.backendState.numAttributes;
933 uint32_t numScalarAttribs = linkageCount * 4;
934
935 BE_WORK work;
936 work.type = DRAW;
937
938 bool isDegenerate;
939 if (CT::IsConservativeT::value)
940 {
941 // only rasterize valid edges if we have a degenerate primitive
942 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
943 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
944 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
945
946 // Degenerate triangles are required to be constant interpolated
947 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
948 }
949 else
950 {
951 isDegenerate = false;
952 work.pfnWork = pfnWork;
953 }
954
955 // Select attribute processor
956 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
957 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
958
959 TRIANGLE_WORK_DESC &desc = work.desc.tri;
960
961 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
962 desc.triFlags.primID = pPrimID[triIndex];
963 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
964 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
965
966 auto pArena = pDC->pArena;
967 SWR_ASSERT(pArena != nullptr);
968
969 // store active attribs
970 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
971 desc.pAttribs = pAttribs;
972 desc.numAttribs = linkageCount;
973 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
974
975 // store triangle vertex data
976 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
977
978 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
979 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
980 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
981 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
982
983 // store user clip distances
984 if (rastState.clipDistanceMask)
985 {
986 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
987 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
988 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
989 }
990
991 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
992 {
993 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
994 {
995 #if KNOB_ENABLE_TOSS_POINTS
996 if (!KNOB_TOSS_SETUP_TRIS)
997 #endif
998 {
999 pTileMgr->enqueue(x, y, &work);
1000 }
1001 }
1002 }
1003 triMask &= ~(1 << triIndex);
1004 }
1005
1006 AR_END(FEBinTriangles, 1);
1007 }
1008
1009 #if USE_SIMD16_FRONTEND
1010 template <typename CT>
1011 void SIMDAPI BinTriangles_simd16(
1012 DRAW_CONTEXT *pDC,
1013 PA_STATE& pa,
1014 uint32_t workerId,
1015 simd16vector tri[3],
1016 uint32_t triMask,
1017 simd16scalari primID,
1018 simd16scalari viewportIdx)
1019 {
1020 SWR_CONTEXT *pContext = pDC->pContext;
1021
1022 AR_BEGIN(FEBinTriangles, pDC->drawId);
1023
1024 const API_STATE& state = GetApiState(pDC);
1025 const SWR_RASTSTATE& rastState = state.rastState;
1026 const SWR_FRONTEND_STATE& feState = state.frontendState;
1027 const SWR_GS_STATE& gsState = state.gsState;
1028
1029 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1030
1031 simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
1032 simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
1033 simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);
1034
1035 if (feState.vpTransformDisable)
1036 {
1037 // RHW is passed in directly when VP transform is disabled
1038 vRecipW0 = tri[0].v[3];
1039 vRecipW1 = tri[1].v[3];
1040 vRecipW2 = tri[2].v[3];
1041 }
1042 else
1043 {
1044 // Perspective divide
1045 vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w);
1046 vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w);
1047 vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w);
1048
1049 tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0);
1050 tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1);
1051 tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2);
1052
1053 tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0);
1054 tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1);
1055 tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2);
1056
1057 tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0);
1058 tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1);
1059 tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);
1060
1061 // Viewport transform to screen space coords
1062 if (state.gsState.emitsViewportArrayIndex)
1063 {
1064 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
1065 }
1066 else
1067 {
1068 viewportTransform<3>(tri, state.vpMatrices);
1069 }
1070 }
1071
1072 // Adjust for pixel center location
1073 const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
1074
1075 tri[0].x = _simd16_add_ps(tri[0].x, offset);
1076 tri[0].y = _simd16_add_ps(tri[0].y, offset);
1077
1078 tri[1].x = _simd16_add_ps(tri[1].x, offset);
1079 tri[1].y = _simd16_add_ps(tri[1].y, offset);
1080
1081 tri[2].x = _simd16_add_ps(tri[2].x, offset);
1082 tri[2].y = _simd16_add_ps(tri[2].y, offset);
1083
1084 simd16scalari vXi[3], vYi[3];
1085
1086 // Set vXi, vYi to required fixed point precision
1087 FPToFixedPoint(tri, vXi, vYi);
1088
1089 // triangle setup
1090 simd16scalari vAi[3], vBi[3];
1091 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1092
1093 // determinant
1094 simd16scalari vDet[2];
1095 calcDeterminantIntVertical(vAi, vBi, vDet);
1096
1097 // cull zero area
1098 uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si())));
1099 uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si())));
1100
1101 uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
1102
1103 // don't cull degenerate triangles if we're conservatively rasterizing
1104 uint32_t origTriMask = triMask;
1105 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
1106 {
1107 triMask &= ~cullZeroAreaMask;
1108 }
1109
1110 // determine front winding tris
1111 // CW +det
1112 // CCW det < 0;
1113 // 0 area triangles are marked as backfacing regardless of winding order,
1114 // which is required behavior for conservative rast and wireframe rendering
1115 uint32_t frontWindingTris;
1116 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1117 {
1118 maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si())));
1119 maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si())));
1120 }
1121 else
1122 {
1123 maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0])));
1124 maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1])));
1125 }
1126 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
1127
1128 // cull
1129 uint32_t cullTris;
1130 switch ((SWR_CULLMODE)rastState.cullMode)
1131 {
1132 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1133 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1134 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1135 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1136 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1137 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1138 }
1139
1140 triMask &= ~cullTris;
1141
1142 if (origTriMask ^ triMask)
1143 {
1144 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1145 }
1146
1147 // Simple non-conformant wireframe mode, useful for debugging
1148 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
1149 {
1150 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
1151 simd16vector line[2];
1152 simd16scalar recipW[2];
1153 line[0] = tri[0];
1154 line[1] = tri[1];
1155 recipW[0] = vRecipW0;
1156 recipW[1] = vRecipW1;
1157 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1158
1159 line[0] = tri[1];
1160 line[1] = tri[2];
1161 recipW[0] = vRecipW1;
1162 recipW[1] = vRecipW2;
1163 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1164
1165 line[0] = tri[2];
1166 line[1] = tri[0];
1167 recipW[0] = vRecipW2;
1168 recipW[1] = vRecipW0;
1169 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1170
1171 AR_END(FEBinTriangles, 1);
1172 return;
1173 }
1174
1175 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1176 // compute per tri backface
1177 uint32_t frontFaceMask = frontWindingTris;
1178 uint32_t *pPrimID = (uint32_t *)&primID;
1179 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1180 DWORD triIndex = 0;
1181
1182 uint32_t edgeEnable;
1183 PFN_WORK_FUNC pfnWork;
1184 if (CT::IsConservativeT::value)
1185 {
1186 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1187 // used to call the appropriate templated rasterizer function
1188 if (cullZeroAreaMask > 0)
1189 {
1190 // e0 = v1-v0
1191 const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]);
1192 const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]);
1193
1194 uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask)));
1195
1196 // e1 = v2-v1
1197 const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]);
1198 const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]);
1199
1200 uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask)));
1201
1202 // e2 = v0-v2
1203 // if v0 == v1 & v1 == v2, v0 == v2
1204 uint32_t e2Mask = e0Mask & e1Mask;
1205 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1206
1207 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1208 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1209 e0Mask = pdep_u32(e0Mask, 0x00249249);
1210
1211 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1212 e1Mask = pdep_u32(e1Mask, 0x00492492);
1213
1214 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1215 e2Mask = pdep_u32(e2Mask, 0x00924924);
1216
1217 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1218 }
1219 else
1220 {
1221 edgeEnable = 0x00FFFFFF;
1222 }
1223 }
1224 else
1225 {
1226 // degenerate triangles won't be sent to rasterizer; just enable all edges
1227 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1228 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
1229 }
1230
1231 if (!triMask)
1232 {
1233 goto endBinTriangles;
1234 }
1235
1236 // Calc bounding box of triangles
1237 simd16BBox bbox;
1238 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1239
1240 // determine if triangle falls between pixel centers and discard
1241 // only discard for non-MSAA case and when conservative rast is disabled
1242 // (xmin + 127) & ~255
1243 // (xmax + 128) & ~255
1244 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
1245 (!CT::IsConservativeT::value))
1246 {
1247 origTriMask = triMask;
1248
1249 int cullCenterMask;
1250
1251 {
1252 simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127));
1253 xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255));
1254 simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128));
1255 xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255));
1256
1257 simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax);
1258
1259 simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127));
1260 ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255));
1261 simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128));
1262 ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255));
1263
1264 simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax);
1265
1266 vMaskV = _simd16_or_si(vMaskH, vMaskV);
1267 cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV));
1268 }
1269
1270 triMask &= ~cullCenterMask;
1271
1272 if (origTriMask ^ triMask)
1273 {
1274 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1275 }
1276 }
1277
1278 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1279 // Gather the AOS effective scissor rects based on the per-prim VP index.
1280 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1281 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
1282
1283 if (state.gsState.emitsViewportArrayIndex)
1284 {
1285 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1286 scisXmin, scisYmin, scisXmax, scisYmax);
1287 }
1288 else // broadcast fast path for non-VPAI case.
1289 {
1290 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1291 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1292 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1293 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1294 }
1295
1296 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
1297 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
1298 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
1299 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
1300
1301 if (CT::IsConservativeT::value)
1302 {
1303 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1304 // some area. Bump the xmax/ymax edges out
1305 simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax);
1306 bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom);
1307 simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax);
1308 bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight);
1309 }
1310
1311 // Cull tris completely outside scissor
1312 {
1313 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
1314 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
1315 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
1316 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
1317 triMask = triMask & ~maskOutsideScissor;
1318 }
1319
1320 if (!triMask)
1321 {
1322 goto endBinTriangles;
1323 }
1324
1325 // Convert triangle bbox to macrotile units.
1326 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1327 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1328 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1329 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1330
1331 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
1332
1333 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
1334 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
1335 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
1336 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
1337
1338 // transpose verts needed for backend
1339 /// @todo modify BE to take non-transformed verts
1340 __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1341 __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1342 __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1343 __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1344
1345 vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
1346 vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
1347 vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0));
1348 vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0));
1349
1350 vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1));
1351 vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1));
1352 vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1));
1353 vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1));
1354
1355 // store render target array index
1356 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1357 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1358 {
1359 simd16vector vRtai[3];
1360 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
1361 simd16scalari vRtaii;
1362 vRtaii = _simd16_castps_si(vRtai[0].x);
1363 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1364 }
1365 else
1366 {
1367 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1368 }
1369
1370 endBinTriangles:
1371
1372
1373 // scan remaining valid triangles and bin each separately
1374 while (_BitScanForward(&triIndex, triMask))
1375 {
1376 uint32_t linkageCount = state.backendState.numAttributes;
1377 uint32_t numScalarAttribs = linkageCount * 4;
1378
1379 BE_WORK work;
1380 work.type = DRAW;
1381
1382 bool isDegenerate;
1383 if (CT::IsConservativeT::value)
1384 {
1385 // only rasterize valid edges if we have a degenerate primitive
1386 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1387 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1388 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
1389
1390 // Degenerate triangles are required to be constant interpolated
1391 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1392 }
1393 else
1394 {
1395 isDegenerate = false;
1396 work.pfnWork = pfnWork;
1397 }
1398
1399 // Select attribute processor
1400 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
1401 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1402
1403 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1404
1405 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1406 desc.triFlags.primID = pPrimID[triIndex];
1407 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1408 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1409
1410 auto pArena = pDC->pArena;
1411 SWR_ASSERT(pArena != nullptr);
1412
1413 // store active attribs
1414 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1415 desc.pAttribs = pAttribs;
1416 desc.numAttribs = linkageCount;
1417 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1418
1419 // store triangle vertex data
1420 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1421
1422 {
1423 const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
1424 const uint32_t j = triIndex & 7; // triIndex % KNOB_SIMD_WIDTH
1425
1426 _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
1427 _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
1428 _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
1429 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
1430 }
1431
1432 // store user clip distances
1433 if (rastState.clipDistanceMask)
1434 {
1435 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1436 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1437 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1438 }
1439
1440 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1441 {
1442 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1443 {
1444 #if KNOB_ENABLE_TOSS_POINTS
1445 if (!KNOB_TOSS_SETUP_TRIS)
1446 #endif
1447 {
1448 pTileMgr->enqueue(x, y, &work);
1449 }
1450 }
1451 }
1452
1453 triMask &= ~(1 << triIndex);
1454 }
1455
1456 AR_END(FEBinTriangles, 1);
1457 }
1458
1459 #endif
1460 struct FEBinTrianglesChooser
1461 {
1462 typedef PFN_PROCESS_PRIMS FuncType;
1463
1464 template <typename... ArgsB>
1465 static FuncType GetFunc()
1466 {
1467 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1468 }
1469 };
1470
1471 // Selector for correct templated BinTrinagles function
1472 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1473 {
1474 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1475 }
1476
1477 #if USE_SIMD16_FRONTEND
1478 struct FEBinTrianglesChooser_simd16
1479 {
1480 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1481
1482 template <typename... ArgsB>
1483 static FuncType GetFunc()
1484 {
1485 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1486 }
1487 };
1488
1489 // Selector for correct templated BinTrinagles function
1490 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1491 {
1492 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1493 }
1494
1495 #endif
1496
1497 //////////////////////////////////////////////////////////////////////////
1498 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1499 /// @param pDC - pointer to draw context.
1500 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points.
1503 /// @param primID - Primitive ID for each point.
1504 void BinPoints(
1505 DRAW_CONTEXT *pDC,
1506 PA_STATE& pa,
1507 uint32_t workerId,
1508 simdvector prim[3],
1509 uint32_t primMask,
1510 simdscalari primID,
1511 simdscalari viewportIdx)
1512 {
1513 SWR_CONTEXT *pContext = pDC->pContext;
1514
1515 AR_BEGIN(FEBinPoints, pDC->drawId);
1516
1517 simdvector& primVerts = prim[0];
1518
1519 const API_STATE& state = GetApiState(pDC);
1520 const SWR_FRONTEND_STATE& feState = state.frontendState;
1521 const SWR_GS_STATE& gsState = state.gsState;
1522 const SWR_RASTSTATE& rastState = state.rastState;
1523 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1524
1525 // Select attribute processor
1526 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1527 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1528
1529 if (!feState.vpTransformDisable)
1530 {
1531 // perspective divide
1532 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
1533 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
1534 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
1535 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
1536
1537 // viewport transform to screen coords
1538 if (state.gsState.emitsViewportArrayIndex)
1539 {
1540 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
1541 }
1542 else
1543 {
1544 viewportTransform<1>(&primVerts, state.vpMatrices);
1545 }
1546 }
1547
1548 // adjust for pixel center location
1549 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1550 primVerts.x = _simd_add_ps(primVerts.x, offset);
1551 primVerts.y = _simd_add_ps(primVerts.y, offset);
1552
1553 // convert to fixed point
1554 simdscalari vXi, vYi;
1555 vXi = fpToFixedPointVertical(primVerts.x);
1556 vYi = fpToFixedPointVertical(primVerts.y);
1557
1558 if (CanUseSimplePoints(pDC))
1559 {
1560 // adjust for ymin-xmin rule
1561 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
1562 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
1563
1564 // cull points off the ymin-xmin edge of the viewport
1565 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
1566 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
1567
1568 // compute macro tile coordinates
1569 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1570 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1571
1572 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
1573 _simd_store_si((simdscalari*)aMacroX, macroX);
1574 _simd_store_si((simdscalari*)aMacroY, macroY);
1575
1576 // compute raster tile coordinates
1577 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1578 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1579
1580 // compute raster tile relative x,y for coverage mask
1581 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
1582 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
1583
1584 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
1585 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
1586
1587 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
1588 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
1589 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
1590 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
1591
1592 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
1593 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
1594 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
1595 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
1596
1597 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
1598 _simd_store_ps((float*)aZ, primVerts.z);
1599
1600 // store render target array index
1601 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1602 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1603 {
1604 simdvector vRtai;
1605 pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
1606 simdscalari vRtaii = _simd_castps_si(vRtai.x);
1607 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1608 }
1609 else
1610 {
1611 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1612 }
1613
1614 uint32_t *pPrimID = (uint32_t *)&primID;
1615 DWORD primIndex = 0;
1616
1617 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1618
1619 // scan remaining valid triangles and bin each separately
1620 while (_BitScanForward(&primIndex, primMask))
1621 {
1622 uint32_t linkageCount = backendState.numAttributes;
1623 uint32_t numScalarAttribs = linkageCount * 4;
1624
1625 BE_WORK work;
1626 work.type = DRAW;
1627
1628 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1629
1630 // points are always front facing
1631 desc.triFlags.frontFacing = 1;
1632 desc.triFlags.primID = pPrimID[primIndex];
1633 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1634 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1635
1636 work.pfnWork = RasterizeSimplePoint;
1637
1638 auto pArena = pDC->pArena;
1639 SWR_ASSERT(pArena != nullptr);
1640
1641 // store attributes
1642 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1643 desc.pAttribs = pAttribs;
1644 desc.numAttribs = linkageCount;
1645
1646 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1647
1648 // store raster tile aligned x, y, perspective correct z
1649 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1650 desc.pTriBuffer = pTriBuffer;
1651 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1652 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1653 *pTriBuffer = aZ[primIndex];
1654
1655 uint32_t tX = aTileRelativeX[primIndex];
1656 uint32_t tY = aTileRelativeY[primIndex];
1657
1658 // pack the relative x,y into the coverageMask, the rasterizer will
1659 // generate the true coverage mask from it
1660 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1661
1662 // bin it
1663 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1664 #if KNOB_ENABLE_TOSS_POINTS
1665 if (!KNOB_TOSS_SETUP_TRIS)
1666 #endif
1667 {
1668 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1669 }
1670 primMask &= ~(1 << primIndex);
1671 }
1672 }
1673 else
1674 {
1675 // non simple points need to be potentially binned to multiple macro tiles
1676 simdscalar vPointSize;
1677 if (rastState.pointParam)
1678 {
1679 simdvector size[3];
1680 pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
1681 vPointSize = size[0].x;
1682 }
1683 else
1684 {
1685 vPointSize = _simd_set1_ps(rastState.pointSize);
1686 }
1687
1688 // bloat point to bbox
1689 simdBBox bbox;
1690 bbox.xmin = bbox.xmax = vXi;
1691 bbox.ymin = bbox.ymax = vYi;
1692
1693 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
1694 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1695 bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
1696 bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
1697 bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
1698 bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
1699
1700 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1701 // Gather the AOS effective scissor rects based on the per-prim VP index.
1702 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1703 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
1704 if (state.gsState.emitsViewportArrayIndex)
1705 {
1706 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1707 scisXmin, scisYmin, scisXmax, scisYmax);
1708 }
1709 else // broadcast fast path for non-VPAI case.
1710 {
1711 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1712 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1713 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1714 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1715 }
1716
1717 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
1718 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
1719 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
1720 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
1721
1722 // Cull bloated points completely outside scissor
1723 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
1724 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
1725 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1726 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1727 primMask = primMask & ~maskOutsideScissor;
1728
1729 // Convert bbox to macrotile units.
1730 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1731 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1732 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1733 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1734
1735 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1736 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
1737 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
1738 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
1739 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
1740
1741 // store render target array index
1742 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1743 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1744 {
1745 simdvector vRtai[2];
1746 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1747 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
1748 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1749 }
1750 else
1751 {
1752 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1753 }
1754
1755 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
1756 _simd_store_ps((float*)aPointSize, vPointSize);
1757
1758 uint32_t *pPrimID = (uint32_t *)&primID;
1759
1760 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
1761 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
1762 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
1763
1764 _simd_store_ps((float*)aPrimVertsX, primVerts.x);
1765 _simd_store_ps((float*)aPrimVertsY, primVerts.y);
1766 _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
1767
1768 // scan remaining valid prims and bin each separately
1769 const SWR_BACKEND_STATE& backendState = state.backendState;
1770 DWORD primIndex;
1771 while (_BitScanForward(&primIndex, primMask))
1772 {
1773 uint32_t linkageCount = backendState.numAttributes;
1774 uint32_t numScalarAttribs = linkageCount * 4;
1775
1776 BE_WORK work;
1777 work.type = DRAW;
1778
1779 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1780
1781 desc.triFlags.frontFacing = 1;
1782 desc.triFlags.primID = pPrimID[primIndex];
1783 desc.triFlags.pointSize = aPointSize[primIndex];
1784 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1785 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1786
1787 work.pfnWork = RasterizeTriPoint;
1788
1789 auto pArena = pDC->pArena;
1790 SWR_ASSERT(pArena != nullptr);
1791
1792 // store active attribs
1793 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1794 desc.numAttribs = linkageCount;
1795 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1796
1797 // store point vertex data
1798 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1799 desc.pTriBuffer = pTriBuffer;
1800 *pTriBuffer++ = aPrimVertsX[primIndex];
1801 *pTriBuffer++ = aPrimVertsY[primIndex];
1802 *pTriBuffer = aPrimVertsZ[primIndex];
1803
1804 // store user clip distances
1805 if (rastState.clipDistanceMask)
1806 {
1807 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1808 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1809 float dists[8];
1810 float one = 1.0f;
1811 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
1812 for (uint32_t i = 0; i < numClipDist; i++) {
1813 desc.pUserClipBuffer[3*i + 0] = 0.0f;
1814 desc.pUserClipBuffer[3*i + 1] = 0.0f;
1815 desc.pUserClipBuffer[3*i + 2] = dists[i];
1816 }
1817 }
1818
1819 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1820 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1821 {
1822 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1823 {
1824 #if KNOB_ENABLE_TOSS_POINTS
1825 if (!KNOB_TOSS_SETUP_TRIS)
1826 #endif
1827 {
1828 pTileMgr->enqueue(x, y, &work);
1829 }
1830 }
1831 }
1832
1833 primMask &= ~(1 << primIndex);
1834 }
1835 }
1836
1837 AR_END(FEBinPoints, 1);
1838 }
1839
1840 #if USE_SIMD16_FRONTEND
1841 void SIMDAPI BinPoints_simd16(
1842 DRAW_CONTEXT *pDC,
1843 PA_STATE& pa,
1844 uint32_t workerId,
1845 simd16vector prim[3],
1846 uint32_t primMask,
1847 simd16scalari primID,
1848 simd16scalari viewportIdx)
1849 {
1850 SWR_CONTEXT *pContext = pDC->pContext;
1851
1852 AR_BEGIN(FEBinPoints, pDC->drawId);
1853
1854 simd16vector& primVerts = prim[0];
1855
1856 const API_STATE& state = GetApiState(pDC);
1857 const SWR_FRONTEND_STATE& feState = state.frontendState;
1858 const SWR_GS_STATE& gsState = state.gsState;
1859 const SWR_RASTSTATE& rastState = state.rastState;
1860 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1861
1862 // Select attribute processor
1863 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1864 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1865
1866 if (!feState.vpTransformDisable)
1867 {
1868 // perspective divide
1869 simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w);
1870
1871 primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0);
1872 primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0);
1873 primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);
1874
1875 // viewport transform to screen coords
1876 if (state.gsState.emitsViewportArrayIndex)
1877 {
1878 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
1879 }
1880 else
1881 {
1882 viewportTransform<1>(&primVerts, state.vpMatrices);
1883 }
1884 }
1885
1886 const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
1887
1888 primVerts.x = _simd16_add_ps(primVerts.x, offset);
1889 primVerts.y = _simd16_add_ps(primVerts.y, offset);
1890
1891 // convert to fixed point
1892 simd16scalari vXi, vYi;
1893
1894 vXi = fpToFixedPointVertical(primVerts.x);
1895 vYi = fpToFixedPointVertical(primVerts.y);
1896
1897 if (CanUseSimplePoints(pDC))
1898 {
1899 // adjust for ymin-xmin rule
1900 vXi = _simd16_sub_epi32(vXi, _simd16_set1_epi32(1));
1901 vYi = _simd16_sub_epi32(vYi, _simd16_set1_epi32(1));
1902
1903 // cull points off the ymin-xmin edge of the viewport
1904 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi));
1905 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi));
1906
1907 // compute macro tile coordinates
1908 simd16scalari macroX = _simd16_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1909 simd16scalari macroY = _simd16_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1910
1911 OSALIGNSIMD16(uint32_t) aMacroX[KNOB_SIMD16_WIDTH], aMacroY[KNOB_SIMD16_WIDTH];
1912
1913 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroX), macroX);
1914 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroY), macroY);
1915
1916 // compute raster tile coordinates
1917 simd16scalari rasterX = _simd16_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1918 simd16scalari rasterY = _simd16_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1919
1920 // compute raster tile relative x,y for coverage mask
1921 simd16scalari tileAlignedX = _simd16_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
1922 simd16scalari tileAlignedY = _simd16_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
1923
1924 simd16scalari tileRelativeX = _simd16_sub_epi32(_simd16_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
1925 simd16scalari tileRelativeY = _simd16_sub_epi32(_simd16_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
1926
1927 OSALIGNSIMD16(uint32_t) aTileRelativeX[KNOB_SIMD16_WIDTH];
1928 OSALIGNSIMD16(uint32_t) aTileRelativeY[KNOB_SIMD16_WIDTH];
1929
1930 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeX), tileRelativeX);
1931 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeY), tileRelativeY);
1932
1933 OSALIGNSIMD16(uint32_t) aTileAlignedX[KNOB_SIMD16_WIDTH];
1934 OSALIGNSIMD16(uint32_t) aTileAlignedY[KNOB_SIMD16_WIDTH];
1935
1936 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedX), tileAlignedX);
1937 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedY), tileAlignedY);
1938
1939 OSALIGNSIMD16(float) aZ[KNOB_SIMD16_WIDTH];
1940 _simd16_store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1941
1942 // store render target array index
1943 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1944 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1945 {
1946 simd16vector vRtai;
1947 pa.Assemble_simd16(VERTEX_RTAI_SLOT, &vRtai);
1948 simd16scalari vRtaii = _simd16_castps_si(vRtai.x);
1949 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1950 }
1951 else
1952 {
1953 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1954 }
1955
1956 uint32_t *pPrimID = (uint32_t *)&primID;
1957 DWORD primIndex = 0;
1958
1959 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1960
1961 // scan remaining valid triangles and bin each separately
1962 while (_BitScanForward(&primIndex, primMask))
1963 {
1964 uint32_t linkageCount = backendState.numAttributes;
1965 uint32_t numScalarAttribs = linkageCount * 4;
1966
1967 BE_WORK work;
1968 work.type = DRAW;
1969
1970 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1971
1972 // points are always front facing
1973 desc.triFlags.frontFacing = 1;
1974 desc.triFlags.primID = pPrimID[primIndex];
1975 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1976 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1977
1978 work.pfnWork = RasterizeSimplePoint;
1979
1980 auto pArena = pDC->pArena;
1981 SWR_ASSERT(pArena != nullptr);
1982
1983 // store attributes
1984 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1985 desc.pAttribs = pAttribs;
1986 desc.numAttribs = linkageCount;
1987
1988 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1989
1990 // store raster tile aligned x, y, perspective correct z
1991 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1992 desc.pTriBuffer = pTriBuffer;
1993 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1994 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1995 *pTriBuffer = aZ[primIndex];
1996
1997 uint32_t tX = aTileRelativeX[primIndex];
1998 uint32_t tY = aTileRelativeY[primIndex];
1999
2000 // pack the relative x,y into the coverageMask, the rasterizer will
2001 // generate the true coverage mask from it
2002 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
2003
2004 // bin it
2005 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2006 #if KNOB_ENABLE_TOSS_POINTS
2007 if (!KNOB_TOSS_SETUP_TRIS)
2008 #endif
2009 {
2010 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
2011 }
2012
2013 primMask &= ~(1 << primIndex);
2014 }
2015 }
2016 else
2017 {
2018 // non simple points need to be potentially binned to multiple macro tiles
2019 simd16scalar vPointSize;
2020
2021 if (rastState.pointParam)
2022 {
2023 simd16vector size[3];
2024 pa.Assemble_simd16(VERTEX_POINT_SIZE_SLOT, size);
2025 vPointSize = size[0].x;
2026 }
2027 else
2028 {
2029 vPointSize = _simd16_set1_ps(rastState.pointSize);
2030 }
2031
2032 // bloat point to bbox
2033 simd16BBox bbox;
2034
2035 bbox.xmin = bbox.xmax = vXi;
2036 bbox.ymin = bbox.ymax = vYi;
2037
2038 simd16scalar vHalfWidth = _simd16_mul_ps(vPointSize, _simd16_set1_ps(0.5f));
2039 simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2040
2041 bbox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
2042 bbox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
2043 bbox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
2044 bbox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
2045
2046 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2047 // Gather the AOS effective scissor rects based on the per-prim VP index.
2048 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
2049 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
2050 if (state.gsState.emitsViewportArrayIndex)
2051 {
2052 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2053 scisXmin, scisYmin, scisXmax, scisYmax);
2054 }
2055 else // broadcast fast path for non-VPAI case.
2056 {
2057 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2058 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2059 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2060 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2061 }
2062
2063 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
2064 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
2065 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
2066 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
2067
2068 // Cull bloated points completely outside scissor
2069 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
2070 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
2071 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
2072 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
2073 primMask = primMask & ~maskOutsideScissor;
2074
2075 // Convert bbox to macrotile units.
2076 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2077 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2078 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2079 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2080
2081 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
2082
2083 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
2084 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
2085 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
2086 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
2087
2088 // store render target array index
2089 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
2090 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2091 {
2092 simd16vector vRtai[2];
2093 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
2094 simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
2095 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
2096 }
2097 else
2098 {
2099 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
2100 }
2101
2102 OSALIGNSIMD16(float) aPointSize[KNOB_SIMD16_WIDTH];
2103 _simd16_store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
2104
2105 uint32_t *pPrimID = (uint32_t *)&primID;
2106
2107 OSALIGNSIMD16(float) aPrimVertsX[KNOB_SIMD16_WIDTH];
2108 OSALIGNSIMD16(float) aPrimVertsY[KNOB_SIMD16_WIDTH];
2109 OSALIGNSIMD16(float) aPrimVertsZ[KNOB_SIMD16_WIDTH];
2110
2111 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
2112 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
2113 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
2114
2115 // scan remaining valid prims and bin each separately
2116 const SWR_BACKEND_STATE& backendState = state.backendState;
2117 DWORD primIndex;
2118 while (_BitScanForward(&primIndex, primMask))
2119 {
2120 uint32_t linkageCount = backendState.numAttributes;
2121 uint32_t numScalarAttribs = linkageCount * 4;
2122
2123 BE_WORK work;
2124 work.type = DRAW;
2125
2126 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2127
2128 desc.triFlags.frontFacing = 1;
2129 desc.triFlags.primID = pPrimID[primIndex];
2130 desc.triFlags.pointSize = aPointSize[primIndex];
2131 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2132 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2133
2134 work.pfnWork = RasterizeTriPoint;
2135
2136 auto pArena = pDC->pArena;
2137 SWR_ASSERT(pArena != nullptr);
2138
2139 // store active attribs
2140 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2141 desc.numAttribs = linkageCount;
2142 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2143
2144 // store point vertex data
2145 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2146 desc.pTriBuffer = pTriBuffer;
2147 *pTriBuffer++ = aPrimVertsX[primIndex];
2148 *pTriBuffer++ = aPrimVertsY[primIndex];
2149 *pTriBuffer = aPrimVertsZ[primIndex];
2150
2151 // store user clip distances
2152 if (rastState.clipDistanceMask)
2153 {
2154 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2155 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2156 float dists[8];
2157 float one = 1.0f;
2158 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
2159 for (uint32_t i = 0; i < numClipDist; i++) {
2160 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
2161 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
2162 desc.pUserClipBuffer[3 * i + 2] = dists[i];
2163 }
2164 }
2165
2166 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2167 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2168 {
2169 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2170 {
2171 #if KNOB_ENABLE_TOSS_POINTS
2172 if (!KNOB_TOSS_SETUP_TRIS)
2173 #endif
2174 {
2175 pTileMgr->enqueue(x, y, &work);
2176 }
2177 }
2178 }
2179
2180 primMask &= ~(1 << primIndex);
2181 }
2182 }
2183
2184 AR_END(FEBinPoints, 1);
2185 }
2186
2187 #endif
2188 //////////////////////////////////////////////////////////////////////////
2189 /// @brief Bin SIMD lines to the backend.
2190 /// @param pDC - pointer to draw context.
2191 /// @param pa - The primitive assembly object.
2192 /// @param workerId - thread's worker id. Even thread has a unique id.
2193 /// @param tri - Contains line position data for SIMDs worth of points.
2194 /// @param primID - Primitive ID for each line.
2195 /// @param viewportIdx - Viewport Array Index for each line.
2196 void BinPostSetupLines(
2197 DRAW_CONTEXT *pDC,
2198 PA_STATE& pa,
2199 uint32_t workerId,
2200 simdvector prim[],
2201 simdscalar recipW[],
2202 uint32_t primMask,
2203 simdscalari primID,
2204 simdscalari viewportIdx)
2205 {
2206 SWR_CONTEXT *pContext = pDC->pContext;
2207
2208 AR_BEGIN(FEBinLines, pDC->drawId);
2209
2210 const API_STATE& state = GetApiState(pDC);
2211 const SWR_RASTSTATE& rastState = state.rastState;
2212 const SWR_GS_STATE& gsState = state.gsState;
2213
2214 // Select attribute processor
2215 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2216 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2217
2218 simdscalar& vRecipW0 = recipW[0];
2219 simdscalar& vRecipW1 = recipW[1];
2220
2221 // convert to fixed point
2222 simdscalari vXi[2], vYi[2];
2223 vXi[0] = fpToFixedPointVertical(prim[0].x);
2224 vYi[0] = fpToFixedPointVertical(prim[0].y);
2225 vXi[1] = fpToFixedPointVertical(prim[1].x);
2226 vYi[1] = fpToFixedPointVertical(prim[1].y);
2227
2228 // compute x-major vs y-major mask
2229 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2230 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2231 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2232 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2233
2234 // cull zero-length lines
2235 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2236 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2237
2238 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2239
2240 uint32_t *pPrimID = (uint32_t *)&primID;
2241 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2242
2243 simdscalar vUnused = _simd_setzero_ps();
2244
2245 // Calc bounding box of lines
2246 simdBBox bbox;
2247 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2248 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2249 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2250 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2251
2252 // bloat bbox by line width along minor axis
2253 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2254 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2255 simdBBox bloatBox;
2256 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2257 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2258 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2259 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2260
2261 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2262 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2263 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2264 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2265
2266 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2267 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
2268 if (state.gsState.emitsViewportArrayIndex)
2269 {
2270 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2271 scisXmin, scisYmin, scisXmax, scisYmax);
2272 }
2273 else // broadcast fast path for non-VPAI case.
2274 {
2275 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2276 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2277 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2278 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2279 }
2280
2281 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2282 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2283 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2284 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2285
2286 // Cull prims completely outside scissor
2287 {
2288 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2289 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2290 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2291 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2292 primMask = primMask & ~maskOutsideScissor;
2293 }
2294
2295 if (!primMask)
2296 {
2297 goto endBinLines;
2298 }
2299
2300 // Convert triangle bbox to macrotile units.
2301 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2302 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2303 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2304 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2305
2306 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2307 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2308 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2309 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2310 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2311
2312 // transpose verts needed for backend
2313 /// @todo modify BE to take non-transformed verts
2314 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2315 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2316 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2317 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2318 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2319
2320 // store render target array index
2321 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2322 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2323 {
2324 simdvector vRtai[2];
2325 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2326 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2327 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2328 }
2329 else
2330 {
2331 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2332 }
2333
2334 // scan remaining valid prims and bin each separately
2335 DWORD primIndex;
2336 while (_BitScanForward(&primIndex, primMask))
2337 {
2338 uint32_t linkageCount = state.backendState.numAttributes;
2339 uint32_t numScalarAttribs = linkageCount * 4;
2340
2341 BE_WORK work;
2342 work.type = DRAW;
2343
2344 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2345
2346 desc.triFlags.frontFacing = 1;
2347 desc.triFlags.primID = pPrimID[primIndex];
2348 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2349 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2350 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2351
2352 work.pfnWork = RasterizeLine;
2353
2354 auto pArena = pDC->pArena;
2355 SWR_ASSERT(pArena != nullptr);
2356
2357 // store active attribs
2358 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2359 desc.numAttribs = linkageCount;
2360 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2361
2362 // store line vertex data
2363 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2364 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2365 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2366 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2367 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2368
2369 // store user clip distances
2370 if (rastState.clipDistanceMask)
2371 {
2372 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2373 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2374 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
2375 }
2376
2377 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2378 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2379 {
2380 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2381 {
2382 #if KNOB_ENABLE_TOSS_POINTS
2383 if (!KNOB_TOSS_SETUP_TRIS)
2384 #endif
2385 {
2386 pTileMgr->enqueue(x, y, &work);
2387 }
2388 }
2389 }
2390
2391 primMask &= ~(1 << primIndex);
2392 }
2393
2394 endBinLines:
2395
2396 AR_END(FEBinLines, 1);
2397 }
2398
2399 #if USE_SIMD16_FRONTEND
2400 void BinPostSetupLines_simd16(
2401 DRAW_CONTEXT *pDC,
2402 PA_STATE& pa,
2403 uint32_t workerId,
2404 simd16vector prim[],
2405 simd16scalar recipW[],
2406 uint32_t primMask,
2407 simd16scalari primID,
2408 simd16scalari viewportIdx)
2409 {
2410 SWR_CONTEXT *pContext = pDC->pContext;
2411
2412 AR_BEGIN(FEBinLines, pDC->drawId);
2413
2414 const API_STATE& state = GetApiState(pDC);
2415 const SWR_RASTSTATE& rastState = state.rastState;
2416 const SWR_FRONTEND_STATE& feState = state.frontendState;
2417 const SWR_GS_STATE& gsState = state.gsState;
2418
2419 // Select attribute processor
2420 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2421 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2422
2423 simd16scalar& vRecipW0 = recipW[0];
2424 simd16scalar& vRecipW1 = recipW[1];
2425
2426 // convert to fixed point
2427 simd16scalari vXi[2], vYi[2];
2428
2429 vXi[0] = fpToFixedPointVertical(prim[0].x);
2430 vYi[0] = fpToFixedPointVertical(prim[0].y);
2431 vXi[1] = fpToFixedPointVertical(prim[1].x);
2432 vYi[1] = fpToFixedPointVertical(prim[1].y);
2433
2434 // compute x-major vs y-major mask
2435 simd16scalari xLength = _simd16_abs_epi32(_simd16_sub_epi32(vXi[0], vXi[1]));
2436 simd16scalari yLength = _simd16_abs_epi32(_simd16_sub_epi32(vYi[0], vYi[1]));
2437 simd16scalar vYmajorMask = _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength, xLength));
2438 uint32_t yMajorMask = _simd16_movemask_ps(vYmajorMask);
2439
2440 // cull zero-length lines
2441 simd16scalari vZeroLengthMask = _simd16_cmpeq_epi32(xLength, _simd16_setzero_si());
2442 vZeroLengthMask = _simd16_and_si(vZeroLengthMask, _simd16_cmpeq_epi32(yLength, _simd16_setzero_si()));
2443
2444 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask));
2445
2446 uint32_t *pPrimID = (uint32_t *)&primID;
2447 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2448
2449 // Calc bounding box of lines
2450 simd16BBox bbox;
2451 bbox.xmin = _simd16_min_epi32(vXi[0], vXi[1]);
2452 bbox.xmax = _simd16_max_epi32(vXi[0], vXi[1]);
2453 bbox.ymin = _simd16_min_epi32(vYi[0], vYi[1]);
2454 bbox.ymax = _simd16_max_epi32(vYi[0], vYi[1]);
2455
2456 // bloat bbox by line width along minor axis
2457 simd16scalar vHalfWidth = _simd16_set1_ps(rastState.lineWidth / 2.0f);
2458 simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2459
2460 simd16BBox bloatBox;
2461
2462 bloatBox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
2463 bloatBox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
2464 bloatBox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
2465 bloatBox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
2466
2467 bbox.xmin = _simd16_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2468 bbox.xmax = _simd16_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2469 bbox.ymin = _simd16_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2470 bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2471
2472 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2473 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
2474
2475 if (state.gsState.emitsViewportArrayIndex)
2476 {
2477 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2478 scisXmin, scisYmin, scisXmax, scisYmax);
2479 }
2480 else // broadcast fast path for non-VPAI case.
2481 {
2482 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2483 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2484 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2485 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2486 }
2487
2488 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
2489 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
2490 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
2491 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
2492
2493 // Cull prims completely outside scissor
2494 {
2495 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
2496 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
2497 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
2498 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
2499 primMask = primMask & ~maskOutsideScissor;
2500 }
2501
2502 const simdscalar unused = _simd_setzero_ps();
2503
2504 if (!primMask)
2505 {
2506 goto endBinLines;
2507 }
2508
2509 // Convert triangle bbox to macrotile units.
2510 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2511 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2512 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2513 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2514
2515 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
2516
2517 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
2518 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
2519 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
2520 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
2521
2522 // transpose verts needed for backend
2523 /// @todo modify BE to take non-transformed verts
2524 __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2525 __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2526 __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2527 __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2528
2529 vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
2530 vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
2531 vTranspose3x8(vHorizZ[0], _simd16_extract_ps(prim[0].z, 0), _simd16_extract_ps(prim[1].z, 0), unused);
2532 vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), unused);
2533
2534 vTranspose3x8(vHorizX[1], _simd16_extract_ps(prim[0].x, 1), _simd16_extract_ps(prim[1].x, 1), unused);
2535 vTranspose3x8(vHorizY[1], _simd16_extract_ps(prim[0].y, 1), _simd16_extract_ps(prim[1].y, 1), unused);
2536 vTranspose3x8(vHorizZ[1], _simd16_extract_ps(prim[0].z, 1), _simd16_extract_ps(prim[1].z, 1), unused);
2537 vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), unused);
2538
2539 // store render target array index
2540 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
2541 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2542 {
2543 simd16vector vRtai[2];
2544 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
2545 simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
2546 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
2547 }
2548 else
2549 {
2550 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
2551 }
2552
2553 // scan remaining valid prims and bin each separately
2554 DWORD primIndex;
2555 while (_BitScanForward(&primIndex, primMask))
2556 {
2557 uint32_t linkageCount = state.backendState.numAttributes;
2558 uint32_t numScalarAttribs = linkageCount * 4;
2559
2560 BE_WORK work;
2561 work.type = DRAW;
2562
2563 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2564
2565 desc.triFlags.frontFacing = 1;
2566 desc.triFlags.primID = pPrimID[primIndex];
2567 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2568 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2569 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2570
2571 work.pfnWork = RasterizeLine;
2572
2573 auto pArena = pDC->pArena;
2574 SWR_ASSERT(pArena != nullptr);
2575
2576 // store active attribs
2577 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2578 desc.numAttribs = linkageCount;
2579 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2580
2581 // store line vertex data
2582 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2583
2584 {
2585 const uint32_t i = primIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
2586 const uint32_t j = primIndex & 7; // triIndex % KNOB_SIMD_WIDTH
2587
2588 _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
2589 _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
2590 _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
2591 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
2592 }
2593
2594 // store user clip distances
2595 if (rastState.clipDistanceMask)
2596 {
2597 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2598 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2599 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
2600 }
2601
2602 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2603 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2604 {
2605 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2606 {
2607 #if KNOB_ENABLE_TOSS_POINTS
2608 if (!KNOB_TOSS_SETUP_TRIS)
2609 #endif
2610 {
2611 pTileMgr->enqueue(x, y, &work);
2612 }
2613 }
2614 }
2615
2616 primMask &= ~(1 << primIndex);
2617 }
2618
2619 endBinLines:
2620
2621 AR_END(FEBinLines, 1);
2622 }
2623
2624 #endif
2625 //////////////////////////////////////////////////////////////////////////
2626 /// @brief Bin SIMD lines to the backend.
2627 /// @param pDC - pointer to draw context.
2628 /// @param pa - The primitive assembly object.
2629 /// @param workerId - thread's worker id. Even thread has a unique id.
2630 /// @param tri - Contains line position data for SIMDs worth of points.
2631 /// @param primID - Primitive ID for each line.
2632 /// @param viewportIdx - Viewport Array Index for each line.
2633 void BinLines(
2634 DRAW_CONTEXT *pDC,
2635 PA_STATE& pa,
2636 uint32_t workerId,
2637 simdvector prim[],
2638 uint32_t primMask,
2639 simdscalari primID,
2640 simdscalari viewportIdx)
2641 {
2642 const API_STATE& state = GetApiState(pDC);
2643 const SWR_RASTSTATE& rastState = state.rastState;
2644 const SWR_FRONTEND_STATE& feState = state.frontendState;
2645
2646 simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
2647
2648 if (!feState.vpTransformDisable)
2649 {
2650 // perspective divide
2651 vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2652 vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2653
2654 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]);
2655 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]);
2656
2657 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]);
2658 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]);
2659
2660 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]);
2661 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
2662
2663 // viewport transform to screen coords
2664 if (state.gsState.emitsViewportArrayIndex)
2665 {
2666 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2667 }
2668 else
2669 {
2670 viewportTransform<2>(prim, state.vpMatrices);
2671 }
2672 }
2673
2674 // adjust for pixel center location
2675 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2676 prim[0].x = _simd_add_ps(prim[0].x, offset);
2677 prim[0].y = _simd_add_ps(prim[0].y, offset);
2678
2679 prim[1].x = _simd_add_ps(prim[1].x, offset);
2680 prim[1].y = _simd_add_ps(prim[1].y, offset);
2681
2682 BinPostSetupLines(
2683 pDC,
2684 pa,
2685 workerId,
2686 prim,
2687 vRecipW,
2688 primMask,
2689 primID,
2690 viewportIdx);
2691 }
2692
2693 #if USE_SIMD16_FRONTEND
2694 void SIMDAPI BinLines_simd16(
2695 DRAW_CONTEXT *pDC,
2696 PA_STATE& pa,
2697 uint32_t workerId,
2698 simd16vector prim[3],
2699 uint32_t primMask,
2700 simd16scalari primID,
2701 simd16scalari viewportIdx)
2702 {
2703 SWR_CONTEXT *pContext = pDC->pContext;
2704
2705 const API_STATE& state = GetApiState(pDC);
2706 const SWR_RASTSTATE& rastState = state.rastState;
2707 const SWR_FRONTEND_STATE& feState = state.frontendState;
2708 const SWR_GS_STATE& gsState = state.gsState;
2709
2710 // Select attribute processor
2711 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2712 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2713
2714 simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) };
2715
2716 if (!feState.vpTransformDisable)
2717 {
2718 // perspective divide
2719 vRecipW[0] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[0].w);
2720 vRecipW[1] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[1].w);
2721
2722 prim[0].v[0] = _simd16_mul_ps(prim[0].v[0], vRecipW[0]);
2723 prim[1].v[0] = _simd16_mul_ps(prim[1].v[0], vRecipW[1]);
2724
2725 prim[0].v[1] = _simd16_mul_ps(prim[0].v[1], vRecipW[0]);
2726 prim[1].v[1] = _simd16_mul_ps(prim[1].v[1], vRecipW[1]);
2727
2728 prim[0].v[2] = _simd16_mul_ps(prim[0].v[2], vRecipW[0]);
2729 prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]);
2730
2731 // viewport transform to screen coords
2732 if (state.gsState.emitsViewportArrayIndex)
2733 {
2734 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2735 }
2736 else
2737 {
2738 viewportTransform<2>(prim, state.vpMatrices);
2739 }
2740 }
2741
2742 // adjust for pixel center location
2743 simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
2744
2745 prim[0].x = _simd16_add_ps(prim[0].x, offset);
2746 prim[0].y = _simd16_add_ps(prim[0].y, offset);
2747
2748 prim[1].x = _simd16_add_ps(prim[1].x, offset);
2749 prim[1].y = _simd16_add_ps(prim[1].y, offset);
2750
2751 BinPostSetupLines_simd16(
2752 pDC,
2753 pa,
2754 workerId,
2755 prim,
2756 vRecipW,
2757 primMask,
2758 primID,
2759 viewportIdx);
2760 }
2761
2762 #endif