swr/rast: SIMD16 FE - fix conservative rasterization
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file binner.cpp
24 *
25 * @brief Implementation for the macrotile binner
26 *
27 ******************************************************************************/
28
29 #include "context.h"
30 #include "frontend.h"
31 #include "conservativeRast.h"
32 #include "pa.h"
33 #include "rasterizer.h"
34 #include "rdtsc_core.h"
35 #include "tilemgr.h"
36
37 // Function Prototype
38 void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
39 void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
40
41 #if USE_SIMD16_FRONTEND
42 void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
43 void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
44 #endif
45
46 //////////////////////////////////////////////////////////////////////////
47 /// @brief Offsets added to post-viewport vertex positions based on
48 /// raster state.
49 static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
50 {
51 _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
52 _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
53 };
54
55 #if USE_SIMD16_FRONTEND
56 static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] =
57 {
58 _simd16_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
59 _simd16_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
60 };
61
62 #endif
63 //////////////////////////////////////////////////////////////////////////
64 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
65 /// Point precision from FP32.
66 template <typename PT = FixedPointTraits<Fixed_16_8>>
67 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
68 {
69 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
70 return _simd_cvtps_epi32(vFixed);
71 }
72
73 #if USE_SIMD16_FRONTEND
74 template <typename PT = FixedPointTraits<Fixed_16_8>>
75 INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn)
76 {
77 simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value));
78 return _simd16_cvtps_epi32(vFixed);
79 }
80
81 #endif
82 //////////////////////////////////////////////////////////////////////////
83 /// @brief Helper function to set the X,Y coords of a triangle to the
84 /// requested Fixed Point precision from FP32.
85 /// @param tri: simdvector[3] of FP triangle verts
86 /// @param vXi: fixed point X coords of tri verts
87 /// @param vYi: fixed point Y coords of tri verts
88 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3])
89 {
90 vXi[0] = fpToFixedPointVertical(tri[0].x);
91 vYi[0] = fpToFixedPointVertical(tri[0].y);
92 vXi[1] = fpToFixedPointVertical(tri[1].x);
93 vYi[1] = fpToFixedPointVertical(tri[1].y);
94 vXi[2] = fpToFixedPointVertical(tri[2].x);
95 vYi[2] = fpToFixedPointVertical(tri[2].y);
96 }
97
98 #if USE_SIMD16_FRONTEND
99 INLINE static void FPToFixedPoint(const simd16vector * const tri, simd16scalari(&vXi)[3], simd16scalari(&vYi)[3])
100 {
101 vXi[0] = fpToFixedPointVertical(tri[0].x);
102 vYi[0] = fpToFixedPointVertical(tri[0].y);
103 vXi[1] = fpToFixedPointVertical(tri[1].x);
104 vYi[1] = fpToFixedPointVertical(tri[1].y);
105 vXi[2] = fpToFixedPointVertical(tri[2].x);
106 vYi[2] = fpToFixedPointVertical(tri[2].y);
107 }
108
109 #endif
110 //////////////////////////////////////////////////////////////////////////
111 /// @brief Calculate bounding box for current triangle
112 /// @tparam CT: ConservativeRastFETraits type
113 /// @param vX: fixed point X position for triangle verts
114 /// @param vY: fixed point Y position for triangle verts
115 /// @param bbox: fixed point bbox
116 /// *Note*: expects vX, vY to be in the correct precision for the type
117 /// of rasterization. This avoids unnecessary FP->fixed conversions.
118 template <typename CT>
119 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
120 {
121 simdscalari vMinX = vX[0];
122 vMinX = _simd_min_epi32(vMinX, vX[1]);
123 vMinX = _simd_min_epi32(vMinX, vX[2]);
124
125 simdscalari vMaxX = vX[0];
126 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
127 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
128
129 simdscalari vMinY = vY[0];
130 vMinY = _simd_min_epi32(vMinY, vY[1]);
131 vMinY = _simd_min_epi32(vMinY, vY[2]);
132
133 simdscalari vMaxY = vY[0];
134 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
135 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
136
137 bbox.xmin = vMinX;
138 bbox.xmax = vMaxX;
139 bbox.ymin = vMinY;
140 bbox.ymax = vMaxY;
141 }
142
143 #if USE_SIMD16_FRONTEND
144 template <typename CT>
145 INLINE void calcBoundingBoxIntVertical(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox)
146 {
147 simd16scalari vMinX = vX[0];
148
149 vMinX = _simd16_min_epi32(vMinX, vX[1]);
150 vMinX = _simd16_min_epi32(vMinX, vX[2]);
151
152 simd16scalari vMaxX = vX[0];
153
154 vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
155 vMaxX = _simd16_max_epi32(vMaxX, vX[2]);
156
157 simd16scalari vMinY = vY[0];
158
159 vMinY = _simd16_min_epi32(vMinY, vY[1]);
160 vMinY = _simd16_min_epi32(vMinY, vY[2]);
161
162 simd16scalari vMaxY = vY[0];
163
164 vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
165 vMaxY = _simd16_max_epi32(vMaxY, vY[2]);
166
167 bbox.xmin = vMinX;
168 bbox.xmax = vMaxX;
169 bbox.ymin = vMinY;
170 bbox.ymax = vMaxY;
171 }
172
173 #endif
174 //////////////////////////////////////////////////////////////////////////
175 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
176 /// Offsets BBox for conservative rast
177 template <>
178 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox)
179 {
180 // FE conservative rast traits
181 typedef FEConservativeRastT CT;
182
183 simdscalari vMinX = vX[0];
184 vMinX = _simd_min_epi32(vMinX, vX[1]);
185 vMinX = _simd_min_epi32(vMinX, vX[2]);
186
187 simdscalari vMaxX = vX[0];
188 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
189 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
190
191 simdscalari vMinY = vY[0];
192 vMinY = _simd_min_epi32(vMinY, vY[1]);
193 vMinY = _simd_min_epi32(vMinY, vY[2]);
194
195 simdscalari vMaxY = vY[0];
196 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
197 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
198
199 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
200 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
201 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
202 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
203 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
204 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
205 }
206
207 #if USE_SIMD16_FRONTEND
208 template <>
209 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox)
210 {
211 // FE conservative rast traits
212 typedef FEConservativeRastT CT;
213
214 simd16scalari vMinX = vX[0];
215 vMinX = _simd16_min_epi32(vMinX, vX[1]);
216 vMinX = _simd16_min_epi32(vMinX, vX[2]);
217
218 simd16scalari vMaxX = vX[0];
219 vMaxX = _simd16_max_epi32(vMaxX, vX[1]);
220 vMaxX = _simd16_max_epi32(vMaxX, vX[2]);
221
222 simd16scalari vMinY = vY[0];
223 vMinY = _simd16_min_epi32(vMinY, vY[1]);
224 vMinY = _simd16_min_epi32(vMinY, vY[2]);
225
226 simd16scalari vMaxY = vY[0];
227 vMaxY = _simd16_max_epi32(vMaxY, vY[1]);
228 vMaxY = _simd16_max_epi32(vMaxY, vY[2]);
229
230 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
231 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
232 bbox.xmin = _simd16_sub_epi32(vMinX, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
233 bbox.xmax = _simd16_add_epi32(vMaxX, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
234 bbox.ymin = _simd16_sub_epi32(vMinY, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
235 bbox.ymax = _simd16_add_epi32(vMaxY, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value));
236 }
237
238 #endif
239 //////////////////////////////////////////////////////////////////////////
240 /// @brief Processes attributes for the backend based on linkage mask and
241 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
242 /// @param pDC - Draw context
243 /// @param pa - Primitive Assembly state
244 /// @param linkageMask - Specifies which VS outputs are routed to PS.
245 /// @param pLinkageMap - maps VS attribute slot to PS slot
246 /// @param triIndex - Triangle to process attributes for
247 /// @param pBuffer - Output result
248 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
249 INLINE void ProcessAttributes(
250 DRAW_CONTEXT *pDC,
251 PA_STATE&pa,
252 uint32_t triIndex,
253 uint32_t primId,
254 float *pBuffer)
255 {
256 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
257 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
258 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
259 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
260 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
261 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
262
263 static const float constTable[3][4] = {
264 { 0.0f, 0.0f, 0.0f, 0.0f },
265 { 0.0f, 0.0f, 0.0f, 1.0f },
266 { 1.0f, 1.0f, 1.0f, 1.0f }
267 };
268
269 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
270 {
271 uint32_t inputSlot;
272 if (IsSwizzledT::value)
273 {
274 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
275 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
276
277 }
278 else
279 {
280 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
281 }
282
283 __m128 attrib[3]; // triangle attribs (always 4 wide)
284 float* pAttribStart = pBuffer;
285
286 if (HasConstantInterpT::value || IsDegenerate::value)
287 {
288 if (_bittest(&constantInterpMask, i))
289 {
290 uint32_t vid;
291 uint32_t adjustedTriIndex;
292 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
293 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
294 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
295 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
296 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
297
298 switch (topo) {
299 case TOP_QUAD_LIST:
300 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
301 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
302 break;
303 case TOP_QUAD_STRIP:
304 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
305 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
306 break;
307 case TOP_TRIANGLE_STRIP:
308 adjustedTriIndex = triIndex;
309 vid = (triIndex & 1)
310 ? tristripProvokingVertex[provokingVertex]
311 : provokingVertex;
312 break;
313 default:
314 adjustedTriIndex = triIndex;
315 vid = provokingVertex;
316 break;
317 }
318
319 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
320
321 for (uint32_t i = 0; i < NumVertsT::value; ++i)
322 {
323 _mm_store_ps(pBuffer, attrib[vid]);
324 pBuffer += 4;
325 }
326 }
327 else
328 {
329 pa.AssembleSingle(inputSlot, triIndex, attrib);
330
331 for (uint32_t i = 0; i < NumVertsT::value; ++i)
332 {
333 _mm_store_ps(pBuffer, attrib[i]);
334 pBuffer += 4;
335 }
336 }
337 }
338 else
339 {
340 pa.AssembleSingle(inputSlot, triIndex, attrib);
341
342 for (uint32_t i = 0; i < NumVertsT::value; ++i)
343 {
344 _mm_store_ps(pBuffer, attrib[i]);
345 pBuffer += 4;
346 }
347 }
348
349 // pad out the attrib buffer to 3 verts to ensure the triangle
350 // interpolation code in the pixel shader works correctly for the
351 // 3 topologies - point, line, tri. This effectively zeros out the
352 // effect of the missing vertices in the triangle interpolation.
353 for (uint32_t v = NumVertsT::value; v < 3; ++v)
354 {
355 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
356 pBuffer += 4;
357 }
358
359 // check for constant source overrides
360 if (IsSwizzledT::value)
361 {
362 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
363 if (mask)
364 {
365 DWORD comp;
366 while (_BitScanForward(&comp, mask))
367 {
368 mask &= ~(1 << comp);
369
370 float constantValue = 0.0f;
371 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
372 {
373 case SWR_CONSTANT_SOURCE_CONST_0000:
374 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
375 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
376 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
377 break;
378 case SWR_CONSTANT_SOURCE_PRIM_ID:
379 constantValue = *(float*)&primId;
380 break;
381 }
382
383 // apply constant value to all 3 vertices
384 for (uint32_t v = 0; v < 3; ++v)
385 {
386 pAttribStart[comp + v * 4] = constantValue;
387 }
388 }
389 }
390 }
391 }
392 }
393
394 //////////////////////////////////////////////////////////////////////////
395 /// @brief Gather scissor rect data based on per-prim viewport indices.
396 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
397 /// @param pViewportIndex - array of per-primitive vewport indexes.
398 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
399 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
400 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
401 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
402 //
403 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
404 template<size_t SimdWidth>
405 struct GatherScissors
406 {
407 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
408 simdscalari &scisXmin, simdscalari &scisYmin,
409 simdscalari &scisXmax, simdscalari &scisYmax)
410 {
411 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
412 }
413 };
414
415 template<>
416 struct GatherScissors<8>
417 {
418 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
419 simdscalari &scisXmin, simdscalari &scisYmin,
420 simdscalari &scisXmax, simdscalari &scisYmax)
421 {
422 scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
423 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
424 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
425 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
426 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
427 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
428 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
429 pScissorsInFixedPoint[pViewportIndex[7]].xmin);
430 scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
431 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
432 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
433 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
434 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
435 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
436 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
437 pScissorsInFixedPoint[pViewportIndex[7]].ymin);
438 scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
439 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
440 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
441 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
442 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
443 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
444 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
445 pScissorsInFixedPoint[pViewportIndex[7]].xmax);
446 scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
447 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
448 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
449 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
450 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
451 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
452 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
453 pScissorsInFixedPoint[pViewportIndex[7]].ymax);
454 }
455 };
456
457 #if USE_SIMD16_FRONTEND
458 template<size_t SimdWidth>
459 struct GatherScissors_simd16
460 {
461 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
462 simd16scalari &scisXmin, simd16scalari &scisYmin,
463 simd16scalari &scisXmax, simd16scalari &scisYmax)
464 {
465 SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
466 }
467 };
468
469 template<>
470 struct GatherScissors_simd16<16>
471 {
472 static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
473 simd16scalari &scisXmin, simd16scalari &scisYmin,
474 simd16scalari &scisXmax, simd16scalari &scisYmax)
475 {
476 scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
477 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
478 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
479 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
480 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
481 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
482 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
483 pScissorsInFixedPoint[pViewportIndex[7]].xmin,
484 pScissorsInFixedPoint[pViewportIndex[8]].xmin,
485 pScissorsInFixedPoint[pViewportIndex[9]].xmin,
486 pScissorsInFixedPoint[pViewportIndex[10]].xmin,
487 pScissorsInFixedPoint[pViewportIndex[11]].xmin,
488 pScissorsInFixedPoint[pViewportIndex[12]].xmin,
489 pScissorsInFixedPoint[pViewportIndex[13]].xmin,
490 pScissorsInFixedPoint[pViewportIndex[14]].xmin,
491 pScissorsInFixedPoint[pViewportIndex[15]].xmin);
492
493 scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
494 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
495 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
496 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
497 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
498 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
499 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
500 pScissorsInFixedPoint[pViewportIndex[7]].ymin,
501 pScissorsInFixedPoint[pViewportIndex[8]].ymin,
502 pScissorsInFixedPoint[pViewportIndex[9]].ymin,
503 pScissorsInFixedPoint[pViewportIndex[10]].ymin,
504 pScissorsInFixedPoint[pViewportIndex[11]].ymin,
505 pScissorsInFixedPoint[pViewportIndex[12]].ymin,
506 pScissorsInFixedPoint[pViewportIndex[13]].ymin,
507 pScissorsInFixedPoint[pViewportIndex[14]].ymin,
508 pScissorsInFixedPoint[pViewportIndex[15]].ymin);
509
510 scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
511 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
512 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
513 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
514 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
515 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
516 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
517 pScissorsInFixedPoint[pViewportIndex[7]].xmax,
518 pScissorsInFixedPoint[pViewportIndex[8]].xmax,
519 pScissorsInFixedPoint[pViewportIndex[9]].xmax,
520 pScissorsInFixedPoint[pViewportIndex[10]].xmax,
521 pScissorsInFixedPoint[pViewportIndex[11]].xmax,
522 pScissorsInFixedPoint[pViewportIndex[12]].xmax,
523 pScissorsInFixedPoint[pViewportIndex[13]].xmax,
524 pScissorsInFixedPoint[pViewportIndex[14]].xmax,
525 pScissorsInFixedPoint[pViewportIndex[15]].xmax);
526
527 scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
528 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
529 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
530 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
531 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
532 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
533 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
534 pScissorsInFixedPoint[pViewportIndex[7]].ymax,
535 pScissorsInFixedPoint[pViewportIndex[8]].ymax,
536 pScissorsInFixedPoint[pViewportIndex[9]].ymax,
537 pScissorsInFixedPoint[pViewportIndex[10]].ymax,
538 pScissorsInFixedPoint[pViewportIndex[11]].ymax,
539 pScissorsInFixedPoint[pViewportIndex[12]].ymax,
540 pScissorsInFixedPoint[pViewportIndex[13]].ymax,
541 pScissorsInFixedPoint[pViewportIndex[14]].ymax,
542 pScissorsInFixedPoint[pViewportIndex[15]].ymax);
543 }
544 };
545
546 #endif
547 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
548
549 struct ProcessAttributesChooser
550 {
551 typedef PFN_PROCESS_ATTRIBUTES FuncType;
552
553 template <typename... ArgsB>
554 static FuncType GetFunc()
555 {
556 return ProcessAttributes<ArgsB...>;
557 }
558 };
559
560 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
561 {
562 return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
563 }
564
565 //////////////////////////////////////////////////////////////////////////
566 /// @brief Processes enabled user clip distances. Loads the active clip
567 /// distances from the PA, sets up barycentric equations, and
568 /// stores the results to the output buffer
569 /// @param pa - Primitive Assembly state
570 /// @param primIndex - primitive index to process
571 /// @param clipDistMask - mask of enabled clip distances
572 /// @param pUserClipBuffer - buffer to store results
573 template<uint32_t NumVerts>
574 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float *pRecipW, float* pUserClipBuffer)
575 {
576 DWORD clipDist;
577 while (_BitScanForward(&clipDist, clipDistMask))
578 {
579 clipDistMask &= ~(1 << clipDist);
580 uint32_t clipSlot = clipDist >> 2;
581 uint32_t clipComp = clipDist & 0x3;
582 uint32_t clipAttribSlot = clipSlot == 0 ?
583 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
584
585 __m128 primClipDist[3];
586 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
587
588 float vertClipDist[NumVerts];
589 for (uint32_t e = 0; e < NumVerts; ++e)
590 {
591 OSALIGNSIMD(float) aVertClipDist[4];
592 _mm_store_ps(aVertClipDist, primClipDist[e]);
593 vertClipDist[e] = aVertClipDist[clipComp];
594 };
595
596 // setup plane equations for barycentric interpolation in the backend
597 float baryCoeff[NumVerts];
598 float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
599 for (uint32_t e = 0; e < NumVerts - 1; ++e)
600 {
601 baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
602 }
603 baryCoeff[NumVerts - 1] = last;
604
605 for (uint32_t e = 0; e < NumVerts; ++e)
606 {
607 *(pUserClipBuffer++) = baryCoeff[e];
608 }
609 }
610 }
611
612 //////////////////////////////////////////////////////////////////////////
613 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
614 /// culling, viewport transform, etc.
615 /// @param pDC - pointer to draw context.
616 /// @param pa - The primitive assembly object.
617 /// @param workerId - thread's worker id. Every thread has a unique id.
618 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
619 /// @param primID - Primitive ID for each triangle.
620 /// @param viewportIdx - viewport array index for each triangle.
621 /// @tparam CT - ConservativeRastFETraits
622 template <typename CT>
623 void BinTriangles(
624 DRAW_CONTEXT *pDC,
625 PA_STATE& pa,
626 uint32_t workerId,
627 simdvector tri[3],
628 uint32_t triMask,
629 simdscalari primID,
630 simdscalari viewportIdx)
631 {
632 SWR_CONTEXT *pContext = pDC->pContext;
633
634 AR_BEGIN(FEBinTriangles, pDC->drawId);
635
636 const API_STATE& state = GetApiState(pDC);
637 const SWR_RASTSTATE& rastState = state.rastState;
638 const SWR_FRONTEND_STATE& feState = state.frontendState;
639 const SWR_GS_STATE& gsState = state.gsState;
640 MacroTileMgr *pTileMgr = pDC->pTileMgr;
641
642 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
643 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
644 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
645
646 if (feState.vpTransformDisable)
647 {
648 // RHW is passed in directly when VP transform is disabled
649 vRecipW0 = tri[0].v[3];
650 vRecipW1 = tri[1].v[3];
651 vRecipW2 = tri[2].v[3];
652 }
653 else
654 {
655 // Perspective divide
656 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
657 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
658 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
659
660 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
661 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
662 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
663
664 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
665 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
666 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
667
668 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
669 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
670 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
671
672 // Viewport transform to screen space coords
673 if (state.gsState.emitsViewportArrayIndex)
674 {
675 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
676 }
677 else
678 {
679 viewportTransform<3>(tri, state.vpMatrices);
680 }
681 }
682
683 // Adjust for pixel center location
684 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
685 tri[0].x = _simd_add_ps(tri[0].x, offset);
686 tri[0].y = _simd_add_ps(tri[0].y, offset);
687
688 tri[1].x = _simd_add_ps(tri[1].x, offset);
689 tri[1].y = _simd_add_ps(tri[1].y, offset);
690
691 tri[2].x = _simd_add_ps(tri[2].x, offset);
692 tri[2].y = _simd_add_ps(tri[2].y, offset);
693
694 simdscalari vXi[3], vYi[3];
695 // Set vXi, vYi to required fixed point precision
696 FPToFixedPoint(tri, vXi, vYi);
697
698 // triangle setup
699 simdscalari vAi[3], vBi[3];
700 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
701
702 // determinant
703 simdscalari vDet[2];
704 calcDeterminantIntVertical(vAi, vBi, vDet);
705
706 // cull zero area
707 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
708 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
709
710 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
711
712 uint32_t origTriMask = triMask;
713 // don't cull degenerate triangles if we're conservatively rasterizing
714 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
715 {
716 triMask &= ~cullZeroAreaMask;
717 }
718
719 // determine front winding tris
720 // CW +det
721 // CCW det < 0;
722 // 0 area triangles are marked as backfacing regardless of winding order,
723 // which is required behavior for conservative rast and wireframe rendering
724 uint32_t frontWindingTris;
725 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
726 {
727 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
728 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
729 }
730 else
731 {
732 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
733 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
734 }
735 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
736
737 // cull
738 uint32_t cullTris;
739 switch ((SWR_CULLMODE)rastState.cullMode)
740 {
741 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
742 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
743 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
744 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
745 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
746 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
747 }
748
749 triMask &= ~cullTris;
750
751 if (origTriMask ^ triMask)
752 {
753 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
754 }
755
756 // Simple non-conformant wireframe mode, useful for debugging
757 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
758 {
759 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
760 simdvector line[2];
761 simdscalar recipW[2];
762 line[0] = tri[0];
763 line[1] = tri[1];
764 recipW[0] = vRecipW0;
765 recipW[1] = vRecipW1;
766 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
767
768 line[0] = tri[1];
769 line[1] = tri[2];
770 recipW[0] = vRecipW1;
771 recipW[1] = vRecipW2;
772 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
773
774 line[0] = tri[2];
775 line[1] = tri[0];
776 recipW[0] = vRecipW2;
777 recipW[1] = vRecipW0;
778 BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
779
780 AR_END(FEBinTriangles, 1);
781 return;
782 } else if (rastState.fillMode == SWR_FILLMODE_POINT)
783 {
784 // bin 3 points
785
786 BinPostSetupPoints(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
787 BinPostSetupPoints(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
788 BinPostSetupPoints(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
789 return;
790 }
791
792 /// Note: these variable initializations must stay above any 'goto endBinTriangles'
793 // compute per tri backface
794 uint32_t frontFaceMask = frontWindingTris;
795 uint32_t *pPrimID = (uint32_t *)&primID;
796 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
797 DWORD triIndex = 0;
798 uint32_t edgeEnable;
799 PFN_WORK_FUNC pfnWork;
800 if (CT::IsConservativeT::value)
801 {
802 // determine which edges of the degenerate tri, if any, are valid to rasterize.
803 // used to call the appropriate templated rasterizer function
804 if (cullZeroAreaMask > 0)
805 {
806 // e0 = v1-v0
807 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
808 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
809 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
810
811 // e1 = v2-v1
812 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
813 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
814 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
815
816 // e2 = v0-v2
817 // if v0 == v1 & v1 == v2, v0 == v2
818 uint32_t e2Mask = e0Mask & e1Mask;
819 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
820
821 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
822 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
823 e0Mask = pdep_u32(e0Mask, 0x00249249);
824 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
825 e1Mask = pdep_u32(e1Mask, 0x00492492);
826 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
827 e2Mask = pdep_u32(e2Mask, 0x00924924);
828
829 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
830 }
831 else
832 {
833 edgeEnable = 0x00FFFFFF;
834 }
835 }
836 else
837 {
838 // degenerate triangles won't be sent to rasterizer; just enable all edges
839 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
840 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
841 }
842
843 if (!triMask)
844 {
845 goto endBinTriangles;
846 }
847
848 // Calc bounding box of triangles
849 simdBBox bbox;
850 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
851
852 // determine if triangle falls between pixel centers and discard
853 // only discard for non-MSAA case and when conservative rast is disabled
854 // (xmin + 127) & ~255
855 // (xmax + 128) & ~255
856 if((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
857 (!CT::IsConservativeT::value))
858 {
859 origTriMask = triMask;
860
861 int cullCenterMask;
862 {
863 simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
864 xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
865 simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
866 xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
867
868 simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
869
870 simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
871 ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
872 simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
873 ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
874
875 simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
876 vMaskV = _simd_or_si(vMaskH, vMaskV);
877 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
878 }
879
880 triMask &= ~cullCenterMask;
881
882 if (origTriMask ^ triMask)
883 {
884 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
885 }
886 }
887
888 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
889 // Gather the AOS effective scissor rects based on the per-prim VP index.
890 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
891 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
892 if (state.gsState.emitsViewportArrayIndex)
893 {
894 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
895 scisXmin, scisYmin, scisXmax, scisYmax);
896 }
897 else // broadcast fast path for non-VPAI case.
898 {
899 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
900 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
901 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
902 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
903 }
904
905 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
906 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
907 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
908 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
909
910 if (CT::IsConservativeT::value)
911 {
912 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
913 // some area. Bump the xmax/ymax edges out
914 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
915 bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
916 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
917 bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
918 }
919
920 // Cull tris completely outside scissor
921 {
922 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
923 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
924 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
925 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
926 triMask = triMask & ~maskOutsideScissor;
927 }
928
929 if (!triMask)
930 {
931 goto endBinTriangles;
932 }
933
934 // Convert triangle bbox to macrotile units.
935 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
936 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
937 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
938 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
939
940 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
941 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
942 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
943 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
944 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
945
946 // transpose verts needed for backend
947 /// @todo modify BE to take non-transformed verts
948 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
949 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
950 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
951 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
952 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
953
954 // store render target array index
955 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
956 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
957 {
958 simdvector vRtai[3];
959 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
960 simdscalari vRtaii;
961 vRtaii = _simd_castps_si(vRtai[0].x);
962 _simd_store_si((simdscalari*)aRTAI, vRtaii);
963 }
964 else
965 {
966 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
967 }
968
969 endBinTriangles:
970
971 // scan remaining valid triangles and bin each separately
972 while (_BitScanForward(&triIndex, triMask))
973 {
974 uint32_t linkageCount = state.backendState.numAttributes;
975 uint32_t numScalarAttribs = linkageCount * 4;
976
977 BE_WORK work;
978 work.type = DRAW;
979
980 bool isDegenerate;
981 if (CT::IsConservativeT::value)
982 {
983 // only rasterize valid edges if we have a degenerate primitive
984 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
985 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
986 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
987
988 // Degenerate triangles are required to be constant interpolated
989 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
990 }
991 else
992 {
993 isDegenerate = false;
994 work.pfnWork = pfnWork;
995 }
996
997 // Select attribute processor
998 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
999 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1000
1001 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1002
1003 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1004 desc.triFlags.primID = pPrimID[triIndex];
1005 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1006 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1007
1008 auto pArena = pDC->pArena;
1009 SWR_ASSERT(pArena != nullptr);
1010
1011 // store active attribs
1012 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1013 desc.pAttribs = pAttribs;
1014 desc.numAttribs = linkageCount;
1015 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1016
1017 // store triangle vertex data
1018 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1019
1020 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
1021 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
1022 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
1023 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1024
1025 // store user clip distances
1026 if (rastState.clipDistanceMask)
1027 {
1028 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1029 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1030 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1031 }
1032
1033 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1034 {
1035 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1036 {
1037 #if KNOB_ENABLE_TOSS_POINTS
1038 if (!KNOB_TOSS_SETUP_TRIS)
1039 #endif
1040 {
1041 pTileMgr->enqueue(x, y, &work);
1042 }
1043 }
1044 }
1045 triMask &= ~(1 << triIndex);
1046 }
1047
1048 AR_END(FEBinTriangles, 1);
1049 }
1050
1051 #if USE_SIMD16_FRONTEND
1052 template <typename CT>
1053 void SIMDAPI BinTriangles_simd16(
1054 DRAW_CONTEXT *pDC,
1055 PA_STATE& pa,
1056 uint32_t workerId,
1057 simd16vector tri[3],
1058 uint32_t triMask,
1059 simd16scalari primID,
1060 simd16scalari viewportIdx)
1061 {
1062 SWR_CONTEXT *pContext = pDC->pContext;
1063
1064 AR_BEGIN(FEBinTriangles, pDC->drawId);
1065
1066 const API_STATE& state = GetApiState(pDC);
1067 const SWR_RASTSTATE& rastState = state.rastState;
1068 const SWR_FRONTEND_STATE& feState = state.frontendState;
1069 const SWR_GS_STATE& gsState = state.gsState;
1070
1071 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1072
1073 simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
1074 simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
1075 simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);
1076
1077 if (feState.vpTransformDisable)
1078 {
1079 // RHW is passed in directly when VP transform is disabled
1080 vRecipW0 = tri[0].v[3];
1081 vRecipW1 = tri[1].v[3];
1082 vRecipW2 = tri[2].v[3];
1083 }
1084 else
1085 {
1086 // Perspective divide
1087 vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w);
1088 vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w);
1089 vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w);
1090
1091 tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0);
1092 tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1);
1093 tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2);
1094
1095 tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0);
1096 tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1);
1097 tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2);
1098
1099 tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0);
1100 tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1);
1101 tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);
1102
1103 // Viewport transform to screen space coords
1104 if (state.gsState.emitsViewportArrayIndex)
1105 {
1106 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
1107 }
1108 else
1109 {
1110 viewportTransform<3>(tri, state.vpMatrices);
1111 }
1112 }
1113
1114 // Adjust for pixel center location
1115 const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
1116
1117 tri[0].x = _simd16_add_ps(tri[0].x, offset);
1118 tri[0].y = _simd16_add_ps(tri[0].y, offset);
1119
1120 tri[1].x = _simd16_add_ps(tri[1].x, offset);
1121 tri[1].y = _simd16_add_ps(tri[1].y, offset);
1122
1123 tri[2].x = _simd16_add_ps(tri[2].x, offset);
1124 tri[2].y = _simd16_add_ps(tri[2].y, offset);
1125
1126 simd16scalari vXi[3], vYi[3];
1127
1128 // Set vXi, vYi to required fixed point precision
1129 FPToFixedPoint(tri, vXi, vYi);
1130
1131 // triangle setup
1132 simd16scalari vAi[3], vBi[3];
1133 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1134
1135 // determinant
1136 simd16scalari vDet[2];
1137 calcDeterminantIntVertical(vAi, vBi, vDet);
1138
1139 // cull zero area
1140 uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si())));
1141 uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si())));
1142
1143 uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
1144
1145 // don't cull degenerate triangles if we're conservatively rasterizing
1146 uint32_t origTriMask = triMask;
1147 if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
1148 {
1149 triMask &= ~cullZeroAreaMask;
1150 }
1151
1152 // determine front winding tris
1153 // CW +det
1154 // CCW det < 0;
1155 // 0 area triangles are marked as backfacing regardless of winding order,
1156 // which is required behavior for conservative rast and wireframe rendering
1157 uint32_t frontWindingTris;
1158 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1159 {
1160 maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si())));
1161 maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si())));
1162 }
1163 else
1164 {
1165 maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0])));
1166 maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1])));
1167 }
1168 frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
1169
1170 // cull
1171 uint32_t cullTris;
1172 switch ((SWR_CULLMODE)rastState.cullMode)
1173 {
1174 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1175 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1176 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1177 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1178 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1179 default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1180 }
1181
1182 triMask &= ~cullTris;
1183
1184 if (origTriMask ^ triMask)
1185 {
1186 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1187 }
1188
1189 // Simple non-conformant wireframe mode, useful for debugging
1190 if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
1191 {
1192 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
1193 simd16vector line[2];
1194 simd16scalar recipW[2];
1195 line[0] = tri[0];
1196 line[1] = tri[1];
1197 recipW[0] = vRecipW0;
1198 recipW[1] = vRecipW1;
1199 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1200
1201 line[0] = tri[1];
1202 line[1] = tri[2];
1203 recipW[0] = vRecipW1;
1204 recipW[1] = vRecipW2;
1205 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1206
1207 line[0] = tri[2];
1208 line[1] = tri[0];
1209 recipW[0] = vRecipW2;
1210 recipW[1] = vRecipW0;
1211 BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
1212
1213 AR_END(FEBinTriangles, 1);
1214 return;
1215 }
1216
1217 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1218 // compute per tri backface
1219 uint32_t frontFaceMask = frontWindingTris;
1220 uint32_t *pPrimID = (uint32_t *)&primID;
1221 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1222 DWORD triIndex = 0;
1223
1224 uint32_t edgeEnable;
1225 PFN_WORK_FUNC pfnWork;
1226 if (CT::IsConservativeT::value)
1227 {
1228 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1229 // used to call the appropriate templated rasterizer function
1230 if (cullZeroAreaMask > 0)
1231 {
1232 // e0 = v1-v0
1233 const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]);
1234 const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]);
1235
1236 uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask)));
1237
1238 // e1 = v2-v1
1239 const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]);
1240 const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]);
1241
1242 uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask)));
1243
1244 // e2 = v0-v2
1245 // if v0 == v1 & v1 == v2, v0 == v2
1246 uint32_t e2Mask = e0Mask & e1Mask;
1247 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1248
1249 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1250 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1251 e0Mask = pdep_u32(e0Mask, 0x00249249);
1252
1253 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1254 e1Mask = pdep_u32(e1Mask, 0x00492492);
1255
1256 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1257 e2Mask = pdep_u32(e2Mask, 0x00924924);
1258
1259 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1260 }
1261 else
1262 {
1263 edgeEnable = 0x00FFFFFF;
1264 }
1265 }
1266 else
1267 {
1268 // degenerate triangles won't be sent to rasterizer; just enable all edges
1269 pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1270 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
1271 }
1272
1273 if (!triMask)
1274 {
1275 goto endBinTriangles;
1276 }
1277
1278 // Calc bounding box of triangles
1279 simd16BBox bbox;
1280 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1281
1282 // determine if triangle falls between pixel centers and discard
1283 // only discard for non-MSAA case and when conservative rast is disabled
1284 // (xmin + 127) & ~255
1285 // (xmax + 128) & ~255
1286 if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
1287 (!CT::IsConservativeT::value))
1288 {
1289 origTriMask = triMask;
1290
1291 int cullCenterMask;
1292
1293 {
1294 simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127));
1295 xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255));
1296 simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128));
1297 xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255));
1298
1299 simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax);
1300
1301 simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127));
1302 ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255));
1303 simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128));
1304 ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255));
1305
1306 simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax);
1307
1308 vMaskV = _simd16_or_si(vMaskH, vMaskV);
1309 cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV));
1310 }
1311
1312 triMask &= ~cullCenterMask;
1313
1314 if (origTriMask ^ triMask)
1315 {
1316 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1317 }
1318 }
1319
1320 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1321 // Gather the AOS effective scissor rects based on the per-prim VP index.
1322 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1323 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
1324
1325 if (state.gsState.emitsViewportArrayIndex)
1326 {
1327 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1328 scisXmin, scisYmin, scisXmax, scisYmax);
1329 }
1330 else // broadcast fast path for non-VPAI case.
1331 {
1332 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1333 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1334 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1335 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1336 }
1337
1338 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
1339 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
1340 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
1341 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
1342
1343 if (CT::IsConservativeT::value)
1344 {
1345 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1346 // some area. Bump the xmax/ymax edges out
1347 simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax);
1348 bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom);
1349 simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax);
1350 bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight);
1351 }
1352
1353 // Cull tris completely outside scissor
1354 {
1355 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
1356 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
1357 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
1358 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
1359 triMask = triMask & ~maskOutsideScissor;
1360 }
1361
1362 if (!triMask)
1363 {
1364 goto endBinTriangles;
1365 }
1366
1367 // Convert triangle bbox to macrotile units.
1368 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1369 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1370 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1371 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1372
1373 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
1374
1375 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
1376 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
1377 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
1378 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
1379
1380 // transpose verts needed for backend
1381 /// @todo modify BE to take non-transformed verts
1382 __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1383 __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1384 __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1385 __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
1386
1387 vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
1388 vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
1389 vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0));
1390 vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0));
1391
1392 vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1));
1393 vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1));
1394 vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1));
1395 vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1));
1396
1397 // store render target array index
1398 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1399 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1400 {
1401 simd16vector vRtai[3];
1402 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
1403 simd16scalari vRtaii;
1404 vRtaii = _simd16_castps_si(vRtai[0].x);
1405 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1406 }
1407 else
1408 {
1409 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1410 }
1411
1412 endBinTriangles:
1413
1414
1415 // scan remaining valid triangles and bin each separately
1416 while (_BitScanForward(&triIndex, triMask))
1417 {
1418 uint32_t linkageCount = state.backendState.numAttributes;
1419 uint32_t numScalarAttribs = linkageCount * 4;
1420
1421 BE_WORK work;
1422 work.type = DRAW;
1423
1424 bool isDegenerate;
1425 if (CT::IsConservativeT::value)
1426 {
1427 // only rasterize valid edges if we have a degenerate primitive
1428 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1429 work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1430 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
1431
1432 // Degenerate triangles are required to be constant interpolated
1433 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1434 }
1435 else
1436 {
1437 isDegenerate = false;
1438 work.pfnWork = pfnWork;
1439 }
1440
1441 // Select attribute processor
1442 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
1443 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1444
1445 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1446
1447 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1448 desc.triFlags.primID = pPrimID[triIndex];
1449 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1450 desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1451
1452 auto pArena = pDC->pArena;
1453 SWR_ASSERT(pArena != nullptr);
1454
1455 // store active attribs
1456 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1457 desc.pAttribs = pAttribs;
1458 desc.numAttribs = linkageCount;
1459 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1460
1461 // store triangle vertex data
1462 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1463
1464 {
1465 const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
1466 const uint32_t j = triIndex & 7; // triIndex % KNOB_SIMD_WIDTH
1467
1468 _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
1469 _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
1470 _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
1471 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
1472 }
1473
1474 // store user clip distances
1475 if (rastState.clipDistanceMask)
1476 {
1477 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1478 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1479 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1480 }
1481
1482 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1483 {
1484 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1485 {
1486 #if KNOB_ENABLE_TOSS_POINTS
1487 if (!KNOB_TOSS_SETUP_TRIS)
1488 #endif
1489 {
1490 pTileMgr->enqueue(x, y, &work);
1491 }
1492 }
1493 }
1494
1495 triMask &= ~(1 << triIndex);
1496 }
1497
1498 AR_END(FEBinTriangles, 1);
1499 }
1500
1501 #endif
1502 struct FEBinTrianglesChooser
1503 {
1504 typedef PFN_PROCESS_PRIMS FuncType;
1505
1506 template <typename... ArgsB>
1507 static FuncType GetFunc()
1508 {
1509 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1510 }
1511 };
1512
1513 // Selector for correct templated BinTrinagles function
1514 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1515 {
1516 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1517 }
1518
1519 #if USE_SIMD16_FRONTEND
1520 struct FEBinTrianglesChooser_simd16
1521 {
1522 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1523
1524 template <typename... ArgsB>
1525 static FuncType GetFunc()
1526 {
1527 return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1528 }
1529 };
1530
1531 // Selector for correct templated BinTrinagles function
1532 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1533 {
1534 return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1535 }
1536
1537 #endif
1538
1539 void BinPostSetupPoints(
1540 DRAW_CONTEXT *pDC,
1541 PA_STATE& pa,
1542 uint32_t workerId,
1543 simdvector prim[],
1544 uint32_t primMask,
1545 simdscalari primID,
1546 simdscalari viewportIdx)
1547 {
1548 SWR_CONTEXT *pContext = pDC->pContext;
1549
1550 AR_BEGIN(FEBinPoints, pDC->drawId);
1551
1552 simdvector& primVerts = prim[0];
1553
1554 const API_STATE& state = GetApiState(pDC);
1555 const SWR_GS_STATE& gsState = state.gsState;
1556 const SWR_RASTSTATE& rastState = state.rastState;
1557 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1558
1559 // Select attribute processor
1560 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1561 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1562
1563 // convert to fixed point
1564 simdscalari vXi, vYi;
1565 vXi = fpToFixedPointVertical(primVerts.x);
1566 vYi = fpToFixedPointVertical(primVerts.y);
1567
1568 if (CanUseSimplePoints(pDC))
1569 {
1570 // adjust for ymin-xmin rule
1571 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
1572 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
1573
1574 // cull points off the ymin-xmin edge of the viewport
1575 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
1576 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
1577
1578 // compute macro tile coordinates
1579 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1580 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1581
1582 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
1583 _simd_store_si((simdscalari*)aMacroX, macroX);
1584 _simd_store_si((simdscalari*)aMacroY, macroY);
1585
1586 // compute raster tile coordinates
1587 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1588 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1589
1590 // compute raster tile relative x,y for coverage mask
1591 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
1592 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
1593
1594 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
1595 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
1596
1597 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
1598 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
1599 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
1600 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
1601
1602 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
1603 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
1604 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
1605 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
1606
1607 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
1608 _simd_store_ps((float*)aZ, primVerts.z);
1609
1610 // store render target array index
1611 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1612 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1613 {
1614 simdvector vRtai;
1615 pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
1616 simdscalari vRtaii = _simd_castps_si(vRtai.x);
1617 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1618 }
1619 else
1620 {
1621 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1622 }
1623
1624 uint32_t *pPrimID = (uint32_t *)&primID;
1625 DWORD primIndex = 0;
1626
1627 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1628
1629 // scan remaining valid triangles and bin each separately
1630 while (_BitScanForward(&primIndex, primMask))
1631 {
1632 uint32_t linkageCount = backendState.numAttributes;
1633 uint32_t numScalarAttribs = linkageCount * 4;
1634
1635 BE_WORK work;
1636 work.type = DRAW;
1637
1638 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1639
1640 // points are always front facing
1641 desc.triFlags.frontFacing = 1;
1642 desc.triFlags.primID = pPrimID[primIndex];
1643 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1644 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1645
1646 work.pfnWork = RasterizeSimplePoint;
1647
1648 auto pArena = pDC->pArena;
1649 SWR_ASSERT(pArena != nullptr);
1650
1651 // store attributes
1652 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1653 desc.pAttribs = pAttribs;
1654 desc.numAttribs = linkageCount;
1655
1656 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1657
1658 // store raster tile aligned x, y, perspective correct z
1659 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1660 desc.pTriBuffer = pTriBuffer;
1661 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1662 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1663 *pTriBuffer = aZ[primIndex];
1664
1665 uint32_t tX = aTileRelativeX[primIndex];
1666 uint32_t tY = aTileRelativeY[primIndex];
1667
1668 // pack the relative x,y into the coverageMask, the rasterizer will
1669 // generate the true coverage mask from it
1670 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1671
1672 // bin it
1673 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1674 #if KNOB_ENABLE_TOSS_POINTS
1675 if (!KNOB_TOSS_SETUP_TRIS)
1676 #endif
1677 {
1678 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1679 }
1680 primMask &= ~(1 << primIndex);
1681 }
1682 }
1683 else
1684 {
1685 // non simple points need to be potentially binned to multiple macro tiles
1686 simdscalar vPointSize;
1687 if (rastState.pointParam)
1688 {
1689 simdvector size[3];
1690 pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
1691 vPointSize = size[0].x;
1692 }
1693 else
1694 {
1695 vPointSize = _simd_set1_ps(rastState.pointSize);
1696 }
1697
1698 // bloat point to bbox
1699 simdBBox bbox;
1700 bbox.xmin = bbox.xmax = vXi;
1701 bbox.ymin = bbox.ymax = vYi;
1702
1703 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
1704 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
1705 bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
1706 bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
1707 bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
1708 bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
1709
1710 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1711 // Gather the AOS effective scissor rects based on the per-prim VP index.
1712 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1713 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
1714 if (state.gsState.emitsViewportArrayIndex)
1715 {
1716 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
1717 scisXmin, scisYmin, scisXmax, scisYmax);
1718 }
1719 else // broadcast fast path for non-VPAI case.
1720 {
1721 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
1722 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
1723 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
1724 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
1725 }
1726
1727 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
1728 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
1729 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
1730 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
1731
1732 // Cull bloated points completely outside scissor
1733 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
1734 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
1735 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1736 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1737 primMask = primMask & ~maskOutsideScissor;
1738
1739 // Convert bbox to macrotile units.
1740 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1741 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1742 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1743 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1744
1745 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1746 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
1747 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
1748 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
1749 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
1750
1751 // store render target array index
1752 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1753 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1754 {
1755 simdvector vRtai[2];
1756 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1757 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
1758 _simd_store_si((simdscalari*)aRTAI, vRtaii);
1759 }
1760 else
1761 {
1762 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1763 }
1764
1765 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
1766 _simd_store_ps((float*)aPointSize, vPointSize);
1767
1768 uint32_t *pPrimID = (uint32_t *)&primID;
1769
1770 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
1771 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
1772 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
1773
1774 _simd_store_ps((float*)aPrimVertsX, primVerts.x);
1775 _simd_store_ps((float*)aPrimVertsY, primVerts.y);
1776 _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
1777
1778 // scan remaining valid prims and bin each separately
1779 const SWR_BACKEND_STATE& backendState = state.backendState;
1780 DWORD primIndex;
1781 while (_BitScanForward(&primIndex, primMask))
1782 {
1783 uint32_t linkageCount = backendState.numAttributes;
1784 uint32_t numScalarAttribs = linkageCount * 4;
1785
1786 BE_WORK work;
1787 work.type = DRAW;
1788
1789 TRIANGLE_WORK_DESC &desc = work.desc.tri;
1790
1791 desc.triFlags.frontFacing = 1;
1792 desc.triFlags.primID = pPrimID[primIndex];
1793 desc.triFlags.pointSize = aPointSize[primIndex];
1794 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1795 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1796
1797 work.pfnWork = RasterizeTriPoint;
1798
1799 auto pArena = pDC->pArena;
1800 SWR_ASSERT(pArena != nullptr);
1801
1802 // store active attribs
1803 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1804 desc.numAttribs = linkageCount;
1805 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1806
1807 // store point vertex data
1808 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1809 desc.pTriBuffer = pTriBuffer;
1810 *pTriBuffer++ = aPrimVertsX[primIndex];
1811 *pTriBuffer++ = aPrimVertsY[primIndex];
1812 *pTriBuffer = aPrimVertsZ[primIndex];
1813
1814 // store user clip distances
1815 if (rastState.clipDistanceMask)
1816 {
1817 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1818 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1819 float dists[8];
1820 float one = 1.0f;
1821 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
1822 for (uint32_t i = 0; i < numClipDist; i++) {
1823 desc.pUserClipBuffer[3*i + 0] = 0.0f;
1824 desc.pUserClipBuffer[3*i + 1] = 0.0f;
1825 desc.pUserClipBuffer[3*i + 2] = dists[i];
1826 }
1827 }
1828
1829 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1830 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1831 {
1832 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1833 {
1834 #if KNOB_ENABLE_TOSS_POINTS
1835 if (!KNOB_TOSS_SETUP_TRIS)
1836 #endif
1837 {
1838 pTileMgr->enqueue(x, y, &work);
1839 }
1840 }
1841 }
1842
1843 primMask &= ~(1 << primIndex);
1844 }
1845 }
1846
1847 AR_END(FEBinPoints, 1);
1848 }
1849
1850 //////////////////////////////////////////////////////////////////////////
1851 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1852 /// @param pDC - pointer to draw context.
1853 /// @param pa - The primitive assembly object.
1854 /// @param workerId - thread's worker id. Even thread has a unique id.
1855 /// @param tri - Contains point position data for SIMDs worth of points.
1856 /// @param primID - Primitive ID for each point.
1857 void BinPoints(
1858 DRAW_CONTEXT *pDC,
1859 PA_STATE& pa,
1860 uint32_t workerId,
1861 simdvector prim[3],
1862 uint32_t primMask,
1863 simdscalari primID,
1864 simdscalari viewportIdx)
1865 {
1866 simdvector& primVerts = prim[0];
1867
1868 const API_STATE& state = GetApiState(pDC);
1869 const SWR_FRONTEND_STATE& feState = state.frontendState;
1870 const SWR_RASTSTATE& rastState = state.rastState;
1871
1872 if (!feState.vpTransformDisable)
1873 {
1874 // perspective divide
1875 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
1876 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
1877 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
1878 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
1879
1880 // viewport transform to screen coords
1881 if (state.gsState.emitsViewportArrayIndex)
1882 {
1883 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
1884 }
1885 else
1886 {
1887 viewportTransform<1>(&primVerts, state.vpMatrices);
1888 }
1889 }
1890
1891 // adjust for pixel center location
1892 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1893 primVerts.x = _simd_add_ps(primVerts.x, offset);
1894 primVerts.y = _simd_add_ps(primVerts.y, offset);
1895
1896 BinPostSetupPoints(
1897 pDC,
1898 pa,
1899 workerId,
1900 prim,
1901 primMask,
1902 primID,
1903 viewportIdx);
1904 }
1905
1906 #if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 point binner, run after the caller has prepared the point
///        positions. Converts x/y to fixed point and then takes one of two
///        paths:
///        - simple path (CanUseSimplePoints(pDC) is true): one
///          RasterizeSimplePoint work item per point, enqueued on the single
///          macrotile that contains the point;
///        - general path: each point is grown into a bounding box of half
///          the point size on every side, clamped to the scissor rect, and
///          a RasterizeTriPoint work item is enqueued on every macrotile the
///          box overlaps.
/// @param pDC - pointer to draw context (supplies state, arena, tile manager).
/// @param pa - the primitive assembly object; used to assemble the render
///        target array index / point size slots and passed on to the
///        attribute and user clip distance processors.
/// @param workerId - thread's worker id. Not named in the body;
///        NOTE(review): presumably consumed by the AR_BEGIN/AR_END macros - confirm.
/// @param prim - prim[0] holds the point positions; x, y and z are read.
/// @param primMask - one bit per SIMD lane; a set bit marks a point to bin.
/// @param primID - primitive ID for each point.
/// @param viewportIdx - viewport array index for each point.
1907 void BinPostSetupPoints_simd16(
1908 DRAW_CONTEXT *pDC,
1909 PA_STATE& pa,
1910 uint32_t workerId,
1911 simd16vector prim[],
1912 uint32_t primMask,
1913 simd16scalari primID,
1914 simd16scalari viewportIdx)
1915 {
// NOTE(review): pContext is not named again below; presumably the AR_*
// macros expand to code that uses it - confirm before removing or renaming.
1916 SWR_CONTEXT *pContext = pDC->pContext;
1917
1918 AR_BEGIN(FEBinPoints, pDC->drawId);
1919
1920 simd16vector& primVerts = prim[0];
1921
1922 const API_STATE& state = GetApiState(pDC);
1923 const SWR_GS_STATE& gsState = state.gsState;
1924 const SWR_RASTSTATE& rastState = state.rastState;
// view the per-lane viewport indices as a plain array for scalar indexing
1925 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1926
1927 // Select attribute processor (1 vertex per primitive for points)
1928 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1929 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1930
1931 // convert to fixed point
1932 simd16scalari vXi, vYi;
1933
1934 vXi = fpToFixedPointVertical(primVerts.x);
1935 vYi = fpToFixedPointVertical(primVerts.y);
1936
1937 if (CanUseSimplePoints(pDC))
1938 {
1939 // adjust for ymin-xmin rule: pull x and y back by one fixed-point unit
1940 vXi = _simd16_sub_epi32(vXi, _simd16_set1_epi32(1));
1941 vYi = _simd16_sub_epi32(vYi, _simd16_set1_epi32(1));
1942
1943 // cull points off the ymin-xmin edge of the viewport
// (presumably the movemask collects the sign bit of each lane, so lanes
// whose adjusted x or y is negative are dropped from primMask)
1944 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi));
1945 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi));
1946
1947 // compute macro tile coordinates
1948 simd16scalari macroX = _simd16_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1949 simd16scalari macroY = _simd16_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1950
// spill SIMD results to aligned arrays so the scalar loop below can index
// them per primitive
1951 OSALIGNSIMD16(uint32_t) aMacroX[KNOB_SIMD16_WIDTH], aMacroY[KNOB_SIMD16_WIDTH];
1952
1953 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroX), macroX);
1954 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroY), macroY);
1955
1956 // compute raster tile coordinates
1957 simd16scalari rasterX = _simd16_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1958 simd16scalari rasterY = _simd16_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1959
1960 // compute raster tile relative x,y for coverage mask
// tileAligned = pixel coordinate of the raster tile origin,
// tileRelative = integer pixel coordinate minus that origin
1961 simd16scalari tileAlignedX = _simd16_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
1962 simd16scalari tileAlignedY = _simd16_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
1963
1964 simd16scalari tileRelativeX = _simd16_sub_epi32(_simd16_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
1965 simd16scalari tileRelativeY = _simd16_sub_epi32(_simd16_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
1966
1967 OSALIGNSIMD16(uint32_t) aTileRelativeX[KNOB_SIMD16_WIDTH];
1968 OSALIGNSIMD16(uint32_t) aTileRelativeY[KNOB_SIMD16_WIDTH];
1969
1970 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeX), tileRelativeX);
1971 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeY), tileRelativeY);
1972
1973 OSALIGNSIMD16(uint32_t) aTileAlignedX[KNOB_SIMD16_WIDTH];
1974 OSALIGNSIMD16(uint32_t) aTileAlignedY[KNOB_SIMD16_WIDTH];
1975
1976 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedX), tileAlignedX);
1977 _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedY), tileAlignedY);
1978
1979 OSALIGNSIMD16(float) aZ[KNOB_SIMD16_WIDTH];
1980 _simd16_store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1981
1982 // store render target array index (all zero unless the GS emits one)
1983 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
1984 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1985 {
1986 simd16vector vRtai;
1987 pa.Assemble_simd16(VERTEX_RTAI_SLOT, &vRtai);
1988 simd16scalari vRtaii = _simd16_castps_si(vRtai.x);
1989 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
1990 }
1991 else
1992 {
1993 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
1994 }
1995
1996 uint32_t *pPrimID = (uint32_t *)&primID;
1997 DWORD primIndex = 0;
1998
1999 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
2000
2001 // scan remaining valid points and bin each separately
2002 while (_BitScanForward(&primIndex, primMask))
2003 {
2004 uint32_t linkageCount = backendState.numAttributes;
2005 uint32_t numScalarAttribs = linkageCount * 4;
2006
2007 BE_WORK work;
2008 work.type = DRAW;
2009
2010 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2011
2012 // points are always front facing
2013 desc.triFlags.frontFacing = 1;
2014 desc.triFlags.primID = pPrimID[primIndex];
2015 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2016 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2017
2018 work.pfnWork = RasterizeSimplePoint;
2019
2020 auto pArena = pDC->pArena;
2021 SWR_ASSERT(pArena != nullptr);
2022
2023 // store attributes
2024 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
2025 desc.pAttribs = pAttribs;
2026 desc.numAttribs = linkageCount;
2027
2028 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
2029
2030 // store raster tile aligned x, y, perspective correct z
// (x and y are written as raw uint32 bit patterns into the float buffer)
2031 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2032 desc.pTriBuffer = pTriBuffer;
2033 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
2034 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
2035 *pTriBuffer = aZ[primIndex];
2036
2037 uint32_t tX = aTileRelativeX[primIndex];
2038 uint32_t tY = aTileRelativeY[primIndex];
2039
2040 // pack the relative x,y into the coverageMask, the rasterizer will
2041 // generate the true coverage mask from it (x in the low bits, y shifted up by 4)
2042 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
2043
2044 // bin it
2045 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2046 #if KNOB_ENABLE_TOSS_POINTS
2047 if (!KNOB_TOSS_SETUP_TRIS)
2048 #endif
2049 {
2050 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
2051 }
2052
// clear this lane's bit so the next scan finds the following point
2053 primMask &= ~(1 << primIndex);
2054 }
2055 }
2056 else
2057 {
2058 // non simple points need to be potentially binned to multiple macro tiles
2059 simd16scalar vPointSize;
2060
// per-point size comes from the point size vertex slot when pointParam is
// set, otherwise the single size in the raster state is broadcast
2061 if (rastState.pointParam)
2062 {
2063 simd16vector size[3];
2064 pa.Assemble_simd16(VERTEX_POINT_SIZE_SLOT, size);
2065 vPointSize = size[0].x;
2066 }
2067 else
2068 {
2069 vPointSize = _simd16_set1_ps(rastState.pointSize);
2070 }
2071
2072 // bloat point to bbox: start from the point itself, then grow by half
// the point size in every direction
2073 simd16BBox bbox;
2074
2075 bbox.xmin = bbox.xmax = vXi;
2076 bbox.ymin = bbox.ymax = vYi;
2077
2078 simd16scalar vHalfWidth = _simd16_mul_ps(vPointSize, _simd16_set1_ps(0.5f));
2079 simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2080
2081 bbox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
2082 bbox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
2083 bbox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
2084 bbox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
2085
2086 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2087 // Gather the AOS effective scissor rects based on the per-prim VP index.
2088 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
2089 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
2090 if (state.gsState.emitsViewportArrayIndex)
2091 {
2092 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2093 scisXmin, scisYmin, scisXmax, scisYmax);
2094 }
2095 else // broadcast fast path for non-VPAI case.
2096 {
2097 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2098 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2099 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2100 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2101 }
2102
2103 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
2104 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
2105 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
2106 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
2107
2108 // Cull bloated points completely outside scissor
// (after clamping, min > max on either axis means an empty intersection)
2109 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
2110 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
2111 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
2112 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
2113 primMask = primMask & ~maskOutsideScissor;
2114
2115 // Convert bbox to macrotile units.
2116 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2117 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2118 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2119 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2120
// inclusive macrotile range covered by each point's bbox
2121 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
2122
2123 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
2124 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
2125 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
2126 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
2127
2128 // store render target array index (all zero unless the GS emits one)
2129 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
2130 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2131 {
2132 simd16vector vRtai[2];
2133 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
2134 simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
2135 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
2136 }
2137 else
2138 {
2139 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
2140 }
2141
2142 OSALIGNSIMD16(float) aPointSize[KNOB_SIMD16_WIDTH];
2143 _simd16_store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
2144
2145 uint32_t *pPrimID = (uint32_t *)&primID;
2146
2147 OSALIGNSIMD16(float) aPrimVertsX[KNOB_SIMD16_WIDTH];
2148 OSALIGNSIMD16(float) aPrimVertsY[KNOB_SIMD16_WIDTH];
2149 OSALIGNSIMD16(float) aPrimVertsZ[KNOB_SIMD16_WIDTH];
2150
2151 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
2152 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
2153 _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
2154
2155 // scan remaining valid prims and bin each separately
2156 const SWR_BACKEND_STATE& backendState = state.backendState;
2157 DWORD primIndex;
2158 while (_BitScanForward(&primIndex, primMask))
2159 {
2160 uint32_t linkageCount = backendState.numAttributes;
2161 uint32_t numScalarAttribs = linkageCount * 4;
2162
2163 BE_WORK work;
2164 work.type = DRAW;
2165
2166 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2167
2168 desc.triFlags.frontFacing = 1;
2169 desc.triFlags.primID = pPrimID[primIndex];
2170 desc.triFlags.pointSize = aPointSize[primIndex];
2171 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2172 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2173
2174 work.pfnWork = RasterizeTriPoint;
2175
2176 auto pArena = pDC->pArena;
2177 SWR_ASSERT(pArena != nullptr);
2178
2179 // store active attribs
2180 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2181 desc.numAttribs = linkageCount;
2182 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2183
2184 // store point vertex data (floating point x, y, z of the point center)
2185 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2186 desc.pTriBuffer = pTriBuffer;
2187 *pTriBuffer++ = aPrimVertsX[primIndex];
2188 *pTriBuffer++ = aPrimVertsY[primIndex];
2189 *pTriBuffer = aPrimVertsZ[primIndex];
2190
2191 // store user clip distances
2192 if (rastState.clipDistanceMask)
2193 {
// one 3-float group per enabled clip distance, written as (0, 0, dist)
2194 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2195 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2196 float dists[8];
2197 float one = 1.0f;
2198 ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
2199 for (uint32_t i = 0; i < numClipDist; i++) {
2200 desc.pUserClipBuffer[3 * i + 0] = 0.0f;
2201 desc.pUserClipBuffer[3 * i + 1] = 0.0f;
2202 desc.pUserClipBuffer[3 * i + 2] = dists[i];
2203 }
2204 }
2205
// enqueue the same work item on every macrotile in the inclusive bbox range
2206 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2207 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2208 {
2209 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2210 {
2211 #if KNOB_ENABLE_TOSS_POINTS
2212 if (!KNOB_TOSS_SETUP_TRIS)
2213 #endif
2214 {
2215 pTileMgr->enqueue(x, y, &work);
2216 }
2217 }
2218 }
2219
// clear this lane's bit so the next scan finds the following point
2220 primMask &= ~(1 << primIndex);
2221 }
2222 }
2223
2224 AR_END(FEBinPoints, 1);
2225 }
2226
2227 void SIMDAPI BinPoints_simd16(
2228 DRAW_CONTEXT *pDC,
2229 PA_STATE& pa,
2230 uint32_t workerId,
2231 simd16vector prim[3],
2232 uint32_t primMask,
2233 simd16scalari primID,
2234 simd16scalari viewportIdx)
2235 {
2236 simd16vector& primVerts = prim[0];
2237
2238 const API_STATE& state = GetApiState(pDC);
2239 const SWR_FRONTEND_STATE& feState = state.frontendState;
2240 const SWR_RASTSTATE& rastState = state.rastState;
2241
2242 if (!feState.vpTransformDisable)
2243 {
2244 // perspective divide
2245 simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w);
2246
2247 primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0);
2248 primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0);
2249 primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);
2250
2251 // viewport transform to screen coords
2252 if (state.gsState.emitsViewportArrayIndex)
2253 {
2254 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
2255 }
2256 else
2257 {
2258 viewportTransform<1>(&primVerts, state.vpMatrices);
2259 }
2260 }
2261
2262 const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
2263
2264 primVerts.x = _simd16_add_ps(primVerts.x, offset);
2265 primVerts.y = _simd16_add_ps(primVerts.y, offset);
2266
2267 BinPostSetupPoints_simd16(
2268 pDC,
2269 pa,
2270 workerId,
2271 prim,
2272 primMask,
2273 primID,
2274 viewportIdx);
2275 }
2276
2277 #endif
2278 //////////////////////////////////////////////////////////////////////////
2279 /// @brief Bin SIMD lines to the backend. Converts both endpoints to fixed
///        point, culls zero-length lines, builds a bounding box bloated by
///        half the line width along the minor axis, clamps it to the
///        scissor rect, and enqueues a RasterizeLine work item on every
///        macrotile the box overlaps.
2280 /// @param pDC - pointer to draw context.
2281 /// @param pa - The primitive assembly object.
2282 /// @param workerId - thread's worker id. Each thread has a unique id.
///        Not named in the body; NOTE(review): presumably consumed by the
///        AR_BEGIN/AR_END macros - confirm.
2283 /// @param prim - Contains line position data for a SIMD's worth of lines
///        (prim[0] and prim[1] are the two endpoints).
/// @param recipW - recipW[0] / recipW[1] are stored as the fourth row of
///        each line's vertex buffer, one entry per endpoint.
/// @param primMask - one bit per SIMD lane; a set bit marks a line to bin.
2284 /// @param primID - Primitive ID for each line.
2285 /// @param viewportIdx - Viewport Array Index for each line.
2286 void BinPostSetupLines(
2287 DRAW_CONTEXT *pDC,
2288 PA_STATE& pa,
2289 uint32_t workerId,
2290 simdvector prim[],
2291 simdscalar recipW[],
2292 uint32_t primMask,
2293 simdscalari primID,
2294 simdscalari viewportIdx)
2295 {
// NOTE(review): pContext is not named again below; presumably the AR_*
// macros expand to code that uses it - confirm before removing or renaming.
2296 SWR_CONTEXT *pContext = pDC->pContext;
2297
2298 AR_BEGIN(FEBinLines, pDC->drawId);
2299
2300 const API_STATE& state = GetApiState(pDC);
2301 const SWR_RASTSTATE& rastState = state.rastState;
2302 const SWR_GS_STATE& gsState = state.gsState;
2303
2304 // Select attribute processor (2 vertices per primitive for lines)
2305 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2306 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2307
2308 simdscalar& vRecipW0 = recipW[0];
2309 simdscalar& vRecipW1 = recipW[1];
2310
2311 // convert to fixed point
2312 simdscalari vXi[2], vYi[2];
2313 vXi[0] = fpToFixedPointVertical(prim[0].x);
2314 vYi[0] = fpToFixedPointVertical(prim[0].y);
2315 vXi[1] = fpToFixedPointVertical(prim[1].x);
2316 vYi[1] = fpToFixedPointVertical(prim[1].y);
2317
2318 // compute x-major vs y-major mask (y-major when |dy| > |dx|; ties are x-major)
2319 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2320 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2321 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2322 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2323
2324 // cull zero-length lines (both fixed-point extents are zero)
2325 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2326 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2327
2328 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2329
// view the per-lane IDs / viewport indices as plain arrays for scalar indexing
2330 uint32_t *pPrimID = (uint32_t *)&primID;
2331 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2332
// zero filler passed as the third input of the 3-wide transposes below
2333 simdscalar vUnused = _simd_setzero_ps();
2334
2335 // Calc bounding box of lines
2336 simdBBox bbox;
2337 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2338 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2339 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2340 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2341
2342 // bloat bbox by line width along minor axis
2343 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2344 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2345 simdBBox bloatBox;
2346 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2347 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2348 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2349 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2350
// y-major lanes take the bloated x extent and keep the tight y extent;
// x-major lanes do the opposite (note the swapped blend operands for y)
2351 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2352 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2353 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2354 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2355
2356 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2357 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
2358 if (state.gsState.emitsViewportArrayIndex)
2359 {
2360 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2361 scisXmin, scisYmin, scisXmax, scisYmax);
2362 }
2363 else // broadcast fast path for non-VPAI case.
2364 {
2365 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2366 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2367 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2368 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2369 }
2370
2371 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2372 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2373 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2374 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2375
2376 // Cull prims completely outside scissor
// (after clamping, min > max on either axis means an empty intersection)
2377 {
2378 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2379 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2380 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2381 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2382 primMask = primMask & ~maskOutsideScissor;
2383 }
2384
// Early out when nothing survived culling. The locals declared after this
// jump carry no initializers, which is what lets the goto cross them in
// C++; keep that in mind when adding or changing declarations below.
2385 if (!primMask)
2386 {
2387 goto endBinLines;
2388 }
2389
2390 // Convert line bbox to macrotile units.
2391 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2392 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2393 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2394 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2395
// inclusive macrotile range covered by each line's bbox, spilled to arrays
// so the scalar loop below can index them per primitive
2396 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2397 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2398 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2399 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2400 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2401
2402 // transpose verts needed for backend
// (presumably yields one __m128 per primitive holding that primitive's
// per-vertex values - confirm against vTranspose3x8)
2403 /// @todo modify BE to take non-transformed verts
2404 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2405 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2406 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2407 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2408 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2409
2410 // store render target array index (all zero unless the GS emits one)
2411 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2412 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2413 {
2414 simdvector vRtai[2];
2415 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2416 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2417 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2418 }
2419 else
2420 {
2421 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2422 }
2423
2424 // scan remaining valid prims and bin each separately
2425 DWORD primIndex;
2426 while (_BitScanForward(&primIndex, primMask))
2427 {
2428 uint32_t linkageCount = state.backendState.numAttributes;
2429 uint32_t numScalarAttribs = linkageCount * 4;
2430
2431 BE_WORK work;
2432 work.type = DRAW;
2433
2434 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2435
2436 desc.triFlags.frontFacing = 1;
2437 desc.triFlags.primID = pPrimID[primIndex];
2438 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2439 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2440 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2441
2442 work.pfnWork = RasterizeLine;
2443
2444 auto pArena = pDC->pArena;
2445 SWR_ASSERT(pArena != nullptr);
2446
2447 // store active attribs
2448 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2449 desc.numAttribs = linkageCount;
2450 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2451
2452 // store line vertex data: four rows of four floats (X, Y, Z, recipW)
2453 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2454 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2455 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2456 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2457 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2458
2459 // store user clip distances (two floats per enabled clip distance;
// the recipW row of the vertex buffer is passed to the processor)
2460 if (rastState.clipDistanceMask)
2461 {
2462 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2463 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2464 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
2465 }
2466
// enqueue the same work item on every macrotile in the inclusive bbox range
2467 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2468 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2469 {
2470 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2471 {
2472 #if KNOB_ENABLE_TOSS_POINTS
2473 if (!KNOB_TOSS_SETUP_TRIS)
2474 #endif
2475 {
2476 pTileMgr->enqueue(x, y, &work);
2477 }
2478 }
2479 }
2480
// clear this lane's bit so the next scan finds the following line
2481 primMask &= ~(1 << primIndex);
2482 }
2483
2484 endBinLines:
2485
2486 AR_END(FEBinLines, 1);
2487 }
2488
2489 #if USE_SIMD16_FRONTEND
2490 void BinPostSetupLines_simd16(
2491 DRAW_CONTEXT *pDC,
2492 PA_STATE& pa,
2493 uint32_t workerId,
2494 simd16vector prim[],
2495 simd16scalar recipW[],
2496 uint32_t primMask,
2497 simd16scalari primID,
2498 simd16scalari viewportIdx)
2499 {
2500 SWR_CONTEXT *pContext = pDC->pContext;
2501
2502 AR_BEGIN(FEBinLines, pDC->drawId);
2503
2504 const API_STATE& state = GetApiState(pDC);
2505 const SWR_RASTSTATE& rastState = state.rastState;
2506 const SWR_GS_STATE& gsState = state.gsState;
2507
2508 // Select attribute processor
2509 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2510 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2511
2512 simd16scalar& vRecipW0 = recipW[0];
2513 simd16scalar& vRecipW1 = recipW[1];
2514
2515 // convert to fixed point
2516 simd16scalari vXi[2], vYi[2];
2517
2518 vXi[0] = fpToFixedPointVertical(prim[0].x);
2519 vYi[0] = fpToFixedPointVertical(prim[0].y);
2520 vXi[1] = fpToFixedPointVertical(prim[1].x);
2521 vYi[1] = fpToFixedPointVertical(prim[1].y);
2522
2523 // compute x-major vs y-major mask
2524 simd16scalari xLength = _simd16_abs_epi32(_simd16_sub_epi32(vXi[0], vXi[1]));
2525 simd16scalari yLength = _simd16_abs_epi32(_simd16_sub_epi32(vYi[0], vYi[1]));
2526 simd16scalar vYmajorMask = _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength, xLength));
2527 uint32_t yMajorMask = _simd16_movemask_ps(vYmajorMask);
2528
2529 // cull zero-length lines
2530 simd16scalari vZeroLengthMask = _simd16_cmpeq_epi32(xLength, _simd16_setzero_si());
2531 vZeroLengthMask = _simd16_and_si(vZeroLengthMask, _simd16_cmpeq_epi32(yLength, _simd16_setzero_si()));
2532
2533 primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask));
2534
2535 uint32_t *pPrimID = (uint32_t *)&primID;
2536 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2537
2538 // Calc bounding box of lines
2539 simd16BBox bbox;
2540 bbox.xmin = _simd16_min_epi32(vXi[0], vXi[1]);
2541 bbox.xmax = _simd16_max_epi32(vXi[0], vXi[1]);
2542 bbox.ymin = _simd16_min_epi32(vYi[0], vYi[1]);
2543 bbox.ymax = _simd16_max_epi32(vYi[0], vYi[1]);
2544
2545 // bloat bbox by line width along minor axis
2546 simd16scalar vHalfWidth = _simd16_set1_ps(rastState.lineWidth / 2.0f);
2547 simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2548
2549 simd16BBox bloatBox;
2550
2551 bloatBox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
2552 bloatBox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
2553 bloatBox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
2554 bloatBox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
2555
2556 bbox.xmin = _simd16_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2557 bbox.xmax = _simd16_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2558 bbox.ymin = _simd16_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2559 bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2560
2561 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2562 simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
2563
2564 if (state.gsState.emitsViewportArrayIndex)
2565 {
2566 GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2567 scisXmin, scisYmin, scisXmax, scisYmax);
2568 }
2569 else // broadcast fast path for non-VPAI case.
2570 {
2571 scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2572 scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2573 scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2574 scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2575 }
2576
2577 bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
2578 bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
2579 bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
2580 bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
2581
2582 // Cull prims completely outside scissor
2583 {
2584 simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
2585 simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
2586 simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
2587 uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
2588 primMask = primMask & ~maskOutsideScissor;
2589 }
2590
2591 const simdscalar unused = _simd_setzero_ps();
2592
2593 if (!primMask)
2594 {
2595 goto endBinLines;
2596 }
2597
2598 // Convert triangle bbox to macrotile units.
2599 bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2600 bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2601 bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2602 bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2603
2604 OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
2605
2606 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
2607 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
2608 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
2609 _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
2610
2611 // transpose verts needed for backend
2612 /// @todo modify BE to take non-transformed verts
2613 __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2614 __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2615 __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2616 __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
2617
2618 vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
2619 vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
2620 vTranspose3x8(vHorizZ[0], _simd16_extract_ps(prim[0].z, 0), _simd16_extract_ps(prim[1].z, 0), unused);
2621 vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), unused);
2622
2623 vTranspose3x8(vHorizX[1], _simd16_extract_ps(prim[0].x, 1), _simd16_extract_ps(prim[1].x, 1), unused);
2624 vTranspose3x8(vHorizY[1], _simd16_extract_ps(prim[0].y, 1), _simd16_extract_ps(prim[1].y, 1), unused);
2625 vTranspose3x8(vHorizZ[1], _simd16_extract_ps(prim[0].z, 1), _simd16_extract_ps(prim[1].z, 1), unused);
2626 vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), unused);
2627
2628 // store render target array index
2629 OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
2630 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2631 {
2632 simd16vector vRtai[2];
2633 pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai);
2634 simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x);
2635 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
2636 }
2637 else
2638 {
2639 _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
2640 }
2641
2642 // scan remaining valid prims and bin each separately
2643 DWORD primIndex;
2644 while (_BitScanForward(&primIndex, primMask))
2645 {
2646 uint32_t linkageCount = state.backendState.numAttributes;
2647 uint32_t numScalarAttribs = linkageCount * 4;
2648
2649 BE_WORK work;
2650 work.type = DRAW;
2651
2652 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2653
2654 desc.triFlags.frontFacing = 1;
2655 desc.triFlags.primID = pPrimID[primIndex];
2656 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2657 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2658 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2659
2660 work.pfnWork = RasterizeLine;
2661
2662 auto pArena = pDC->pArena;
2663 SWR_ASSERT(pArena != nullptr);
2664
2665 // store active attribs
2666 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2667 desc.numAttribs = linkageCount;
2668 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2669
2670 // store line vertex data
2671 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2672
2673 {
2674 const uint32_t i = primIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
2675 const uint32_t j = primIndex & 7; // triIndex % KNOB_SIMD_WIDTH
2676
2677 _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
2678 _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
2679 _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
2680 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
2681 }
2682
2683 // store user clip distances
2684 if (rastState.clipDistanceMask)
2685 {
2686 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2687 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2688 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
2689 }
2690
2691 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2692 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2693 {
2694 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2695 {
2696 #if KNOB_ENABLE_TOSS_POINTS
2697 if (!KNOB_TOSS_SETUP_TRIS)
2698 #endif
2699 {
2700 pTileMgr->enqueue(x, y, &work);
2701 }
2702 }
2703 }
2704
2705 primMask &= ~(1 << primIndex);
2706 }
2707
2708 endBinLines:
2709
2710 AR_END(FEBinLines, 1);
2711 }
2712
2713 #endif
2714 //////////////////////////////////////////////////////////////////////////
2715 /// @brief Bin SIMD lines to the backend.
2716 /// @param pDC - pointer to draw context.
2717 /// @param pa - The primitive assembly object.
2718 /// @param workerId - thread's worker id. Even thread has a unique id.
2719 /// @param tri - Contains line position data for SIMDs worth of points.
2720 /// @param primID - Primitive ID for each line.
2721 /// @param viewportIdx - Viewport Array Index for each line.
2722 void BinLines(
2723 DRAW_CONTEXT *pDC,
2724 PA_STATE& pa,
2725 uint32_t workerId,
2726 simdvector prim[],
2727 uint32_t primMask,
2728 simdscalari primID,
2729 simdscalari viewportIdx)
2730 {
2731 const API_STATE& state = GetApiState(pDC);
2732 const SWR_RASTSTATE& rastState = state.rastState;
2733 const SWR_FRONTEND_STATE& feState = state.frontendState;
2734
2735 simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
2736
2737 if (!feState.vpTransformDisable)
2738 {
2739 // perspective divide
2740 vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2741 vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2742
2743 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]);
2744 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]);
2745
2746 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]);
2747 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]);
2748
2749 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]);
2750 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
2751
2752 // viewport transform to screen coords
2753 if (state.gsState.emitsViewportArrayIndex)
2754 {
2755 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2756 }
2757 else
2758 {
2759 viewportTransform<2>(prim, state.vpMatrices);
2760 }
2761 }
2762
2763 // adjust for pixel center location
2764 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2765 prim[0].x = _simd_add_ps(prim[0].x, offset);
2766 prim[0].y = _simd_add_ps(prim[0].y, offset);
2767
2768 prim[1].x = _simd_add_ps(prim[1].x, offset);
2769 prim[1].y = _simd_add_ps(prim[1].y, offset);
2770
2771 BinPostSetupLines(
2772 pDC,
2773 pa,
2774 workerId,
2775 prim,
2776 vRecipW,
2777 primMask,
2778 primID,
2779 viewportIdx);
2780 }
2781
2782 #if USE_SIMD16_FRONTEND
2783 void SIMDAPI BinLines_simd16(
2784 DRAW_CONTEXT *pDC,
2785 PA_STATE& pa,
2786 uint32_t workerId,
2787 simd16vector prim[3],
2788 uint32_t primMask,
2789 simd16scalari primID,
2790 simd16scalari viewportIdx)
2791 {
2792 const API_STATE& state = GetApiState(pDC);
2793 const SWR_RASTSTATE& rastState = state.rastState;
2794 const SWR_FRONTEND_STATE& feState = state.frontendState;
2795
2796 simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) };
2797
2798 if (!feState.vpTransformDisable)
2799 {
2800 // perspective divide
2801 vRecipW[0] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[0].w);
2802 vRecipW[1] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[1].w);
2803
2804 prim[0].v[0] = _simd16_mul_ps(prim[0].v[0], vRecipW[0]);
2805 prim[1].v[0] = _simd16_mul_ps(prim[1].v[0], vRecipW[1]);
2806
2807 prim[0].v[1] = _simd16_mul_ps(prim[0].v[1], vRecipW[0]);
2808 prim[1].v[1] = _simd16_mul_ps(prim[1].v[1], vRecipW[1]);
2809
2810 prim[0].v[2] = _simd16_mul_ps(prim[0].v[2], vRecipW[0]);
2811 prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]);
2812
2813 // viewport transform to screen coords
2814 if (state.gsState.emitsViewportArrayIndex)
2815 {
2816 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2817 }
2818 else
2819 {
2820 viewportTransform<2>(prim, state.vpMatrices);
2821 }
2822 }
2823
2824 // adjust for pixel center location
2825 simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
2826
2827 prim[0].x = _simd16_add_ps(prim[0].x, offset);
2828 prim[0].y = _simd16_add_ps(prim[0].y, offset);
2829
2830 prim[1].x = _simd16_add_ps(prim[1].x, offset);
2831 prim[1].y = _simd16_add_ps(prim[1].y, offset);
2832
2833 BinPostSetupLines_simd16(
2834 pDC,
2835 pa,
2836 workerId,
2837 prim,
2838 vRecipW,
2839 primMask,
2840 primID,
2841 viewportIdx);
2842 }
2843
2844 #endif