1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
25 * @brief Definitions for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
31 #include <type_traits>
#if ENABLE_AVX512_SIMD16
// TODO: this belongs in state.h alongside the simdvector definition, but there is a llvm codegen issue
// NOTE(review): the member below is invalid outside a struct; restored the
// enclosing simd16vertex definition (a SIMD16-wide vertex: one simd16vector
// per vertex attribute slot).
struct simd16vertex
{
    simd16vector attrib[SWR_VTX_NUM_SLOTS];
};
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Calculates the A and B coefficients for the 3 edges of the triangle.
///
/// Maths for edge equations: standard form of a line in 2d is
///     Ax + By + C = 0
/// For the edge between two verts, A is the y-delta and B is the negated
/// x-delta (sign folded into the operand order below).
/// @param vX - x coords of the 3 verts in lanes 0..2 (lane 3 is don't-care)
/// @param vY - y coords of the 3 verts in lanes 0..2
/// @param vA - out: (y0-y1, y1-y2, y2-y0, 0)
/// @param vB - out: (x1-x0, x2-x1, x0-x2, 0)
inline
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
{
    // rotate y by one vertex: vYrot = (y1, y2, y0, y3)
    const __m128 vYrot = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
    // A = y - yrot = (y0-y1, y1-y2, y2-y0, 0)
    vA = _mm_sub_ps(vY, vYrot);

    // rotate x by one vertex: vXrot = (x1, x2, x0, x3)
    const __m128 vXrot = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
    // B = xrot - x = (x1-x0, x2-x1, x0-x2, 0)
    vB = _mm_sub_ps(vXrot, vX);
}
74 void triangleSetupABVertical(const simdscalar vX
[3], const simdscalar vY
[3], simdscalar (&vA
)[3], simdscalar (&vB
)[3])
76 // generate edge equations
79 vA
[0] = _simd_sub_ps(vY
[0], vY
[1]);
80 vA
[1] = _simd_sub_ps(vY
[1], vY
[2]);
81 vA
[2] = _simd_sub_ps(vY
[2], vY
[0]);
83 vB
[0] = _simd_sub_ps(vX
[1], vX
[0]);
84 vB
[1] = _simd_sub_ps(vX
[2], vX
[1]);
85 vB
[2] = _simd_sub_ps(vX
[0], vX
[2]);
//////////////////////////////////////////////////////////////////////////
/// @brief Integer version of triangleSetupAB: A and B edge coefficients
///        from fixed-point vertex positions.
/// @param vX - x coords of the 3 verts in lanes 0..2 (lane 3 is don't-care)
/// @param vY - y coords of the 3 verts in lanes 0..2
/// @param vA - out: (y0-y1, y1-y2, y2-y0, 0)
/// @param vB - out: (x1-x0, x2-x1, x0-x2, 0)
inline
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
{
    // generate edge equations
    // rotate y by one vertex and subtract: A = y - yrot
    const __m128i vYrot = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
    vA = _mm_sub_epi32(vY, vYrot);

    // rotate x by one vertex and subtract: B = xrot - x
    const __m128i vXrot = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
    vB = _mm_sub_epi32(vXrot, vX);
}
103 void triangleSetupABIntVertical(const simdscalari vX
[3], const simdscalari vY
[3], simdscalari (&vA
)[3], simdscalari (&vB
)[3])
107 vA
[0] = _simd_sub_epi32(vY
[0], vY
[1]);
108 vA
[1] = _simd_sub_epi32(vY
[1], vY
[2]);
109 vA
[2] = _simd_sub_epi32(vY
[2], vY
[0]);
111 vB
[0] = _simd_sub_epi32(vX
[1], vX
[0]);
112 vB
[1] = _simd_sub_epi32(vX
[2], vX
[1]);
113 vB
[2] = _simd_sub_epi32(vX
[0], vX
[2]);
#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 overload of triangleSetupABIntVertical: edge coefficients
///        for 16 triangles at once.
/// @param vX - x coords, one simd16 vector per vertex (vX[0..2])
/// @param vY - y coords, one simd16 vector per vertex (vY[0..2])
/// @param vA - out: A coefficients of the 3 edges
/// @param vB - out: B coefficients of the 3 edges
inline
void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari (&vA)[3], simd16scalari (&vB)[3])
{
    // A = y deltas around the triangle
    vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
    vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
    vA[2] = _simd16_sub_epi32(vY[2], vY[0]);

    // B = negated x deltas (sign folded into operand order)
    vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
    vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
    vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}
#endif
132 // Calculate the determinant of the triangle
133 // 2 vectors between the 3 points: P, Q
134 // Px = x0-x2, Py = y0-y2
135 // Qx = x1-x2, Qy = y1-y2
137 // det = | | = PxQy - PyQx
139 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
140 // try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
141 // : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
142 // : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
143 // : B[2]*A[1] - A[2]*B[1]
145 float calcDeterminantInt(const __m128i vA
, const __m128i vB
)
147 // vAShuf = [A1, A0, A2, A0]
148 __m128i vAShuf
= _mm_shuffle_epi32(vA
, _MM_SHUFFLE(0, 2, 0, 1));
149 // vBShuf = [B2, B0, B1, B0]
150 __m128i vBShuf
= _mm_shuffle_epi32(vB
, _MM_SHUFFLE(0, 1, 0, 2));
151 // vMul = [A1*B2, B1*A2]
152 __m128i vMul
= _mm_mul_epi32(vAShuf
, vBShuf
);
154 // shuffle upper to lower
155 // vMul2 = [B1*A2, B1*A2]
156 __m128i vMul2
= _mm_shuffle_epi32(vMul
, _MM_SHUFFLE(3, 2, 3, 2));
157 //vMul = [A1*B2 - B1*A2]
158 vMul
= _mm_sub_epi64(vMul
, vMul2
);
161 _mm_store_sd((double*)&result
, _mm_castsi128_pd(vMul
));
163 double dResult
= (double)result
;
164 dResult
= dResult
* (1.0 / FIXED_POINT16_SCALE
);
166 return (float)dResult
;
170 void calcDeterminantIntVertical(const simdscalari vA
[3], const simdscalari vB
[3], simdscalari
*pvDet
)
172 // refer to calcDeterminantInt comment for calculation explanation
174 simdscalari vA1Lo
= _simd_unpacklo_epi32(vA
[1], vA
[1]); // 0 0 1 1 4 4 5 5
175 simdscalari vA1Hi
= _simd_unpackhi_epi32(vA
[1], vA
[1]); // 2 2 3 3 6 6 7 7
177 simdscalari vB2Lo
= _simd_unpacklo_epi32(vB
[2], vB
[2]);
178 simdscalari vB2Hi
= _simd_unpackhi_epi32(vB
[2], vB
[2]);
180 simdscalari vA1B2Lo
= _simd_mul_epi32(vA1Lo
, vB2Lo
); // 0 1 4 5
181 simdscalari vA1B2Hi
= _simd_mul_epi32(vA1Hi
, vB2Hi
); // 2 3 6 7
184 simdscalari vA2Lo
= _simd_unpacklo_epi32(vA
[2], vA
[2]);
185 simdscalari vA2Hi
= _simd_unpackhi_epi32(vA
[2], vA
[2]);
187 simdscalari vB1Lo
= _simd_unpacklo_epi32(vB
[1], vB
[1]);
188 simdscalari vB1Hi
= _simd_unpackhi_epi32(vB
[1], vB
[1]);
190 simdscalari vA2B1Lo
= _simd_mul_epi32(vA2Lo
, vB1Lo
);
191 simdscalari vA2B1Hi
= _simd_mul_epi32(vA2Hi
, vB1Hi
);
194 simdscalari detLo
= _simd_sub_epi64(vA1B2Lo
, vA2B1Lo
);
195 simdscalari detHi
= _simd_sub_epi64(vA1B2Hi
, vA2B1Hi
);
197 // shuffle 0 1 4 5 -> 0 1 2 3
198 simdscalari vResultLo
= _simd_permute2f128_si(detLo
, detHi
, 0x20);
199 simdscalari vResultHi
= _simd_permute2f128_si(detLo
, detHi
, 0x31);
201 pvDet
[0] = vResultLo
;
202 pvDet
[1] = vResultHi
;
#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 SoA determinant: computes A[1]*B[2] - A[2]*B[1] as 64-bit
///        integers for 16 triangles.
/// @param vA - A edge coefficients, one simd16 vector per edge
/// @param vB - B edge coefficients, one simd16 vector per edge
/// @param pvDet - out: two simd16 vectors holding the 64-bit determinants
inline
void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet)
{
    // refer to calcDeterminantInt comment for calculation explanation

    // BUGFIX(review): the SIMD8 fallback and the "native" SIMD16 path both
    // appeared unconditionally, so the disabled native path would have
    // recomputed and overwritten pvDet; restored the #if 1/#else selection
    // implied by the TODO below.
#if 1
    // TODO: get the native SIMD16 version working..

    // split into two SIMD8 halves and reuse the simdscalari implementation
    simdscalari vA_lo[3];
    simdscalari vA_hi[3];
    simdscalari vB_lo[3];
    simdscalari vB_hi[3];

    for (uint32_t i = 0; i < 3; i += 1)
    {
        vA_lo[i] = _simd16_extract_si(vA[i], 0);
        vA_hi[i] = _simd16_extract_si(vA[i], 1);
        vB_lo[i] = _simd16_extract_si(vB[i], 0);
        vB_hi[i] = _simd16_extract_si(vB[i], 1);
    }

    // each SIMD8 call fills the two simdscalari halves of one simd16scalari
    calcDeterminantIntVertical(vA_lo, vB_lo, reinterpret_cast<simdscalari *>(&pvDet[0]));
    calcDeterminantIntVertical(vA_hi, vB_hi, reinterpret_cast<simdscalari *>(&pvDet[1]));
#else
    simd16scalari vA1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 8 8 9 9 C C D D
    simd16scalari vA1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 A A B B E E F F

    simd16scalari vB2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    simd16scalari vB2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]);

    simd16scalari vA1B2Lo = _simd16_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 8 9 C D
    simd16scalari vA1B2Hi = _simd16_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 A B E F

    simd16scalari vA2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    simd16scalari vA2Hi = _simd16_unpackhi_epi32(vA[2], vA[2]);

    simd16scalari vB1Lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    simd16scalari vB1Hi = _simd16_unpackhi_epi32(vB[1], vB[1]);

    simd16scalari vA2B1Lo = _simd16_mul_epi32(vA2Lo, vB1Lo);
    simd16scalari vA2B1Hi = _simd16_mul_epi32(vA2Hi, vB1Hi);

    simd16scalari detLo = _simd16_sub_epi64(vA1B2Lo, vA2B1Lo);
    simd16scalari detHi = _simd16_sub_epi64(vA1B2Hi, vA2B1Hi);

    // shuffle 0 1 4 5 -> 0 1 2 3
    simd16scalari vResultLo = _simd16_permute2f128_si(detLo, detHi, 0x20);
    simd16scalari vResultHi = _simd16_permute2f128_si(detLo, detHi, 0x31);

    pvDet[0] = vResultLo;
    pvDet[1] = vResultHi;
#endif
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the C coefficient of the edge equations so that
///        A*x + B*y + C = 0 at the triangle's verts: C = -A*x - B*y.
/// @param vX - vertex x coords
/// @param vY - vertex y coords
/// @param vA - A edge coefficients
/// @param vB - B edge coefficients
/// @param vC - out: C edge coefficients
inline
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
{
    // vC = -(A*x) - B*y
    vC = _mm_mul_ps(vA, vX);
    const __m128 vBy = _mm_mul_ps(vB, vY);
    vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
    vC = _mm_sub_ps(vC, vBy);
}
275 void viewportTransform(__m128
&vX
, __m128
&vY
, __m128
&vZ
, const SWR_VIEWPORT_MATRIX
&vpMatrix
)
277 vX
= _mm_mul_ps(vX
, _mm_set1_ps(vpMatrix
.m00
));
278 vX
= _mm_add_ps(vX
, _mm_set1_ps(vpMatrix
.m30
));
280 vY
= _mm_mul_ps(vY
, _mm_set1_ps(vpMatrix
.m11
));
281 vY
= _mm_add_ps(vY
, _mm_set1_ps(vpMatrix
.m31
));
283 vZ
= _mm_mul_ps(vZ
, _mm_set1_ps(vpMatrix
.m22
));
284 vZ
= _mm_add_ps(vZ
, _mm_set1_ps(vpMatrix
.m32
));
287 template<uint32_t NumVerts
>
289 void viewportTransform(simdvector
*v
, const SWR_VIEWPORT_MATRICES
& vpMatrices
)
291 simdscalar m00
= _simd_load1_ps(&vpMatrices
.m00
[0]);
292 simdscalar m30
= _simd_load1_ps(&vpMatrices
.m30
[0]);
293 simdscalar m11
= _simd_load1_ps(&vpMatrices
.m11
[0]);
294 simdscalar m31
= _simd_load1_ps(&vpMatrices
.m31
[0]);
295 simdscalar m22
= _simd_load1_ps(&vpMatrices
.m22
[0]);
296 simdscalar m32
= _simd_load1_ps(&vpMatrices
.m32
[0]);
298 for (uint32_t i
= 0; i
< NumVerts
; ++i
)
300 v
[i
].x
= _simd_fmadd_ps(v
[i
].x
, m00
, m30
);
301 v
[i
].y
= _simd_fmadd_ps(v
[i
].y
, m11
, m31
);
302 v
[i
].z
= _simd_fmadd_ps(v
[i
].z
, m22
, m32
);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 overload: applies viewport 0's transform to the x/y/z
///        components of NumVerts simd16 vertex positions in place.
/// @tparam NumVerts - number of simd16vector positions in v
/// @param v - in/out vertex positions
/// @param vpMatrices - viewport matrices (SoA); only index 0 is read
template<uint32_t NumVerts>
inline
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
    // broadcast the first viewport's scale/translate terms once
    const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        // component' = component * scale + translate
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}
#endif
327 template<uint32_t NumVerts
>
329 void viewportTransform(simdvector
*v
, const SWR_VIEWPORT_MATRICES
& vpMatrices
, simdscalari vViewportIdx
)
331 // perform a gather of each matrix element based on the viewport array indexes
332 simdscalar m00
= _simd_i32gather_ps(&vpMatrices
.m00
[0], vViewportIdx
, 4);
333 simdscalar m30
= _simd_i32gather_ps(&vpMatrices
.m30
[0], vViewportIdx
, 4);
334 simdscalar m11
= _simd_i32gather_ps(&vpMatrices
.m11
[0], vViewportIdx
, 4);
335 simdscalar m31
= _simd_i32gather_ps(&vpMatrices
.m31
[0], vViewportIdx
, 4);
336 simdscalar m22
= _simd_i32gather_ps(&vpMatrices
.m22
[0], vViewportIdx
, 4);
337 simdscalar m32
= _simd_i32gather_ps(&vpMatrices
.m32
[0], vViewportIdx
, 4);
339 for (uint32_t i
= 0; i
< NumVerts
; ++i
)
341 v
[i
].x
= _simd_fmadd_ps(v
[i
].x
, m00
, m30
);
342 v
[i
].y
= _simd_fmadd_ps(v
[i
].y
, m11
, m31
);
343 v
[i
].z
= _simd_fmadd_ps(v
[i
].z
, m22
, m32
);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 overload of the per-lane viewport transform: each lane
///        selects its own viewport matrix via vViewportIdx.
/// @tparam NumVerts - number of simd16vector positions in v
/// @param v - in/out vertex positions
/// @param vpMatrices - viewport matrices (SoA)
/// @param vViewportIdx - per-lane viewport array index
template<uint32_t NumVerts>
inline
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES& vpMatrices, simd16scalari vViewportIdx)
{
    // perform a gather of each matrix element based on the viewport array indexes
    const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        // component' = component * scale + translate
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}
#endif
370 void calcBoundingBoxInt(const __m128i
&vX
, const __m128i
&vY
, SWR_RECT
&bbox
)
372 // Need horizontal fp min here
373 __m128i vX1
= _mm_shuffle_epi32(vX
, _MM_SHUFFLE(3, 2, 0, 1));
374 __m128i vX2
= _mm_shuffle_epi32(vX
, _MM_SHUFFLE(3, 0, 1, 2));
376 __m128i vY1
= _mm_shuffle_epi32(vY
, _MM_SHUFFLE(3, 2, 0, 1));
377 __m128i vY2
= _mm_shuffle_epi32(vY
, _MM_SHUFFLE(3, 0, 1, 2));
380 __m128i vMinX
= _mm_min_epi32(vX
, vX1
);
381 vMinX
= _mm_min_epi32(vMinX
, vX2
);
383 __m128i vMaxX
= _mm_max_epi32(vX
, vX1
);
384 vMaxX
= _mm_max_epi32(vMaxX
, vX2
);
386 __m128i vMinY
= _mm_min_epi32(vY
, vY1
);
387 vMinY
= _mm_min_epi32(vMinY
, vY2
);
389 __m128i vMaxY
= _mm_max_epi32(vY
, vY1
);
390 vMaxY
= _mm_max_epi32(vMaxY
, vY2
);
392 bbox
.xmin
= _mm_extract_epi32(vMinX
, 0);
393 bbox
.xmax
= _mm_extract_epi32(vMaxX
, 0);
394 bbox
.ymin
= _mm_extract_epi32(vMinY
, 0);
395 bbox
.ymax
= _mm_extract_epi32(vMaxY
, 0);
399 bool CanUseSimplePoints(DRAW_CONTEXT
*pDC
)
401 const API_STATE
& state
= GetApiState(pDC
);
403 return (state
.rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&&
404 state
.rastState
.pointSize
== 1.0f
&&
405 !state
.rastState
.pointParam
&&
406 !state
.rastState
.pointSpriteEnable
);
//////////////////////////////////////////////////////////////////////////
/// @brief Returns true if any lane of the vector is NaN.
/// @param vec - 4 floats to test
inline
bool vHasNaN(const __m128& vec)
{
    // a NaN compares unordered with itself, so cmpunord(vec, vec) sets a
    // lane's mask exactly when that lane is NaN
    const __m128 result = _mm_cmpunord_ps(vec, vec);
    const int32_t mask = _mm_movemask_ps(result);
    // BUGFIX(review): the function computed mask but had no return statement;
    // restored the result.
    return (mask != 0);
}
// Number of primitives produced by numElements vertices/indices of the given
// topology (implementation elsewhere in the frontend).
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);

// Number of vertices making up one primitive of the given topology;
// includeAdjVerts selects whether adjacency vertices are counted.
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
421 // ProcessDraw front-end function. All combinations of parameter values are available
422 PFN_FE_WORK_FUNC
GetProcessDrawFunc(
424 bool IsCutIndexEnabled
,
425 bool HasTessellation
,
426 bool HasGeometryShader
,
428 bool HasRasterization
);
// Front-end work functions for non-draw operations. Each receives the global
// context, the draw context for the operation, the id of the worker executing
// it, and an operation-specific pUserData payload.
void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
436 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
);
437 #if USE_SIMD16_FRONTEND
438 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
);
struct PA_STATE_BASE;  // forward decl

// Point/line binning entry points. prims[3] carries SoA vertex data for up to
// one SIMD-width of primitives; primMask selects the live lanes, and
// primID/viewportIdx supply per-lane primitive id and viewport array index.
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
#if USE_SIMD16_FRONTEND
// SIMD16-wide variants of the point/line binners; same contract as the
// simdvector versions above but operating on 16 lanes per call.
void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);