1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
31 #include <type_traits>
/// @brief Computes the A and B edge-equation coefficients for the three edges
///        of a triangle (floating point).
///
/// Each edge lies on a 2D line in standard form, Ax + By + C = 0. For the edge
/// running from vertex i to the next vertex (wrapping 2 -> 0):
///     A = y(i)    - y(next)
///     B = x(next) - x(i)
///
/// @param vX - x coordinates; lanes 0..2 hold x0, x1, x2, lane 3 is "don't care".
/// @param vY - y coordinates; lanes 0..2 hold y0, y1, y2, lane 3 is "don't care".
/// @param vA - receives (y0 - y1, y1 - y2, y2 - y0, dc).
/// @param vB - receives (x1 - x0, x2 - x1, x0 - x2, dc).
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
{
    // Rotate the first three lanes so lane i holds the next vertex:
    // (y0, y1, y2, dc) -> (y1, y2, y0, dc), and likewise for x.
    const __m128 vYNext = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
    const __m128 vXNext = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));

    // A = current y - next y
    vA = _mm_sub_ps(vY, vYNext);

    // B = next x - current x
    vB = _mm_sub_ps(vXNext, vX);
}
/// @brief Integer variant of triangleSetupAB: computes the A and B edge-equation
///        coefficients of a triangle from 32-bit integer coordinates.
///
/// For the edge running from vertex i to the next vertex (wrapping 2 -> 0):
///     A = y(i)    - y(next)
///     B = x(next) - x(i)
///
/// @param vX - x coordinates; lanes 0..2 hold x0, x1, x2.
/// @param vY - y coordinates; lanes 0..2 hold y0, y1, y2.
/// @param vA - receives (y0 - y1, y1 - y2, y2 - y0, 0).
/// @param vB - receives (x1 - x0, x2 - x1, x0 - x2, 0).
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
{
    // Rotate the first three lanes so lane i holds the next vertex:
    // (v0, v1, v2, v3) -> (v1, v2, v0, v3).
    const __m128i vYNext = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
    const __m128i vXNext = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));

    // A = current y - next y
    vA = _mm_sub_epi32(vY, vYNext);

    // B = next x - current x
    vB = _mm_sub_epi32(vXNext, vX);
}
80 void triangleSetupABIntVertical(const simdscalari vX
[3], const simdscalari vY
[3], simdscalari (&vA
)[3], simdscalari (&vB
)[3])
84 vA
[0] = _simd_sub_epi32(vY
[0], vY
[1]);
85 vA
[1] = _simd_sub_epi32(vY
[1], vY
[2]);
86 vA
[2] = _simd_sub_epi32(vY
[2], vY
[0]);
88 vB
[0] = _simd_sub_epi32(vX
[1], vX
[0]);
89 vB
[1] = _simd_sub_epi32(vX
[2], vX
[1]);
90 vB
[2] = _simd_sub_epi32(vX
[0], vX
[2]);
#if ENABLE_AVX512_SIMD16
/// @brief 16-wide variant of triangleSetupABIntVertical: computes the A and B
///        edge-equation coefficients with one triangle per SIMD lane.
///
/// For each edge i (next = i + 1, wrapping 2 -> 0):
///     vA[i] = vY[i]    - vY[next]
///     vB[i] = vX[next] - vX[i]
///
/// @param vX - x coordinates of vertices 0, 1 and 2.
/// @param vY - y coordinates of vertices 0, 1 and 2.
/// @param vA - receives the three A coefficients.
/// @param vB - receives the three B coefficients.
void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari (&vA)[3], simd16scalari (&vB)[3])
{
    // A = y0 - y1, y1 - y2, y2 - y0
    vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
    vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
    vA[2] = _simd16_sub_epi32(vY[2], vY[0]);

    // B = x1 - x0, x2 - x1, x0 - x2
    vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
    vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
    vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}
#endif // ENABLE_AVX512_SIMD16
109 // Calculate the determinant of the triangle
110 // 2 vectors between the 3 points: P, Q
111 // Px = x0-x2, Py = y0-y2
112 // Qx = x1-x2, Qy = y1-y2
114 // det = | | = PxQy - PyQx
116 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
117 // try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
118 // : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
119 // : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
120 // : B[2]*A[1] - A[2]*B[1]
122 float calcDeterminantInt(const __m128i vA
, const __m128i vB
)
124 // vAShuf = [A1, A0, A2, A0]
125 __m128i vAShuf
= _mm_shuffle_epi32(vA
, _MM_SHUFFLE(0, 2, 0, 1));
126 // vBShuf = [B2, B0, B1, B0]
127 __m128i vBShuf
= _mm_shuffle_epi32(vB
, _MM_SHUFFLE(0, 1, 0, 2));
128 // vMul = [A1*B2, B1*A2]
129 __m128i vMul
= _mm_mul_epi32(vAShuf
, vBShuf
);
131 // shuffle upper to lower
132 // vMul2 = [B1*A2, B1*A2]
133 __m128i vMul2
= _mm_shuffle_epi32(vMul
, _MM_SHUFFLE(3, 2, 3, 2));
134 //vMul = [A1*B2 - B1*A2]
135 vMul
= _mm_sub_epi64(vMul
, vMul2
);
138 _mm_store_sd((double*)&result
, _mm_castsi128_pd(vMul
));
140 double dResult
= (double)result
;
141 dResult
= dResult
* (1.0 / FIXED_POINT16_SCALE
);
143 return (float)dResult
;
147 void calcDeterminantIntVertical(const simdscalari vA
[3], const simdscalari vB
[3], simdscalari
*pvDet
)
149 // refer to calcDeterminantInt comment for calculation explanation
152 simdscalari vA1Lo
= _simd_unpacklo_epi32(vA
[1], vA
[1]); // 0 0 1 1 4 4 5 5
153 simdscalari vA1Hi
= _simd_unpackhi_epi32(vA
[1], vA
[1]); // 2 2 3 3 6 6 7 7
155 simdscalari vB2Lo
= _simd_unpacklo_epi32(vB
[2], vB
[2]);
156 simdscalari vB2Hi
= _simd_unpackhi_epi32(vB
[2], vB
[2]);
158 simdscalari vA1B2Lo
= _simd_mul_epi32(vA1Lo
, vB2Lo
); // 0 1 4 5
159 simdscalari vA1B2Hi
= _simd_mul_epi32(vA1Hi
, vB2Hi
); // 2 3 6 7
162 simdscalari vA2Lo
= _simd_unpacklo_epi32(vA
[2], vA
[2]);
163 simdscalari vA2Hi
= _simd_unpackhi_epi32(vA
[2], vA
[2]);
165 simdscalari vB1Lo
= _simd_unpacklo_epi32(vB
[1], vB
[1]);
166 simdscalari vB1Hi
= _simd_unpackhi_epi32(vB
[1], vB
[1]);
168 simdscalari vA2B1Lo
= _simd_mul_epi32(vA2Lo
, vB1Lo
);
169 simdscalari vA2B1Hi
= _simd_mul_epi32(vA2Hi
, vB1Hi
);
172 simdscalari detLo
= _simd_sub_epi64(vA1B2Lo
, vA2B1Lo
);
173 simdscalari detHi
= _simd_sub_epi64(vA1B2Hi
, vA2B1Hi
);
175 // shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
176 simdscalari vResultLo
= _simd_permute2f128_si(detLo
, detHi
, 0x20);
178 // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
179 simdscalari vResultHi
= _simd_permute2f128_si(detLo
, detHi
, 0x31);
181 pvDet
[0] = vResultLo
;
182 pvDet
[1] = vResultHi
;
#if ENABLE_AVX512_SIMD16
/// @brief 16-wide variant of calcDeterminantIntVertical: calculates triangle
///        determinants with one triangle per SIMD lane.
///
/// Computes A[1]*B[2] - A[2]*B[1] per lane with 64-bit products; refer to the
/// calcDeterminantInt comment for the calculation explanation.
///
/// @param vA - A edge coefficients (only vA[1] and vA[2] are read).
/// @param vB - B edge coefficients (only vB[1] and vB[2] are read).
/// @param pvDet - output; pvDet[0] receives the 64-bit determinants of
///                triangles 0..7 and pvDet[1] those of triangles 8..F.
void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari* pvDet)
{
    // A1*B2
    simd16scalari vA1_lo = _simd16_unpacklo_epi32(vA[1], vA[1]);    // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
    simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]);    // X 2 X 3 X 6 X 7 X A X B X E X F

    simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);

    simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo);     // 0 1 4 5 8 9 C D (64b)
    simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi);     // 2 3 6 7 A B E F

    // A2*B1
    simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]);

    simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]);

    simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo);
    simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);

    // A1*B2 - A2*B1
    simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo);   // 0 1 4 5 8 9 C D (64b)
    simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi);   // 2 3 6 7 A B E F

    // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
    simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44);   // 0 1 4 5 2 3 6 7 (64b)
    simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE);   // 8 9 C D A B E F

    // (3, 1, 2, 0) = 11 01 10 00 = 0xD8
    pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8);       // 0 1 2 3 4 5 6 7 (64b)
    pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8);       // 8 9 A B C D E F
}
#endif // ENABLE_AVX512_SIMD16
/// @brief Computes the C edge-equation coefficient, C = -A*x - B*y, for each lane.
///
/// @param vX - x coordinates.
/// @param vY - y coordinates.
/// @param vA - A edge coefficients.
/// @param vB - B edge coefficients.
/// @param vC - receives (A*x)*(-1) - (B*y).
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC)
{
    // vC is written before vB is read, exactly as in the original statement
    // order; vB and vC are both references.
    vC = _mm_mul_ps(vA, vX);

    const __m128 vBTimesY = _mm_mul_ps(vB, vY);

    // -(A*x) - (B*y)
    vC = _mm_sub_ps(_mm_mul_ps(vC, _mm_set1_ps(-1.0f)), vBTimesY);
}
235 template<uint32_t NumVerts
>
237 void viewportTransform(simdvector
*v
, const SWR_VIEWPORT_MATRICES
& vpMatrices
)
239 simdscalar m00
= _simd_load1_ps(&vpMatrices
.m00
[0]);
240 simdscalar m30
= _simd_load1_ps(&vpMatrices
.m30
[0]);
241 simdscalar m11
= _simd_load1_ps(&vpMatrices
.m11
[0]);
242 simdscalar m31
= _simd_load1_ps(&vpMatrices
.m31
[0]);
243 simdscalar m22
= _simd_load1_ps(&vpMatrices
.m22
[0]);
244 simdscalar m32
= _simd_load1_ps(&vpMatrices
.m32
[0]);
246 for (uint32_t i
= 0; i
< NumVerts
; ++i
)
248 v
[i
].x
= _simd_fmadd_ps(v
[i
].x
, m00
, m30
);
249 v
[i
].y
= _simd_fmadd_ps(v
[i
].y
, m11
, m31
);
250 v
[i
].z
= _simd_fmadd_ps(v
[i
].z
, m22
, m32
);
#if USE_SIMD16_FRONTEND
/// @brief 16-wide variant: applies a per-component multiply-add to the x, y and
///        z components of NumVerts SIMD vertices, using element 0 of each
///        viewport matrix array.
///
/// x is combined with (m00, m30), y with (m11, m31) and z with (m22, m32) via
/// _simd16_fmadd_ps (presumably component * mNN + m3N; confirm against the
/// _simd16_fmadd_ps definition).
///
/// @tparam NumVerts - number of entries of v to transform.
/// @param v - vertices, transformed in place.
/// @param vpMatrices - viewport matrix terms; only index 0 of each array is read.
template<uint32_t NumVerts>
void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
    // Broadcast the index-0 matrix terms across all lanes.
    const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}
#endif // USE_SIMD16_FRONTEND
275 template<uint32_t NumVerts
>
277 void viewportTransform(simdvector
*v
, const SWR_VIEWPORT_MATRICES
& vpMatrices
, simdscalari vViewportIdx
)
279 // perform a gather of each matrix element based on the viewport array indexes
280 simdscalar m00
= _simd_i32gather_ps(&vpMatrices
.m00
[0], vViewportIdx
, 4);
281 simdscalar m30
= _simd_i32gather_ps(&vpMatrices
.m30
[0], vViewportIdx
, 4);
282 simdscalar m11
= _simd_i32gather_ps(&vpMatrices
.m11
[0], vViewportIdx
, 4);
283 simdscalar m31
= _simd_i32gather_ps(&vpMatrices
.m31
[0], vViewportIdx
, 4);
284 simdscalar m22
= _simd_i32gather_ps(&vpMatrices
.m22
[0], vViewportIdx
, 4);
285 simdscalar m32
= _simd_i32gather_ps(&vpMatrices
.m32
[0], vViewportIdx
, 4);
287 for (uint32_t i
= 0; i
< NumVerts
; ++i
)
289 v
[i
].x
= _simd_fmadd_ps(v
[i
].x
, m00
, m30
);
290 v
[i
].y
= _simd_fmadd_ps(v
[i
].y
, m11
, m31
);
291 v
[i
].z
= _simd_fmadd_ps(v
[i
].z
, m22
, m32
);
#if USE_SIMD16_FRONTEND
/// @brief 16-wide variant: applies a per-component multiply-add to the x, y and
///        z components of NumVerts SIMD vertices, with matrix terms selected
///        per lane.
///
/// Each lane gathers its own m00/m30/m11/m31/m22/m32 values from the viewport
/// matrix arrays using that lane's entry of vViewportIdx. x is combined with
/// (m00, m30), y with (m11, m31) and z with (m22, m32) via _simd16_fmadd_ps
/// (presumably component * mNN + m3N; confirm against its definition).
///
/// @tparam NumVerts - number of entries of v to transform.
/// @param v - vertices, transformed in place.
/// @param vpMatrices - viewport matrix term arrays.
/// @param vViewportIdx - per-lane index into the matrix term arrays.
template<uint32_t NumVerts>
void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices, simd16scalari vViewportIdx)
{
    // perform a gather of each matrix element based on the viewport array indexes
    const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}
#endif // USE_SIMD16_FRONTEND
318 void calcBoundingBoxInt(const __m128i
&vX
, const __m128i
&vY
, SWR_RECT
&bbox
)
320 // Need horizontal fp min here
321 __m128i vX1
= _mm_shuffle_epi32(vX
, _MM_SHUFFLE(3, 2, 0, 1));
322 __m128i vX2
= _mm_shuffle_epi32(vX
, _MM_SHUFFLE(3, 0, 1, 2));
324 __m128i vY1
= _mm_shuffle_epi32(vY
, _MM_SHUFFLE(3, 2, 0, 1));
325 __m128i vY2
= _mm_shuffle_epi32(vY
, _MM_SHUFFLE(3, 0, 1, 2));
328 __m128i vMinX
= _mm_min_epi32(vX
, vX1
);
329 vMinX
= _mm_min_epi32(vMinX
, vX2
);
331 __m128i vMaxX
= _mm_max_epi32(vX
, vX1
);
332 vMaxX
= _mm_max_epi32(vMaxX
, vX2
);
334 __m128i vMinY
= _mm_min_epi32(vY
, vY1
);
335 vMinY
= _mm_min_epi32(vMinY
, vY2
);
337 __m128i vMaxY
= _mm_max_epi32(vY
, vY1
);
338 vMaxY
= _mm_max_epi32(vMaxY
, vY2
);
340 bbox
.xmin
= _mm_extract_epi32(vMinX
, 0);
341 bbox
.xmax
= _mm_extract_epi32(vMaxX
, 0);
342 bbox
.ymin
= _mm_extract_epi32(vMinY
, 0);
343 bbox
.ymax
= _mm_extract_epi32(vMaxY
, 0);
347 bool CanUseSimplePoints(DRAW_CONTEXT
*pDC
)
349 const API_STATE
& state
= GetApiState(pDC
);
351 return (state
.rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&&
352 state
.rastState
.pointSize
== 1.0f
&&
353 !state
.rastState
.pointParam
&&
354 !state
.rastState
.pointSpriteEnable
);
/// @brief Tests whether any of the four float lanes of vec is NaN.
///
/// @param vec - vector to test.
/// @return true if at least one lane is NaN, false otherwise.
bool vHasNaN(const __m128& vec)
{
    // An unordered compare of a value with itself is true only for NaN lanes.
    const __m128 result = _mm_cmpunord_ps(vec, vec);

    // Collect the per-lane sign bits: bit i is set when lane i is NaN.
    const int32_t mask = _mm_movemask_ps(result);

    return (mask != 0);
}
365 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode
, uint32_t numElements
);
366 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
);
369 // ProcessDraw front-end function. All combinations of parameter values are available
370 PFN_FE_WORK_FUNC
GetProcessDrawFunc(
372 bool IsCutIndexEnabled
,
373 bool HasTessellation
,
374 bool HasGeometryShader
,
376 bool HasRasterization
);
378 void ProcessClear(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
379 void ProcessStoreTiles(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
380 void ProcessDiscardInvalidateTiles(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
381 void ProcessSync(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
382 void ProcessShutdown(SWR_CONTEXT
*pContext
, DRAW_CONTEXT
*pDC
, uint32_t workerId
, void *pUserData
);
384 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
);
385 #if USE_SIMD16_FRONTEND
386 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
);
389 struct PA_STATE_BASE
; // forward decl
390 void BinPoints(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[3], uint32_t primMask
, simdscalari primID
, simdscalari viewportIdx
);
391 void BinLines(DRAW_CONTEXT
*pDC
, PA_STATE
& pa
, uint32_t workerId
, simdvector prims
[3], uint32_t primMask
, simdscalari primID
, simdscalari viewportIdx
);
#if USE_SIMD16_FRONTEND
// 16-wide point and line binning entry points. Only the declarations appear
// in this section; confirm parameter semantics against the definitions.
void SIMDAPI BinPoints_simd16(
    DRAW_CONTEXT* pDC,
    PA_STATE&     pa,
    uint32_t      workerId,
    simd16vector  prims[3],
    uint32_t      primMask,
    simd16scalari primID,
    simd16scalari viewportIdx);

void SIMDAPI BinLines_simd16(
    DRAW_CONTEXT* pDC,
    PA_STATE&     pa,
    uint32_t      workerId,
    simd16vector  prims[3],
    uint32_t      primMask,
    simd16scalari primID,
    simd16scalari viewportIdx);
#endif // USE_SIMD16_FRONTEND