1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
31 #include <type_traits>
#if ENABLE_AVX512_SIMD16
// TODO: this belongs in state.h alongside the simdvector definition, but there is a llvm codegen issue
// SOA vertex for the 16-wide front end: one simd16vector per vertex attribute slot.
// NOTE(review): the enclosing struct declaration was lost in this copy of the
// file; restored here so the member does not float at file scope — confirm the
// struct name against the repository history.
struct simd16vertex
{
    simd16vector attrib[KNOB_NUM_ATTRIBUTES];
};
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Calculates the A and B coefficients for the 3 edges of the
///        triangle.
///
/// Maths for edge equations: standard form of a line in 2D is
/// Ax + By + C = 0; for edge i (from vertex i to vertex i+1):
///     A[i] = y[i] - y[(i+1) % 3]
///     B[i] = x[(i+1) % 3] - x[i]
///
/// @param vX - x coordinates of the 3 vertices (lane 3 is don't-care).
/// @param vY - y coordinates of the 3 vertices (lane 3 is don't-care).
/// @param vA - output A coefficients, one lane per edge.
/// @param vB - output B coefficients, one lane per edge.
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
{
    // rotate y left by one vertex: yRot = [y1 y2 y0 dc]
    __m128 yRot = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
    // A[i] = y[i] - y[i+1]
    vA = _mm_sub_ps(vY, yRot);

    // rotate x left by one vertex: xRot = [x1 x2 x0 dc]
    __m128 xRot = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
    // B[i] = x[i+1] - x[i]
    vB = _mm_sub_ps(xRot, vX);
}
74 void triangleSetupABVertical(const simdscalar vX
[3], const simdscalar vY
[3], simdscalar (&vA
)[3], simdscalar (&vB
)[3])
76 // generate edge equations
79 vA
[0] = _simd_sub_ps(vY
[0], vY
[1]);
80 vA
[1] = _simd_sub_ps(vY
[1], vY
[2]);
81 vA
[2] = _simd_sub_ps(vY
[2], vY
[0]);
83 vB
[0] = _simd_sub_ps(vX
[1], vX
[0]);
84 vB
[1] = _simd_sub_ps(vX
[2], vX
[1]);
85 vB
[2] = _simd_sub_ps(vX
[0], vX
[2]);
//////////////////////////////////////////////////////////////////////////
/// @brief Integer (fixed-point) variant of triangleSetupAB.
/// @param vX - x coordinates of the 3 vertices in fixed point.
/// @param vY - y coordinates of the 3 vertices in fixed point.
/// @param vA - output A coefficients, one lane per edge.
/// @param vB - output B coefficients, one lane per edge.
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
{
    // generate edge equations
    // rotate left by one vertex: [y1 y2 y0 dc]
    __m128i yRot = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
    // A[i] = y[i] - y[i+1]
    vA = _mm_sub_epi32(vY, yRot);

    // rotate left by one vertex: [x1 x2 x0 dc]
    __m128i xRot = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
    // B[i] = x[i+1] - x[i]
    vB = _mm_sub_epi32(xRot, vX);
}
103 void triangleSetupABIntVertical(const simdscalari vX
[3], const simdscalari vY
[3], simdscalari (&vA
)[3], simdscalari (&vB
)[3])
107 vA
[0] = _simd_sub_epi32(vY
[0], vY
[1]);
108 vA
[1] = _simd_sub_epi32(vY
[1], vY
[2]);
109 vA
[2] = _simd_sub_epi32(vY
[2], vY
[0]);
111 vB
[0] = _simd_sub_epi32(vX
[1], vX
[0]);
112 vB
[1] = _simd_sub_epi32(vX
[2], vX
[1]);
113 vB
[2] = _simd_sub_epi32(vX
[0], vX
[2]);
#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief 16-wide SOA integer variant: edge equation A/B coefficients in
///        fixed point for 16 triangles at a time.
void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari (&vA)[3], simd16scalari (&vB)[3])
{
    // A[i] = y[i] - y[(i+1) % 3]
    vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
    vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
    vA[2] = _simd16_sub_epi32(vY[2], vY[0]);

    // B[i] = x[(i+1) % 3] - x[i]
    vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
    vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
    vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}
#endif
132 // Calculate the determinant of the triangle
133 // 2 vectors between the 3 points: P, Q
134 // Px = x0-x2, Py = y0-y2
135 // Qx = x1-x2, Qy = y1-y2
137 // det = | | = PxQy - PyQx
139 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
140 // try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
141 // : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
142 // : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
143 // : B[2]*A[1] - A[2]*B[1]
145 float calcDeterminantInt(const __m128i vA
, const __m128i vB
)
147 // vAShuf = [A1, A0, A2, A0]
148 __m128i vAShuf
= _mm_shuffle_epi32(vA
, _MM_SHUFFLE(0, 2, 0, 1));
149 // vBShuf = [B2, B0, B1, B0]
150 __m128i vBShuf
= _mm_shuffle_epi32(vB
, _MM_SHUFFLE(0, 1, 0, 2));
151 // vMul = [A1*B2, B1*A2]
152 __m128i vMul
= _mm_mul_epi32(vAShuf
, vBShuf
);
154 // shuffle upper to lower
155 // vMul2 = [B1*A2, B1*A2]
156 __m128i vMul2
= _mm_shuffle_epi32(vMul
, _MM_SHUFFLE(3, 2, 3, 2));
157 //vMul = [A1*B2 - B1*A2]
158 vMul
= _mm_sub_epi64(vMul
, vMul2
);
161 _mm_store_sd((double*)&result
, _mm_castsi128_pd(vMul
));
163 double dResult
= (double)result
;
164 dResult
= dResult
* (1.0 / FIXED_POINT16_SCALE
);
166 return (float)dResult
;
170 void calcDeterminantIntVertical(const simdscalari vA
[3], const simdscalari vB
[3], simdscalari
*pvDet
)
172 // refer to calcDeterminantInt comment for calculation explanation
174 simdscalari vA1Lo
= _simd_unpacklo_epi32(vA
[1], vA
[1]); // 0 0 1 1 4 4 5 5
175 simdscalari vA1Hi
= _simd_unpackhi_epi32(vA
[1], vA
[1]); // 2 2 3 3 6 6 7 7
177 simdscalari vB2Lo
= _simd_unpacklo_epi32(vB
[2], vB
[2]);
178 simdscalari vB2Hi
= _simd_unpackhi_epi32(vB
[2], vB
[2]);
180 simdscalari vA1B2Lo
= _simd_mul_epi32(vA1Lo
, vB2Lo
); // 0 1 4 5
181 simdscalari vA1B2Hi
= _simd_mul_epi32(vA1Hi
, vB2Hi
); // 2 3 6 7
184 simdscalari vA2Lo
= _simd_unpacklo_epi32(vA
[2], vA
[2]);
185 simdscalari vA2Hi
= _simd_unpackhi_epi32(vA
[2], vA
[2]);
187 simdscalari vB1Lo
= _simd_unpacklo_epi32(vB
[1], vB
[1]);
188 simdscalari vB1Hi
= _simd_unpackhi_epi32(vB
[1], vB
[1]);
190 simdscalari vA2B1Lo
= _simd_mul_epi32(vA2Lo
, vB1Lo
);
191 simdscalari vA2B1Hi
= _simd_mul_epi32(vA2Hi
, vB1Hi
);
194 simdscalari detLo
= _simd_sub_epi64(vA1B2Lo
, vA2B1Lo
);
195 simdscalari detHi
= _simd_sub_epi64(vA1B2Hi
, vA2B1Hi
);
197 // shuffle 0 1 4 5 -> 0 1 2 3
198 simdscalari vResultLo
= _simd_permute2f128_si(detLo
, detHi
, 0x20);
199 simdscalari vResultHi
= _simd_permute2f128_si(detLo
, detHi
, 0x31);
201 pvDet
[0] = vResultLo
;
202 pvDet
[1] = vResultHi
;
#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// @brief 16-wide SOA determinant: A[1]*B[2] - A[2]*B[1] as 64-bit
///        products. Refer to the calcDeterminantInt comment for the math.
/// @param vA    - edge equation A coefficients, one register per edge.
/// @param vB    - edge equation B coefficients, one register per edge.
/// @param pvDet - output; receives two registers of 64-bit determinants.
void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet)
{
    // duplicate each 32-bit lane into an even slot for the widening multiply
    simd16scalari vA1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
    simd16scalari vA1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
    simd16scalari vB2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    simd16scalari vB2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]);

    // 64-bit products A1*B2
    simd16scalari vA1B2Lo = _simd16_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
    simd16scalari vA1B2Hi = _simd16_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7

    // same duplication for the A2*B1 term
    simd16scalari vA2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    simd16scalari vA2Hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
    simd16scalari vB1Lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    simd16scalari vB1Hi = _simd16_unpackhi_epi32(vB[1], vB[1]);

    simd16scalari vA2B1Lo = _simd16_mul_epi32(vA2Lo, vB1Lo);
    simd16scalari vA2B1Hi = _simd16_mul_epi32(vA2Hi, vB1Hi);

    // 64-bit determinants, still in unpacked lane order
    simd16scalari detLo = _simd16_sub_epi64(vA1B2Lo, vA2B1Lo);
    simd16scalari detHi = _simd16_sub_epi64(vA1B2Hi, vA2B1Hi);

    // shuffle 0 1 4 5 / 2 3 6 7 -> 0 1 2 3 / 4 5 6 7
    simd16scalari vResultLo = _simd16_permute2f128_si(detLo, detHi, 0x20);
    simd16scalari vResultHi = _simd16_permute2f128_si(detLo, detHi, 0x31);

    pvDet[0] = vResultLo;
    pvDet[1] = vResultHi;
}
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the C coefficient of each edge equation:
///        C[i] = -(A[i]*x[i] + B[i]*y[i]), one lane per edge.
/// @param vX - per-edge anchor x coordinates.
/// @param vY - per-edge anchor y coordinates.
/// @param vA - edge equation A coefficients.
/// @param vB - edge equation B coefficients.
/// @param vC - output C coefficients.
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
{
    // vC = -A*x - B*y
    vC = _mm_mul_ps(vA, vX);
    __m128 vCy = _mm_mul_ps(vB, vY);
    vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
    vC = _mm_sub_ps(vC, vCy);
}
254 void viewportTransform(__m128
&vX
, __m128
&vY
, __m128
&vZ
, const SWR_VIEWPORT_MATRIX
&vpMatrix
)
256 vX
= _mm_mul_ps(vX
, _mm_set1_ps(vpMatrix
.m00
));
257 vX
= _mm_add_ps(vX
, _mm_set1_ps(vpMatrix
.m30
));
259 vY
= _mm_mul_ps(vY
, _mm_set1_ps(vpMatrix
.m11
));
260 vY
= _mm_add_ps(vY
, _mm_set1_ps(vpMatrix
.m31
));
262 vZ
= _mm_mul_ps(vZ
, _mm_set1_ps(vpMatrix
.m22
));
263 vZ
= _mm_add_ps(vZ
, _mm_set1_ps(vpMatrix
.m32
));
266 template<uint32_t NumVerts
>
268 void viewportTransform(simdvector
*v
, const SWR_VIEWPORT_MATRICES
& vpMatrices
)
270 simdscalar m00
= _simd_load1_ps(&vpMatrices
.m00
[0]);
271 simdscalar m30
= _simd_load1_ps(&vpMatrices
.m30
[0]);
272 simdscalar m11
= _simd_load1_ps(&vpMatrices
.m11
[0]);
273 simdscalar m31
= _simd_load1_ps(&vpMatrices
.m31
[0]);
274 simdscalar m22
= _simd_load1_ps(&vpMatrices
.m22
[0]);
275 simdscalar m32
= _simd_load1_ps(&vpMatrices
.m32
[0]);
277 for (uint32_t i
= 0; i
< NumVerts
; ++i
)
279 v
[i
].x
= _simd_fmadd_ps(v
[i
].x
, m00
, m30
);
280 v
[i
].y
= _simd_fmadd_ps(v
[i
].y
, m11
, m31
);
281 v
[i
].z
= _simd_fmadd_ps(v
[i
].z
, m22
, m32
);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief 16-wide variant: applies the viewport-0 transform to NumVerts
///        SOA vertex positions in place.
template<uint32_t NumVerts>
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
{
    // broadcast the viewport-0 scale/translate terms across all lanes
    const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}
#endif
306 template<uint32_t NumVerts
>
308 void viewportTransform(simdvector
*v
, const SWR_VIEWPORT_MATRICES
& vpMatrices
, simdscalari vViewportIdx
)
310 // perform a gather of each matrix element based on the viewport array indexes
311 simdscalar m00
= _simd_i32gather_ps(&vpMatrices
.m00
[0], vViewportIdx
, 4);
312 simdscalar m30
= _simd_i32gather_ps(&vpMatrices
.m30
[0], vViewportIdx
, 4);
313 simdscalar m11
= _simd_i32gather_ps(&vpMatrices
.m11
[0], vViewportIdx
, 4);
314 simdscalar m31
= _simd_i32gather_ps(&vpMatrices
.m31
[0], vViewportIdx
, 4);
315 simdscalar m22
= _simd_i32gather_ps(&vpMatrices
.m22
[0], vViewportIdx
, 4);
316 simdscalar m32
= _simd_i32gather_ps(&vpMatrices
.m32
[0], vViewportIdx
, 4);
318 for (uint32_t i
= 0; i
< NumVerts
; ++i
)
320 v
[i
].x
= _simd_fmadd_ps(v
[i
].x
, m00
, m30
);
321 v
[i
].y
= _simd_fmadd_ps(v
[i
].y
, m11
, m31
);
322 v
[i
].z
= _simd_fmadd_ps(v
[i
].z
, m22
, m32
);
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief 16-wide variant of the gathered viewport transform: selects the
///        matrix terms per lane by viewport array index.
template<uint32_t NumVerts>
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari vViewportIdx)
{
    // perform a gather of each matrix element based on the viewport array indexes
    const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}
#endif
349 void calcBoundingBoxInt(const __m128i
&vX
, const __m128i
&vY
, SWR_RECT
&bbox
)
351 // Need horizontal fp min here
352 __m128i vX1
= _mm_shuffle_epi32(vX
, _MM_SHUFFLE(3, 2, 0, 1));
353 __m128i vX2
= _mm_shuffle_epi32(vX
, _MM_SHUFFLE(3, 0, 1, 2));
355 __m128i vY1
= _mm_shuffle_epi32(vY
, _MM_SHUFFLE(3, 2, 0, 1));
356 __m128i vY2
= _mm_shuffle_epi32(vY
, _MM_SHUFFLE(3, 0, 1, 2));
359 __m128i vMinX
= _mm_min_epi32(vX
, vX1
);
360 vMinX
= _mm_min_epi32(vMinX
, vX2
);
362 __m128i vMaxX
= _mm_max_epi32(vX
, vX1
);
363 vMaxX
= _mm_max_epi32(vMaxX
, vX2
);
365 __m128i vMinY
= _mm_min_epi32(vY
, vY1
);
366 vMinY
= _mm_min_epi32(vMinY
, vY2
);
368 __m128i vMaxY
= _mm_max_epi32(vY
, vY1
);
369 vMaxY
= _mm_max_epi32(vMaxY
, vY2
);
371 bbox
.xmin
= _mm_extract_epi32(vMinX
, 0);
372 bbox
.xmax
= _mm_extract_epi32(vMaxX
, 0);
373 bbox
.ymin
= _mm_extract_epi32(vMinY
, 0);
374 bbox
.ymax
= _mm_extract_epi32(vMaxY
, 0);
378 bool CanUseSimplePoints(DRAW_CONTEXT
*pDC
)
380 const API_STATE
& state
= GetApiState(pDC
);
382 return (state
.rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&&
383 state
.rastState
.pointSize
== 1.0f
&&
384 !state
.rastState
.pointParam
&&
385 !state
.rastState
.pointSpriteEnable
);
//////////////////////////////////////////////////////////////////////////
/// @brief Returns true if any lane of vec is NaN. A value is NaN iff it
///        compares unordered with itself, which cmpunord detects.
/// @param vec - 4 floats to test.
bool vHasNaN(const __m128& vec)
{
    const __m128 result = _mm_cmpunord_ps(vec, vec);
    const int32_t mask = _mm_movemask_ps(result);
    // Fix: the return statement was missing, leaving a non-void function
    // falling off the end (undefined behavior if the result is used).
    return (mask != 0);
}
// Number of primitives that numElements vertices/indices yield for the
// given topology — presumably inverse of the verts-per-prim mapping;
// confirm against the implementation.
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
// Number of vertices consumed per primitive for the topology; when
// includeAdjVerts is set, adjacency vertices are counted as well.
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
// ProcessDraw front-end function. All combinations of parameter values are available
// NOTE(review): this copy of the file appears to have lost lines around this
// declaration — the parameter list may be truncated; verify against the
// matching definition before relying on it.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasRasterization);
// Front-end work handlers for non-draw operations; all share the FE work
// signature (context, draw context, worker id, opaque per-work payload).
void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
415 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
);
416 #if USE_SIMD16_FRONTEND
417 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
);
struct PA_STATE_BASE;  // forward decl
// Bin point/line primitives into macrotiles: prims holds SOA positions for
// up to simd-width primitives, primMask selects the active lanes, and
// primID/viewportIdx carry per-lane primitive ids and viewport indexes.
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
#if USE_SIMD16_FRONTEND
// 16-wide variants of the point/line binners (see BinPoints/BinLines above).
void BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
void BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);