swr/rast: Separate RDTSC code from archrast
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.h
24 *
25 * @brief Definitions for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29 #pragma once
30 #include "context.h"
31 #include "common/simdintrin.h"
32 #include <type_traits>
33
34 // Calculates the A and B coefficients for the 3 edges of the triangle
35 //
36 // maths for edge equations:
37 // standard form of a line in 2d
38 // Ax + By + C = 0
39 // A = y0 - y1
40 // B = x1 - x0
41 // C = x0y1 - x1y0
42 INLINE
43 void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB)
44 {
45 // vYsub = y1 y2 y0 dc
46 __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
47 // vY = y0 y1 y2 dc
48 vA = _mm_sub_ps(vY, vYsub);
49
50 // Result:
51 // A[0] = y0 - y1
52 // A[1] = y1 - y2
53 // A[2] = y2 - y0
54
55 // vXsub = x1 x2 x0 dc
56 __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
57 // vX = x0 x1 x2 dc
58 vB = _mm_sub_ps(vXsub, vX);
59
60 // Result:
61 // B[0] = x1 - x0
62 // B[1] = x2 - x1
63 // B[2] = x0 - x2
64 }
65
66 INLINE
67 void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
68 {
69 // generate edge equations
70 // A = y0 - y1
71 // B = x1 - x0
72 // C = x0y1 - x1y0
73 __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
74 vA = _mm_sub_epi32(vY, vYsub);
75
76 __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
77 vB = _mm_sub_epi32(vXsub, vX);
78 }
79
80 INLINE
81 void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3])
82 {
83 // A = y0 - y1
84 // B = x1 - x0
85 vA[0] = _simd_sub_epi32(vY[0], vY[1]);
86 vA[1] = _simd_sub_epi32(vY[1], vY[2]);
87 vA[2] = _simd_sub_epi32(vY[2], vY[0]);
88
89 vB[0] = _simd_sub_epi32(vX[1], vX[0]);
90 vB[1] = _simd_sub_epi32(vX[2], vX[1]);
91 vB[2] = _simd_sub_epi32(vX[0], vX[2]);
92 }
93
#if ENABLE_AVX512_SIMD16
// simd16 overload of triangleSetupABIntVertical: vX[i] / vY[i] hold the
// x / y of vertex i.
//
// For every edge i, with next = (i + 1) % 3:
//      vA[i] = y[i]    - y[next]
//      vB[i] = x[next] - x[i]
INLINE
void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari(&vA)[3], simd16scalari(&vB)[3])
{
    // All A terms are produced before any B term is written.
    for (uint32_t edge = 0; edge < 3; ++edge)
    {
        const uint32_t next = (edge + 1) % 3;
        vA[edge] = _simd16_sub_epi32(vY[edge], vY[next]);
    }

    for (uint32_t edge = 0; edge < 3; ++edge)
    {
        const uint32_t next = (edge + 1) % 3;
        vB[edge] = _simd16_sub_epi32(vX[next], vX[edge]);
    }
}

#endif
110 // Calculate the determinant of the triangle
111 // 2 vectors between the 3 points: P, Q
112 // Px = x0-x2, Py = y0-y2
113 // Qx = x1-x2, Qy = y1-y2
114 // |Px Qx|
115 // det = | | = PxQy - PyQx
116 // |Py Qy|
117 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
118 // try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
119 // : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
120 // : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
121 // : B[2]*A[1] - A[2]*B[1]
122 INLINE
123 float calcDeterminantInt(const __m128i vA, const __m128i vB)
124 {
125 // vAShuf = [A1, A0, A2, A0]
126 __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
127 // vBShuf = [B2, B0, B1, B0]
128 __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
129 // vMul = [A1*B2, B1*A2]
130 __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
131
132 // shuffle upper to lower
133 // vMul2 = [B1*A2, B1*A2]
134 __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
135 //vMul = [A1*B2 - B1*A2]
136 vMul = _mm_sub_epi64(vMul, vMul2);
137
138 int64_t result;
139 _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
140
141 double dResult = (double)result;
142 dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
143
144 return (float)dResult;
145 }
146
147 INLINE
148 void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
149 {
150 // refer to calcDeterminantInt comment for calculation explanation
151
152 // A1*B2
153 simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
154 simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
155
156 simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
157 simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
158
159 simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
160 simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
161
162 // B1*A2
163 simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
164 simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);
165
166 simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
167 simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);
168
169 simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
170 simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);
171
172 // A1*B2 - A2*B1
173 simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
174 simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
175
176 // shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
177 simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);
178
179 // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
180 simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);
181
182 pvDet[0] = vResultLo;
183 pvDet[1] = vResultHi;
184 }
185
#if ENABLE_AVX512_SIMD16
// simd16 overload of calcDeterminantIntVertical:
//      det = A1*B2 - A2*B1
// See the calcDeterminantInt comment for how the formula is derived.
//
// The result is 64 bits wide per lane, so it is returned in two registers:
//      pvDet[0] = determinants for lanes 0 1 2 3 4 5 6 7
//      pvDet[1] = determinants for lanes 8 9 A B C D E F
INLINE
void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet)
{
    // Duplicate every 32-bit lane so each 64-bit slot has its operand in the low half.
    const simd16scalari a1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]);    // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
    const simd16scalari a1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]);    // X 2 X 3 X 6 X 7 X A X B X E X F
    const simd16scalari b2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    const simd16scalari b2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]);

    const simd16scalari a2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    const simd16scalari a2Hi = _simd16_unpackhi_epi32(vA[2], vA[2]);
    const simd16scalari b1Lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    const simd16scalari b1Hi = _simd16_unpackhi_epi32(vB[1], vB[1]);

    // A1*B2 - A2*B1
    const simd16scalari detLo = _simd16_sub_epi64(_simd16_mul_epi32(a1Lo, b2Lo),    // 0 1 4 5 8 9 C D (64b)
                                                  _simd16_mul_epi32(a2Lo, b1Lo));
    const simd16scalari detHi = _simd16_sub_epi64(_simd16_mul_epi32(a1Hi, b2Hi),    // 2 3 6 7 A B E F
                                                  _simd16_mul_epi32(a2Hi, b1Hi));

    // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
    const simd16scalari lowerMixed = _simd16_permute2f128_si(detLo, detHi, 0x44);   // 0 1 4 5 2 3 6 7 (64b)
    const simd16scalari upperMixed = _simd16_permute2f128_si(detLo, detHi, 0xEE);   // 8 9 C D A B E F

    // (3, 1, 2, 0) = 11 01 10 00 = 0xD8 puts the 128-bit chunks back in lane order
    pvDet[0] = _simd16_permute2f128_si(lowerMixed, lowerMixed, 0xD8);               // 0 1 2 3 4 5 6 7 (64b)
    pvDet[1] = _simd16_permute2f128_si(upperMixed, upperMixed, 0xD8);               // 8 9 A B C D E F
}

#endif
226 INLINE
227 void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
228 {
229 // C = -Ax - By
230 vC = _mm_mul_ps(vA, vX);
231 __m128 vCy = _mm_mul_ps(vB, vY);
232 vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
233 vC = _mm_sub_ps(vC, vCy);
234 }
235
236 template<uint32_t NumVerts>
237 INLINE
238 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
239 {
240 simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
241 simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
242 simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
243 simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
244 simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
245 simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);
246
247 for (uint32_t i = 0; i < NumVerts; ++i)
248 {
249 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
250 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
251 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
252 }
253 }
254
#if USE_SIMD16_FRONTEND
// simd16 overload: applies one viewport transform to the x, y and z components
// of NumVerts vertices, in place. Element 0 of each matrix term is broadcast to
// every lane and each component is updated with
//      x = fmadd(x, m00, m30)
//      y = fmadd(y, m11, m31)
//      z = fmadd(z, m22, m32)
// (presumably component * mNN + m3N - confirm against _simd16_fmadd_ps).
template<uint32_t NumVerts>
INLINE
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
{
    const simd16scalar xMul = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    const simd16scalar xAdd = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    const simd16scalar yMul = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    const simd16scalar yAdd = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    const simd16scalar zMul = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    const simd16scalar zAdd = _simd16_broadcast_ss(&vpMatrices.m32[0]);

    for (simd16vector *pVert = v; pVert != v + NumVerts; ++pVert)
    {
        pVert->x = _simd16_fmadd_ps(pVert->x, xMul, xAdd);
        pVert->y = _simd16_fmadd_ps(pVert->y, yMul, yAdd);
        pVert->z = _simd16_fmadd_ps(pVert->z, zMul, zAdd);
    }
}

#endif
276 template<uint32_t NumVerts>
277 INLINE
278 void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari const &vViewportIdx)
279 {
280 // perform a gather of each matrix element based on the viewport array indexes
281 simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
282 simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
283 simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
284 simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
285 simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
286 simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);
287
288 for (uint32_t i = 0; i < NumVerts; ++i)
289 {
290 v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
291 v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
292 v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
293 }
294 }
295
#if USE_SIMD16_FRONTEND
// simd16 overload of the per-lane viewport transform. Each matrix term is
// gathered from its array using the indexes in vViewportIdx (gather scale of
// 4 is passed to the gather), then every vertex component is updated in place with
//      x = fmadd(x, m00, m30)
//      y = fmadd(y, m11, m31)
//      z = fmadd(z, m22, m32)
template<uint32_t NumVerts>
INLINE
void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari const &vViewportIdx)
{
    const simd16scalar xMul = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simd16scalar xAdd = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simd16scalar yMul = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simd16scalar yAdd = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simd16scalar zMul = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simd16scalar zAdd = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (simd16vector *pVert = v; pVert != v + NumVerts; ++pVert)
    {
        pVert->x = _simd16_fmadd_ps(pVert->x, xMul, xAdd);
        pVert->y = _simd16_fmadd_ps(pVert->y, yMul, yAdd);
        pVert->z = _simd16_fmadd_ps(pVert->z, zMul, zAdd);
    }
}

#endif
318 INLINE
319 void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox)
320 {
321 // Need horizontal fp min here
322 __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
323 __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));
324
325 __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
326 __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
327
328
329 __m128i vMinX = _mm_min_epi32(vX, vX1);
330 vMinX = _mm_min_epi32(vMinX, vX2);
331
332 __m128i vMaxX = _mm_max_epi32(vX, vX1);
333 vMaxX = _mm_max_epi32(vMaxX, vX2);
334
335 __m128i vMinY = _mm_min_epi32(vY, vY1);
336 vMinY = _mm_min_epi32(vMinY, vY2);
337
338 __m128i vMaxY = _mm_max_epi32(vY, vY1);
339 vMaxY = _mm_max_epi32(vMaxY, vY2);
340
341 bbox.xmin = _mm_extract_epi32(vMinX, 0);
342 bbox.xmax = _mm_extract_epi32(vMaxX, 0);
343 bbox.ymin = _mm_extract_epi32(vMinY, 0);
344 bbox.ymax = _mm_extract_epi32(vMaxY, 0);
345 }
346
347 INLINE
348 bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
349 {
350 const API_STATE& state = GetApiState(pDC);
351
352 return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
353 state.rastState.pointSize == 1.0f &&
354 !state.rastState.pointParam &&
355 !state.rastState.pointSpriteEnable &&
356 !state.backendState.clipDistanceMask);
357 }
358
359 INLINE
360 bool vHasNaN(const __m128& vec)
361 {
362 const __m128 result = _mm_cmpunord_ps(vec, vec);
363 const int32_t mask = _mm_movemask_ps(result);
364 return (mask != 0);
365 }
366
367 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
368 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
369
370
371 // ProcessDraw front-end function. All combinations of parameter values are available
372 PFN_FE_WORK_FUNC GetProcessDrawFunc(
373 bool IsIndexed,
374 bool IsCutIndexEnabled,
375 bool HasTessellation,
376 bool HasGeometryShader,
377 bool HasStreamOut,
378 bool HasRasterization);
379
380 void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
381 void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
382 void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
383 void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
384 void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
385
386 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
387 #if USE_SIMD16_FRONTEND
388 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
389 #endif
390
391 struct PA_STATE_BASE; // forward decl
392 void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
393 void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
394 #if USE_SIMD16_FRONTEND
395 void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
396 void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
397 #endif
398