swr/rasterizer: modernize thread TLB
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / rasterizer.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
29 #include <vector>
30 #include <algorithm>
31
32 #include "rasterizer.h"
33 #include "backends/gen_rasterizer.hpp"
34 #include "rdtsc_core.h"
35 #include "backend.h"
36 #include "utils.h"
37 #include "frontend.h"
38 #include "tilemgr.h"
39 #include "memory/tilingtraits.h"
40 #include "rasterizer_impl.h"
41
42 PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
43 [STATE_VALID_TRI_EDGE_COUNT][2];
44
45 void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
46 {
47 const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData);
48 #if KNOB_ENABLE_TOSS_POINTS
49 if (KNOB_TOSS_BIN_TRIS)
50 {
51 return;
52 }
53 #endif
54
55 // bloat line to two tris and call the triangle rasterizer twice
56 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, pDC->drawId);
57
58 const API_STATE& state = GetApiState(pDC);
59 const SWR_RASTSTATE& rastState = state.rastState;
60
61 // macrotile dimensioning
62 uint32_t macroX, macroY;
63 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
64 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
65 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
66 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
67 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
68
69 const SWR_RECT& scissorInFixedPoint =
70 state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
71
72 // create a copy of the triangle buffer to write our adjusted vertices to
73 OSALIGNSIMD(float) newTriBuffer[4 * 4];
74 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
75 newWorkDesc.pTriBuffer = &newTriBuffer[0];
76
77 // create a copy of the attrib buffer to write our adjusted attribs to
78 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
79 newWorkDesc.pAttribs = &newAttribBuffer[0];
80
81 const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
82 const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
83
84 __m128 vX, vY, vZ, vRecipW;
85
86 vX = _mm_load_ps(workDesc.pTriBuffer);
87 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
88 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
89 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
90
91 // triangle 0
92 // v0,v1 -> v0,v0,v1
93 __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
94 __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
95 __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
96 __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
97
98 __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
99 __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
100 if (workDesc.triFlags.yMajor)
101 {
102 vXa = _mm_add_ps(vAdjust, vXa);
103 }
104 else
105 {
106 vYa = _mm_add_ps(vAdjust, vYa);
107 }
108
109 // Store triangle description for rasterizer
110 _mm_store_ps((float*)&newTriBuffer[0], vXa);
111 _mm_store_ps((float*)&newTriBuffer[4], vYa);
112 _mm_store_ps((float*)&newTriBuffer[8], vZa);
113 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
114
115 // binner bins 3 edges for lines as v0, v1, v1
116 // tri0 needs v0, v0, v1
117 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
118 {
119 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
120 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
121
122 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
123 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
124 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
125 }
126
127 // Store user clip distances for triangle 0
128 float newClipBuffer[3 * 8];
129 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
130 if (numClipDist)
131 {
132 newWorkDesc.pUserClipBuffer = newClipBuffer;
133
134 float* pOldBuffer = workDesc.pUserClipBuffer;
135 float* pNewBuffer = newClipBuffer;
136 for (uint32_t i = 0; i < numClipDist; ++i)
137 {
138 // read barycentric coeffs from binner
139 float a = *(pOldBuffer++);
140 float b = *(pOldBuffer++);
141
142 // reconstruct original clip distance at vertices
143 float c0 = a + b;
144 float c1 = b;
145
146 // construct triangle barycentrics
147 *(pNewBuffer++) = c0 - c1;
148 *(pNewBuffer++) = c0 - c1;
149 *(pNewBuffer++) = c1;
150 }
151 }
152
153 // setup triangle rasterizer function
154 PFN_WORK_FUNC pfnTriRast;
155 // conservative rast not supported for points/lines
156 pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
157 rastState.bIsCenterPattern,
158 false,
159 SWR_INPUT_COVERAGE_NONE,
160 EdgeValToEdgeState(ALL_EDGES_VALID),
161 (pDC->pState->state.scissorsTileAligned == false));
162
163 // make sure this macrotile intersects the triangle
164 __m128i vXai = fpToFixedPoint(vXa);
165 __m128i vYai = fpToFixedPoint(vYa);
166 OSALIGNSIMD(SWR_RECT) bboxA;
167 calcBoundingBoxInt(vXai, vYai, bboxA);
168
169 if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
170 bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
171 bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
172 bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
173 {
174 // rasterize triangle
175 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
176 }
177
178 // triangle 1
179 // v0,v1 -> v1,v1,v0
180 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
181 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
182 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
183 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
184
185 vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
186 if (workDesc.triFlags.yMajor)
187 {
188 vXa = _mm_add_ps(vAdjust, vXa);
189 }
190 else
191 {
192 vYa = _mm_add_ps(vAdjust, vYa);
193 }
194
195 // Store triangle description for rasterizer
196 _mm_store_ps((float*)&newTriBuffer[0], vXa);
197 _mm_store_ps((float*)&newTriBuffer[4], vYa);
198 _mm_store_ps((float*)&newTriBuffer[8], vZa);
199 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
200
201 // binner bins 3 edges for lines as v0, v1, v1
202 // tri1 needs v1, v1, v0
203 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
204 {
205 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
206 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
207
208 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
209 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
210 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
211 }
212
213 // store user clip distance for triangle 1
214 if (numClipDist)
215 {
216 float* pOldBuffer = workDesc.pUserClipBuffer;
217 float* pNewBuffer = newClipBuffer;
218 for (uint32_t i = 0; i < numClipDist; ++i)
219 {
220 // read barycentric coeffs from binner
221 float a = *(pOldBuffer++);
222 float b = *(pOldBuffer++);
223
224 // reconstruct original clip distance at vertices
225 float c0 = a + b;
226 float c1 = b;
227
228 // construct triangle barycentrics
229 *(pNewBuffer++) = c1 - c0;
230 *(pNewBuffer++) = c1 - c0;
231 *(pNewBuffer++) = c0;
232 }
233 }
234
235 vXai = fpToFixedPoint(vXa);
236 vYai = fpToFixedPoint(vYa);
237 calcBoundingBoxInt(vXai, vYai, bboxA);
238
239 if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
240 bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
241 bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
242 bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
243 {
244 // rasterize triangle
245 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
246 }
247
248 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, 1);
249 }
250
251 void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
252 {
253 #if KNOB_ENABLE_TOSS_POINTS
254 if (KNOB_TOSS_BIN_TRIS)
255 {
256 return;
257 }
258 #endif
259
260 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
261 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
262
263 // map x,y relative offsets from start of raster tile to bit position in
264 // coverage mask for the point
265 static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13},
266 {2, 3, 6, 7, 10, 11, 14, 15},
267 {16, 17, 20, 21, 24, 25, 28, 29},
268 {18, 19, 22, 23, 26, 27, 30, 31},
269 {32, 33, 36, 37, 40, 41, 44, 45},
270 {34, 35, 38, 39, 42, 43, 46, 47},
271 {48, 49, 52, 53, 56, 57, 60, 61},
272 {50, 51, 54, 55, 58, 59, 62, 63}};
273
274 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {};
275
276 // pull point information from triangle buffer
277 // @todo use structs for readability
278 uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
279 uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
280 float z = *(workDesc.pTriBuffer + 2);
281
282 // construct triangle descriptor for point
283 // no interpolation, set up i,j for constant interpolation of z and attribs
284 // @todo implement an optimized backend that doesn't require triangle information
285
286 // compute coverage mask from x,y packed into the coverageMask flag
287 // mask indices by the maximum valid index for x/y of coveragemap.
288 uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
289 uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
290 for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i)
291 {
292 triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX];
293 }
294 triDesc.anyCoveredSamples = triDesc.coverageMask[0];
295 triDesc.innerCoverageMask = triDesc.coverageMask[0];
296
297 // no persp divide needed for points
298 triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
299 triDesc.triFlags = workDesc.triFlags;
300 triDesc.recipDet = 1.0f;
301 triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
302 triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
303 triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
304 triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
305
306 RenderOutputBuffers renderBuffers;
307 GetRenderHotTiles(pDC,
308 workerId,
309 macroTile,
310 tileAlignedX >> KNOB_TILE_X_DIM_SHIFT,
311 tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
312 renderBuffers,
313 triDesc.triFlags.renderTargetArrayIndex);
314
315 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
316 backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
317 RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
318 }
319
320 void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
321 {
322 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
323 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
324 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
325
326 bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
327
328 // load point vertex
329 float x = *workDesc.pTriBuffer;
330 float y = *(workDesc.pTriBuffer + 1);
331 float z = *(workDesc.pTriBuffer + 2);
332
333 // create a copy of the triangle buffer to write our adjusted vertices to
334 OSALIGNSIMD(float) newTriBuffer[4 * 4];
335 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
336 newWorkDesc.pTriBuffer = &newTriBuffer[0];
337
338 // create a copy of the attrib buffer to write our adjusted attribs to
339 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
340 newWorkDesc.pAttribs = &newAttribBuffer[0];
341
342 newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
343 newWorkDesc.numAttribs = workDesc.numAttribs;
344 newWorkDesc.triFlags = workDesc.triFlags;
345
346 // construct two tris by bloating point by point size
347 float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
348 float lowerX = x - halfPointSize;
349 float upperX = x + halfPointSize;
350 float lowerY = y - halfPointSize;
351 float upperY = y + halfPointSize;
352
353 // tri 0
354 float* pBuf = &newTriBuffer[0];
355 *pBuf++ = lowerX;
356 *pBuf++ = lowerX;
357 *pBuf++ = upperX;
358 pBuf++;
359 *pBuf++ = lowerY;
360 *pBuf++ = upperY;
361 *pBuf++ = upperY;
362 pBuf++;
363 _mm_store_ps(pBuf, _mm_set1_ps(z));
364 _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
365
366 // setup triangle rasterizer function
367 PFN_WORK_FUNC pfnTriRast;
368 // conservative rast not supported for points/lines
369 pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
370 rastState.bIsCenterPattern,
371 false,
372 SWR_INPUT_COVERAGE_NONE,
373 EdgeValToEdgeState(ALL_EDGES_VALID),
374 (pDC->pState->state.scissorsTileAligned == false));
375
376 // overwrite texcoords for point sprites
377 if (isPointSpriteTexCoordEnabled)
378 {
379 // copy original attribs
380 memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
381 newWorkDesc.pAttribs = &newAttribBuffer[0];
382
383 // overwrite texcoord for point sprites
384 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
385 DWORD texCoordAttrib = 0;
386
387 while (_BitScanForward(&texCoordAttrib, texCoordMask))
388 {
389 texCoordMask &= ~(1 << texCoordAttrib);
390 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
391 if (rastState.pointSpriteTopOrigin)
392 {
393 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
394 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
395 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
396 }
397 else
398 {
399 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
400 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
401 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
402 }
403 }
404 }
405 else
406 {
407 // no texcoord overwrite, can reuse the attrib buffer from frontend
408 newWorkDesc.pAttribs = workDesc.pAttribs;
409 }
410
411 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
412
413 // tri 1
414 pBuf = &newTriBuffer[0];
415 *pBuf++ = lowerX;
416 *pBuf++ = upperX;
417 *pBuf++ = upperX;
418 pBuf++;
419 *pBuf++ = lowerY;
420 *pBuf++ = upperY;
421 *pBuf++ = lowerY;
422 // z, w unchanged
423
424 if (isPointSpriteTexCoordEnabled)
425 {
426 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
427 DWORD texCoordAttrib = 0;
428
429 while (_BitScanForward(&texCoordAttrib, texCoordMask))
430 {
431 texCoordMask &= ~(1 << texCoordAttrib);
432 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
433 if (rastState.pointSpriteTopOrigin)
434 {
435 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
436 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
437 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
438 }
439 else
440 {
441 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
442 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
443 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
444 }
445 }
446 }
447
448 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
449 }
450
451 void InitRasterizerFunctions()
452 {
453 InitRasterizerFuncs();
454 }
455
456 // Selector for correct templated RasterizeTriangle function
457 PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
458 bool IsCenter,
459 bool IsConservative,
460 SWR_INPUT_COVERAGE InputCoverage,
461 uint32_t EdgeEnable,
462 bool RasterizeScissorEdges)
463 {
464 SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
465 SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
466 SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
467
468 PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage]
469 [EdgeEnable][RasterizeScissorEdges];
470 SWR_ASSERT(func);
471
472 return func;
473 }