swr/rast: cache line align hottile buffers
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / rasterizer.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
29 #include <vector>
30 #include <algorithm>
31
32 #include "rasterizer.h"
33 #include "backends/gen_rasterizer.hpp"
34 #include "rdtsc_core.h"
35 #include "backend.h"
36 #include "utils.h"
37 #include "frontend.h"
38 #include "tilemgr.h"
39 #include "memory/tilingtraits.h"
40 #include "rasterizer_impl.h"
41
42 PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
43
44 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
45 {
46 SWR_CONTEXT *pContext = pDC->pContext;
47 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
48 #if KNOB_ENABLE_TOSS_POINTS
49 if (KNOB_TOSS_BIN_TRIS)
50 {
51 return;
52 }
53 #endif
54
55 // bloat line to two tris and call the triangle rasterizer twice
56 AR_BEGIN(BERasterizeLine, pDC->drawId);
57
58 const API_STATE &state = GetApiState(pDC);
59 const SWR_RASTSTATE &rastState = state.rastState;
60
61 // macrotile dimensioning
62 uint32_t macroX, macroY;
63 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
64 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
65 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
66 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
67 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
68
69 const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
70
71 // create a copy of the triangle buffer to write our adjusted vertices to
72 OSALIGNSIMD(float) newTriBuffer[4 * 4];
73 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
74 newWorkDesc.pTriBuffer = &newTriBuffer[0];
75
76 // create a copy of the attrib buffer to write our adjusted attribs to
77 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
78 newWorkDesc.pAttribs = &newAttribBuffer[0];
79
80 const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
81 const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
82
83 __m128 vX, vY, vZ, vRecipW;
84
85 vX = _mm_load_ps(workDesc.pTriBuffer);
86 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
87 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
88 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
89
90 // triangle 0
91 // v0,v1 -> v0,v0,v1
92 __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
93 __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
94 __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
95 __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
96
97 __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
98 __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
99 if (workDesc.triFlags.yMajor)
100 {
101 vXa = _mm_add_ps(vAdjust, vXa);
102 }
103 else
104 {
105 vYa = _mm_add_ps(vAdjust, vYa);
106 }
107
108 // Store triangle description for rasterizer
109 _mm_store_ps((float*)&newTriBuffer[0], vXa);
110 _mm_store_ps((float*)&newTriBuffer[4], vYa);
111 _mm_store_ps((float*)&newTriBuffer[8], vZa);
112 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
113
114 // binner bins 3 edges for lines as v0, v1, v1
115 // tri0 needs v0, v0, v1
116 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
117 {
118 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
119 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
120
121 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
122 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
123 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
124 }
125
126 // Store user clip distances for triangle 0
127 float newClipBuffer[3 * 8];
128 uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask);
129 if (numClipDist)
130 {
131 newWorkDesc.pUserClipBuffer = newClipBuffer;
132
133 float* pOldBuffer = workDesc.pUserClipBuffer;
134 float* pNewBuffer = newClipBuffer;
135 for (uint32_t i = 0; i < numClipDist; ++i)
136 {
137 // read barycentric coeffs from binner
138 float a = *(pOldBuffer++);
139 float b = *(pOldBuffer++);
140
141 // reconstruct original clip distance at vertices
142 float c0 = a + b;
143 float c1 = b;
144
145 // construct triangle barycentrics
146 *(pNewBuffer++) = c0 - c1;
147 *(pNewBuffer++) = c0 - c1;
148 *(pNewBuffer++) = c1;
149 }
150 }
151
152 // setup triangle rasterizer function
153 PFN_WORK_FUNC pfnTriRast;
154 // conservative rast not supported for points/lines
155 pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
156 SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
157
158 // make sure this macrotile intersects the triangle
159 __m128i vXai = fpToFixedPoint(vXa);
160 __m128i vYai = fpToFixedPoint(vYa);
161 OSALIGNSIMD(SWR_RECT) bboxA;
162 calcBoundingBoxInt(vXai, vYai, bboxA);
163
164 if (!(bboxA.xmin > macroBoxRight ||
165 bboxA.xmin > scissorInFixedPoint.xmax ||
166 bboxA.xmax - 1 < macroBoxLeft ||
167 bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
168 bboxA.ymin > macroBoxBottom ||
169 bboxA.ymin > scissorInFixedPoint.ymax ||
170 bboxA.ymax - 1 < macroBoxTop ||
171 bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
172 // rasterize triangle
173 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
174 }
175
176 // triangle 1
177 // v0,v1 -> v1,v1,v0
178 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
179 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
180 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
181 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
182
183 vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
184 if (workDesc.triFlags.yMajor)
185 {
186 vXa = _mm_add_ps(vAdjust, vXa);
187 }
188 else
189 {
190 vYa = _mm_add_ps(vAdjust, vYa);
191 }
192
193 // Store triangle description for rasterizer
194 _mm_store_ps((float*)&newTriBuffer[0], vXa);
195 _mm_store_ps((float*)&newTriBuffer[4], vYa);
196 _mm_store_ps((float*)&newTriBuffer[8], vZa);
197 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
198
199 // binner bins 3 edges for lines as v0, v1, v1
200 // tri1 needs v1, v1, v0
201 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
202 {
203 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
204 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
205
206 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
207 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
208 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
209 }
210
211 // store user clip distance for triangle 1
212 if (numClipDist)
213 {
214 float* pOldBuffer = workDesc.pUserClipBuffer;
215 float* pNewBuffer = newClipBuffer;
216 for (uint32_t i = 0; i < numClipDist; ++i)
217 {
218 // read barycentric coeffs from binner
219 float a = *(pOldBuffer++);
220 float b = *(pOldBuffer++);
221
222 // reconstruct original clip distance at vertices
223 float c0 = a + b;
224 float c1 = b;
225
226 // construct triangle barycentrics
227 *(pNewBuffer++) = c1 - c0;
228 *(pNewBuffer++) = c1 - c0;
229 *(pNewBuffer++) = c0;
230 }
231 }
232
233 vXai = fpToFixedPoint(vXa);
234 vYai = fpToFixedPoint(vYa);
235 calcBoundingBoxInt(vXai, vYai, bboxA);
236
237 if (!(bboxA.xmin > macroBoxRight ||
238 bboxA.xmin > scissorInFixedPoint.xmax ||
239 bboxA.xmax - 1 < macroBoxLeft ||
240 bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
241 bboxA.ymin > macroBoxBottom ||
242 bboxA.ymin > scissorInFixedPoint.ymax ||
243 bboxA.ymax - 1 < macroBoxTop ||
244 bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
245 // rasterize triangle
246 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
247 }
248
249 AR_END(BERasterizeLine, 1);
250 }
251
252 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
253 {
254 SWR_CONTEXT *pContext = pDC->pContext;
255
256 #if KNOB_ENABLE_TOSS_POINTS
257 if (KNOB_TOSS_BIN_TRIS)
258 {
259 return;
260 }
261 #endif
262
263 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
264 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
265
266 // map x,y relative offsets from start of raster tile to bit position in
267 // coverage mask for the point
268 static const uint32_t coverageMap[8][8] = {
269 { 0, 1, 4, 5, 8, 9, 12, 13 },
270 { 2, 3, 6, 7, 10, 11, 14, 15 },
271 { 16, 17, 20, 21, 24, 25, 28, 29 },
272 { 18, 19, 22, 23, 26, 27, 30, 31 },
273 { 32, 33, 36, 37, 40, 41, 44, 45 },
274 { 34, 35, 38, 39, 42, 43, 46, 47 },
275 { 48, 49, 52, 53, 56, 57, 60, 61 },
276 { 50, 51, 54, 55, 58, 59, 62, 63 }
277 };
278
279 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
280
281 // pull point information from triangle buffer
282 // @todo use structs for readability
283 uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
284 uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
285 float z = *(workDesc.pTriBuffer + 2);
286
287 // construct triangle descriptor for point
288 // no interpolation, set up i,j for constant interpolation of z and attribs
289 // @todo implement an optimized backend that doesn't require triangle information
290
291 // compute coverage mask from x,y packed into the coverageMask flag
292 // mask indices by the maximum valid index for x/y of coveragemap.
293 uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
294 uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
295 // todo: multisample points?
296 triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
297
298 // no persp divide needed for points
299 triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
300 triDesc.triFlags = workDesc.triFlags;
301 triDesc.recipDet = 1.0f;
302 triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
303 triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
304 triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
305 triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
306
307 RenderOutputBuffers renderBuffers;
308 GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
309 renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
310
311 AR_BEGIN(BEPixelBackend, pDC->drawId);
312 backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
313 AR_END(BEPixelBackend, 0);
314 }
315
316 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
317 {
318 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
319 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
320 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
321
322 bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
323
324 // load point vertex
325 float x = *workDesc.pTriBuffer;
326 float y = *(workDesc.pTriBuffer + 1);
327 float z = *(workDesc.pTriBuffer + 2);
328
329 // create a copy of the triangle buffer to write our adjusted vertices to
330 OSALIGNSIMD(float) newTriBuffer[4 * 4];
331 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
332 newWorkDesc.pTriBuffer = &newTriBuffer[0];
333
334 // create a copy of the attrib buffer to write our adjusted attribs to
335 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
336 newWorkDesc.pAttribs = &newAttribBuffer[0];
337
338 newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
339 newWorkDesc.numAttribs = workDesc.numAttribs;
340 newWorkDesc.triFlags = workDesc.triFlags;
341
342 // construct two tris by bloating point by point size
343 float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
344 float lowerX = x - halfPointSize;
345 float upperX = x + halfPointSize;
346 float lowerY = y - halfPointSize;
347 float upperY = y + halfPointSize;
348
349 // tri 0
350 float *pBuf = &newTriBuffer[0];
351 *pBuf++ = lowerX;
352 *pBuf++ = lowerX;
353 *pBuf++ = upperX;
354 pBuf++;
355 *pBuf++ = lowerY;
356 *pBuf++ = upperY;
357 *pBuf++ = upperY;
358 pBuf++;
359 _mm_store_ps(pBuf, _mm_set1_ps(z));
360 _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
361
362 // setup triangle rasterizer function
363 PFN_WORK_FUNC pfnTriRast;
364 // conservative rast not supported for points/lines
365 pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
366 SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
367
368 // overwrite texcoords for point sprites
369 if (isPointSpriteTexCoordEnabled)
370 {
371 // copy original attribs
372 memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
373 newWorkDesc.pAttribs = &newAttribBuffer[0];
374
375 // overwrite texcoord for point sprites
376 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
377 DWORD texCoordAttrib = 0;
378
379 while (_BitScanForward(&texCoordAttrib, texCoordMask))
380 {
381 texCoordMask &= ~(1 << texCoordAttrib);
382 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
383 if (rastState.pointSpriteTopOrigin)
384 {
385 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
386 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
387 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
388 }
389 else
390 {
391 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
392 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
393 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
394 }
395 }
396 }
397 else
398 {
399 // no texcoord overwrite, can reuse the attrib buffer from frontend
400 newWorkDesc.pAttribs = workDesc.pAttribs;
401 }
402
403 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
404
405 // tri 1
406 pBuf = &newTriBuffer[0];
407 *pBuf++ = lowerX;
408 *pBuf++ = upperX;
409 *pBuf++ = upperX;
410 pBuf++;
411 *pBuf++ = lowerY;
412 *pBuf++ = upperY;
413 *pBuf++ = lowerY;
414 // z, w unchanged
415
416 if (isPointSpriteTexCoordEnabled)
417 {
418 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
419 DWORD texCoordAttrib = 0;
420
421 while (_BitScanForward(&texCoordAttrib, texCoordMask))
422 {
423 texCoordMask &= ~(1 << texCoordAttrib);
424 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
425 if (rastState.pointSpriteTopOrigin)
426 {
427 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
428 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
429 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
430
431 }
432 else
433 {
434 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
435 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
436 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
437 }
438 }
439 }
440
441 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
442 }
443
444 void InitRasterizerFunctions()
445 {
446 InitRasterizerFuncs();
447 }
448
449 // Selector for correct templated RasterizeTriangle function
450 PFN_WORK_FUNC GetRasterizerFunc(
451 SWR_MULTISAMPLE_COUNT numSamples,
452 bool IsCenter,
453 bool IsConservative,
454 SWR_INPUT_COVERAGE InputCoverage,
455 uint32_t EdgeEnable,
456 bool RasterizeScissorEdges
457 )
458 {
459 SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
460 SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
461 SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
462
463 PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges];
464 SWR_ASSERT(func);
465
466 return func;
467 }