swr/rast: Implement VROUND intrinsic in x86 lowering pass
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / rasterizer.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file rasterizer.cpp
24 *
25 * @brief Implementation for the rasterizer.
26 *
27 ******************************************************************************/
28
29 #include <vector>
30 #include <algorithm>
31
32 #include "rasterizer.h"
33 #include "backends/gen_rasterizer.hpp"
34 #include "rdtsc_core.h"
35 #include "backend.h"
36 #include "utils.h"
37 #include "frontend.h"
38 #include "tilemgr.h"
39 #include "memory/tilingtraits.h"
40 #include "rasterizer_impl.h"
41
// Lookup table of rasterizer work functions. GetRasterizerFunc() in this file
// indexes it as
// [numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges].
// NOTE(review): the table is not filled in anywhere in this file's visible code;
// presumably InitRasterizerFuncs() populates it -- confirm.
42 PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
43
44 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
45 {
46 const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
47 #if KNOB_ENABLE_TOSS_POINTS
48 if (KNOB_TOSS_BIN_TRIS)
49 {
50 return;
51 }
52 #endif
53
54 // bloat line to two tris and call the triangle rasterizer twice
55 RDTSC_BEGIN(BERasterizeLine, pDC->drawId);
56
57 const API_STATE &state = GetApiState(pDC);
58 const SWR_RASTSTATE &rastState = state.rastState;
59
60 // macrotile dimensioning
61 uint32_t macroX, macroY;
62 MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
63 int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
64 int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
65 int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
66 int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
67
68 const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
69
70 // create a copy of the triangle buffer to write our adjusted vertices to
71 OSALIGNSIMD(float) newTriBuffer[4 * 4];
72 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
73 newWorkDesc.pTriBuffer = &newTriBuffer[0];
74
75 // create a copy of the attrib buffer to write our adjusted attribs to
76 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
77 newWorkDesc.pAttribs = &newAttribBuffer[0];
78
79 const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
80 const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
81
82 __m128 vX, vY, vZ, vRecipW;
83
84 vX = _mm_load_ps(workDesc.pTriBuffer);
85 vY = _mm_load_ps(workDesc.pTriBuffer + 4);
86 vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
87 vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
88
89 // triangle 0
90 // v0,v1 -> v0,v0,v1
91 __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
92 __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
93 __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
94 __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
95
96 __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
97 __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
98 if (workDesc.triFlags.yMajor)
99 {
100 vXa = _mm_add_ps(vAdjust, vXa);
101 }
102 else
103 {
104 vYa = _mm_add_ps(vAdjust, vYa);
105 }
106
107 // Store triangle description for rasterizer
108 _mm_store_ps((float*)&newTriBuffer[0], vXa);
109 _mm_store_ps((float*)&newTriBuffer[4], vYa);
110 _mm_store_ps((float*)&newTriBuffer[8], vZa);
111 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
112
113 // binner bins 3 edges for lines as v0, v1, v1
114 // tri0 needs v0, v0, v1
115 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
116 {
117 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
118 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
119
120 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
121 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
122 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
123 }
124
125 // Store user clip distances for triangle 0
126 float newClipBuffer[3 * 8];
127 uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
128 if (numClipDist)
129 {
130 newWorkDesc.pUserClipBuffer = newClipBuffer;
131
132 float* pOldBuffer = workDesc.pUserClipBuffer;
133 float* pNewBuffer = newClipBuffer;
134 for (uint32_t i = 0; i < numClipDist; ++i)
135 {
136 // read barycentric coeffs from binner
137 float a = *(pOldBuffer++);
138 float b = *(pOldBuffer++);
139
140 // reconstruct original clip distance at vertices
141 float c0 = a + b;
142 float c1 = b;
143
144 // construct triangle barycentrics
145 *(pNewBuffer++) = c0 - c1;
146 *(pNewBuffer++) = c0 - c1;
147 *(pNewBuffer++) = c1;
148 }
149 }
150
151 // setup triangle rasterizer function
152 PFN_WORK_FUNC pfnTriRast;
153 // conservative rast not supported for points/lines
154 pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
155 SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
156
157 // make sure this macrotile intersects the triangle
158 __m128i vXai = fpToFixedPoint(vXa);
159 __m128i vYai = fpToFixedPoint(vYa);
160 OSALIGNSIMD(SWR_RECT) bboxA;
161 calcBoundingBoxInt(vXai, vYai, bboxA);
162
163 if (!(bboxA.xmin > macroBoxRight ||
164 bboxA.xmin > scissorInFixedPoint.xmax ||
165 bboxA.xmax - 1 < macroBoxLeft ||
166 bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
167 bboxA.ymin > macroBoxBottom ||
168 bboxA.ymin > scissorInFixedPoint.ymax ||
169 bboxA.ymax - 1 < macroBoxTop ||
170 bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
171 // rasterize triangle
172 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
173 }
174
175 // triangle 1
176 // v0,v1 -> v1,v1,v0
177 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
178 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
179 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
180 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
181
182 vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
183 if (workDesc.triFlags.yMajor)
184 {
185 vXa = _mm_add_ps(vAdjust, vXa);
186 }
187 else
188 {
189 vYa = _mm_add_ps(vAdjust, vYa);
190 }
191
192 // Store triangle description for rasterizer
193 _mm_store_ps((float*)&newTriBuffer[0], vXa);
194 _mm_store_ps((float*)&newTriBuffer[4], vYa);
195 _mm_store_ps((float*)&newTriBuffer[8], vZa);
196 _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
197
198 // binner bins 3 edges for lines as v0, v1, v1
199 // tri1 needs v1, v1, v0
200 for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
201 {
202 __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
203 __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
204
205 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
206 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
207 _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
208 }
209
210 // store user clip distance for triangle 1
211 if (numClipDist)
212 {
213 float* pOldBuffer = workDesc.pUserClipBuffer;
214 float* pNewBuffer = newClipBuffer;
215 for (uint32_t i = 0; i < numClipDist; ++i)
216 {
217 // read barycentric coeffs from binner
218 float a = *(pOldBuffer++);
219 float b = *(pOldBuffer++);
220
221 // reconstruct original clip distance at vertices
222 float c0 = a + b;
223 float c1 = b;
224
225 // construct triangle barycentrics
226 *(pNewBuffer++) = c1 - c0;
227 *(pNewBuffer++) = c1 - c0;
228 *(pNewBuffer++) = c0;
229 }
230 }
231
232 vXai = fpToFixedPoint(vXa);
233 vYai = fpToFixedPoint(vYa);
234 calcBoundingBoxInt(vXai, vYai, bboxA);
235
236 if (!(bboxA.xmin > macroBoxRight ||
237 bboxA.xmin > scissorInFixedPoint.xmax ||
238 bboxA.xmax - 1 < macroBoxLeft ||
239 bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
240 bboxA.ymin > macroBoxBottom ||
241 bboxA.ymin > scissorInFixedPoint.ymax ||
242 bboxA.ymax - 1 < macroBoxTop ||
243 bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
244 // rasterize triangle
245 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
246 }
247
248 RDTSC_BEGIN(BERasterizeLine, 1);
249 }
250
251 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
252 {
253 #if KNOB_ENABLE_TOSS_POINTS
254 if (KNOB_TOSS_BIN_TRIS)
255 {
256 return;
257 }
258 #endif
259
260 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
261 const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
262
263 // map x,y relative offsets from start of raster tile to bit position in
264 // coverage mask for the point
265 static const uint32_t coverageMap[8][8] = {
266 { 0, 1, 4, 5, 8, 9, 12, 13 },
267 { 2, 3, 6, 7, 10, 11, 14, 15 },
268 { 16, 17, 20, 21, 24, 25, 28, 29 },
269 { 18, 19, 22, 23, 26, 27, 30, 31 },
270 { 32, 33, 36, 37, 40, 41, 44, 45 },
271 { 34, 35, 38, 39, 42, 43, 46, 47 },
272 { 48, 49, 52, 53, 56, 57, 60, 61 },
273 { 50, 51, 54, 55, 58, 59, 62, 63 }
274 };
275
276 OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
277
278 // pull point information from triangle buffer
279 // @todo use structs for readability
280 uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
281 uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
282 float z = *(workDesc.pTriBuffer + 2);
283
284 // construct triangle descriptor for point
285 // no interpolation, set up i,j for constant interpolation of z and attribs
286 // @todo implement an optimized backend that doesn't require triangle information
287
288 // compute coverage mask from x,y packed into the coverageMask flag
289 // mask indices by the maximum valid index for x/y of coveragemap.
290 uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
291 uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
292 // todo: multisample points?
293 triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
294
295 // no persp divide needed for points
296 triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
297 triDesc.triFlags = workDesc.triFlags;
298 triDesc.recipDet = 1.0f;
299 triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
300 triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
301 triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
302 triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
303
304 RenderOutputBuffers renderBuffers;
305 GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
306 renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
307
308 RDTSC_BEGIN(BEPixelBackend, pDC->drawId);
309 backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
310 RDTSC_END(BEPixelBackend, 0);
311 }
312
313 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
314 {
315 const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
316 const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
317 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
318
319 bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
320
321 // load point vertex
322 float x = *workDesc.pTriBuffer;
323 float y = *(workDesc.pTriBuffer + 1);
324 float z = *(workDesc.pTriBuffer + 2);
325
326 // create a copy of the triangle buffer to write our adjusted vertices to
327 OSALIGNSIMD(float) newTriBuffer[4 * 4];
328 TRIANGLE_WORK_DESC newWorkDesc = workDesc;
329 newWorkDesc.pTriBuffer = &newTriBuffer[0];
330
331 // create a copy of the attrib buffer to write our adjusted attribs to
332 OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
333 newWorkDesc.pAttribs = &newAttribBuffer[0];
334
335 newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
336 newWorkDesc.numAttribs = workDesc.numAttribs;
337 newWorkDesc.triFlags = workDesc.triFlags;
338
339 // construct two tris by bloating point by point size
340 float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
341 float lowerX = x - halfPointSize;
342 float upperX = x + halfPointSize;
343 float lowerY = y - halfPointSize;
344 float upperY = y + halfPointSize;
345
346 // tri 0
347 float *pBuf = &newTriBuffer[0];
348 *pBuf++ = lowerX;
349 *pBuf++ = lowerX;
350 *pBuf++ = upperX;
351 pBuf++;
352 *pBuf++ = lowerY;
353 *pBuf++ = upperY;
354 *pBuf++ = upperY;
355 pBuf++;
356 _mm_store_ps(pBuf, _mm_set1_ps(z));
357 _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
358
359 // setup triangle rasterizer function
360 PFN_WORK_FUNC pfnTriRast;
361 // conservative rast not supported for points/lines
362 pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
363 SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
364
365 // overwrite texcoords for point sprites
366 if (isPointSpriteTexCoordEnabled)
367 {
368 // copy original attribs
369 memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
370 newWorkDesc.pAttribs = &newAttribBuffer[0];
371
372 // overwrite texcoord for point sprites
373 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
374 DWORD texCoordAttrib = 0;
375
376 while (_BitScanForward(&texCoordAttrib, texCoordMask))
377 {
378 texCoordMask &= ~(1 << texCoordAttrib);
379 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
380 if (rastState.pointSpriteTopOrigin)
381 {
382 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
383 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
384 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
385 }
386 else
387 {
388 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
389 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
390 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
391 }
392 }
393 }
394 else
395 {
396 // no texcoord overwrite, can reuse the attrib buffer from frontend
397 newWorkDesc.pAttribs = workDesc.pAttribs;
398 }
399
400 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
401
402 // tri 1
403 pBuf = &newTriBuffer[0];
404 *pBuf++ = lowerX;
405 *pBuf++ = upperX;
406 *pBuf++ = upperX;
407 pBuf++;
408 *pBuf++ = lowerY;
409 *pBuf++ = upperY;
410 *pBuf++ = lowerY;
411 // z, w unchanged
412
413 if (isPointSpriteTexCoordEnabled)
414 {
415 uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
416 DWORD texCoordAttrib = 0;
417
418 while (_BitScanForward(&texCoordAttrib, texCoordMask))
419 {
420 texCoordMask &= ~(1 << texCoordAttrib);
421 __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
422 if (rastState.pointSpriteTopOrigin)
423 {
424 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
425 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
426 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
427
428 }
429 else
430 {
431 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
432 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
433 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
434 }
435 }
436 }
437
438 pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
439 }
440
// Entry point for rasterizer function setup; forwards directly to
// InitRasterizerFuncs(), which is declared outside this file's visible code.
// NOTE(review): presumably that call fills the gRasterizerFuncs table used by
// GetRasterizerFunc() -- confirm against its definition.
441 void InitRasterizerFunctions()
442 {
443 InitRasterizerFuncs();
444 }
445
446 // Selector for correct templated RasterizeTriangle function
447 PFN_WORK_FUNC GetRasterizerFunc(
448 SWR_MULTISAMPLE_COUNT numSamples,
449 bool IsCenter,
450 bool IsConservative,
451 SWR_INPUT_COVERAGE InputCoverage,
452 uint32_t EdgeEnable,
453 bool RasterizeScissorEdges
454 )
455 {
456 SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
457 SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
458 SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
459
460 PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges];
461 SWR_ASSERT(func);
462
463 return func;
464 }