swr/rast: Optimize late/bindless JIT of samplers
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / tilemgr.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file tilemgr.cpp
24 *
25 * @brief Implementation for Macro Tile Manager which provides the facilities
26 * for threads to work on an macro tile.
27 *
28 ******************************************************************************/
29 #include <unordered_map>
30
31 #include "fifo.hpp"
32 #include "core/tilemgr.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
35
36 #define TILE_ID(x,y) ((x << 16 | y))
37
38 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
39 {
40 }
41
42 void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
43 {
44 // Should not enqueue more then what we have backing for in the hot tile manager.
45 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
46 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
47
48 if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
49 {
50 return;
51 }
52
53 uint32_t id = TILE_ID(x, y);
54
55 MacroTileQueue &tile = mTiles[id];
56 tile.mWorkItemsFE++;
57 tile.mId = id;
58
59 if (tile.mWorkItemsFE == 1)
60 {
61 tile.clear(mArena);
62 mDirtyTiles.push_back(&tile);
63 }
64
65 mWorkItemsProduced++;
66 tile.enqueue_try_nosync(mArena, pWork);
67 }
68
69 void MacroTileMgr::markTileComplete(uint32_t id)
70 {
71 SWR_ASSERT(mTiles.find(id) != mTiles.end());
72 MacroTileQueue &tile = mTiles[id];
73 uint32_t numTiles = tile.mWorkItemsFE;
74 InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
75
76 _ReadWriteBarrier();
77 tile.mWorkItemsBE += numTiles;
78 SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);
79
80 // clear out tile, but defer fifo clear until the next DC first queues to it.
81 // this prevents worker threads from constantly locking a completed macro tile
82 tile.mWorkItemsFE = 0;
83 tile.mWorkItemsBE = 0;
84 }
85
86 HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerPrivateData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
87 uint32_t renderTargetArrayIndex)
88 {
89 uint32_t x, y;
90 MacroTileMgr::getTileIndices(macroID, x, y);
91
92 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
93 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
94
95 HotTileSet &tile = mHotTiles[x][y];
96 HOTTILE& hotTile = tile.Attachment[attachment];
97 if (hotTile.pBuffer == NULL)
98 {
99 if (create)
100 {
101 uint32_t size = numSamples * mHotTileSize[attachment];
102 uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
103 hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
104 hotTile.state = HOTTILE_INVALID;
105 hotTile.numSamples = numSamples;
106 hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
107 }
108 else
109 {
110 return NULL;
111 }
112 }
113 else
114 {
115 // free the old tile and create a new one with enough space to hold all samples
116 if (numSamples > hotTile.numSamples)
117 {
118 // tile should be either uninitialized or resolved if we're deleting and switching to a
119 // new sample count
120 SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
121 (hotTile.state == HOTTILE_RESOLVED) ||
122 (hotTile.state == HOTTILE_CLEAR));
123 FreeHotTileMem(hotTile.pBuffer);
124
125 uint32_t size = numSamples * mHotTileSize[attachment];
126 uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
127 hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
128 hotTile.state = HOTTILE_INVALID;
129 hotTile.numSamples = numSamples;
130 }
131
132 // if requested render target array index isn't currently loaded, need to store out the current hottile
133 // and load the requested array slice
134 if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
135 {
136 SWR_FORMAT format;
137 switch (attachment)
138 {
139 case SWR_ATTACHMENT_COLOR0:
140 case SWR_ATTACHMENT_COLOR1:
141 case SWR_ATTACHMENT_COLOR2:
142 case SWR_ATTACHMENT_COLOR3:
143 case SWR_ATTACHMENT_COLOR4:
144 case SWR_ATTACHMENT_COLOR5:
145 case SWR_ATTACHMENT_COLOR6:
146 case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
147 case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
148 case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
149 default: SWR_INVALID("Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
150 }
151
152 if (hotTile.state == HOTTILE_CLEAR)
153 {
154 if (attachment == SWR_ATTACHMENT_STENCIL)
155 ClearStencilHotTile(&hotTile);
156 else if (attachment == SWR_ATTACHMENT_DEPTH)
157 ClearDepthHotTile(&hotTile);
158 else
159 ClearColorHotTile(&hotTile);
160
161 hotTile.state = HOTTILE_DIRTY;
162 }
163
164 if (hotTile.state == HOTTILE_DIRTY)
165 {
166 pContext->pfnStoreTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment,
167 x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
168 }
169
170 pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment,
171 x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
172
173 hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
174 hotTile.state = HOTTILE_DIRTY;
175 }
176 }
177 return &tile.Attachment[attachment];
178 }
179
180 HOTTILE* HotTileMgr::GetHotTileNoLoad(
181 SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
182 SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
183 {
184 uint32_t x, y;
185 MacroTileMgr::getTileIndices(macroID, x, y);
186
187 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
188 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
189
190 HotTileSet &tile = mHotTiles[x][y];
191 HOTTILE& hotTile = tile.Attachment[attachment];
192 if (hotTile.pBuffer == NULL)
193 {
194 if (create)
195 {
196 uint32_t size = numSamples * mHotTileSize[attachment];
197 hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, 64);
198 hotTile.state = HOTTILE_INVALID;
199 hotTile.numSamples = numSamples;
200 hotTile.renderTargetArrayIndex = 0;
201 }
202 else
203 {
204 return NULL;
205 }
206 }
207
208 return &hotTile;
209 }
210
211 #if USE_8x2_TILE_BACKEND
212 void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
213 {
214 // Load clear color into SIMD register...
215 float *pClearData = (float *)(pHotTile->clearData);
216 simd16scalar valR = _simd16_broadcast_ss(&pClearData[0]);
217 simd16scalar valG = _simd16_broadcast_ss(&pClearData[1]);
218 simd16scalar valB = _simd16_broadcast_ss(&pClearData[2]);
219 simd16scalar valA = _simd16_broadcast_ss(&pClearData[3]);
220
221 float *pfBuf = (float *)pHotTile->pBuffer;
222 uint32_t numSamples = pHotTile->numSamples;
223
224 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
225 {
226 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
227 {
228 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
229 {
230 _simd16_store_ps(pfBuf, valR);
231 pfBuf += KNOB_SIMD16_WIDTH;
232
233 _simd16_store_ps(pfBuf, valG);
234 pfBuf += KNOB_SIMD16_WIDTH;
235
236 _simd16_store_ps(pfBuf, valB);
237 pfBuf += KNOB_SIMD16_WIDTH;
238
239 _simd16_store_ps(pfBuf, valA);
240 pfBuf += KNOB_SIMD16_WIDTH;
241 }
242 }
243 }
244 }
245
246 void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
247 {
248 // Load clear color into SIMD register...
249 float *pClearData = (float *)(pHotTile->clearData);
250 simd16scalar valZ = _simd16_broadcast_ss(&pClearData[0]);
251
252 float *pfBuf = (float *)pHotTile->pBuffer;
253 uint32_t numSamples = pHotTile->numSamples;
254
255 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
256 {
257 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
258 {
259 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
260 {
261 _simd16_store_ps(pfBuf, valZ);
262 pfBuf += KNOB_SIMD16_WIDTH;
263 }
264 }
265 }
266 }
267
268 void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
269 {
270 // convert from F32 to U8.
271 uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
272 //broadcast 32x into __m256i...
273 simd16scalari valS = _simd16_set1_epi8(clearVal);
274
275 simd16scalari *pBuf = (simd16scalari *)pHotTile->pBuffer;
276 uint32_t numSamples = pHotTile->numSamples;
277
278 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
279 {
280 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
281 {
282 // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
283 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4)
284 {
285 _simd16_store_si(pBuf, valS);
286 pBuf += 1;
287 }
288 }
289 }
290 }
291
292 #else
293 void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
294 {
295 // Load clear color into SIMD register...
296 float *pClearData = (float*)(pHotTile->clearData);
297 simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
298 simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
299 simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
300 simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
301
302 float *pfBuf = (float*)pHotTile->pBuffer;
303 uint32_t numSamples = pHotTile->numSamples;
304
305 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
306 {
307 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
308 {
309 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
310 {
311 _simd_store_ps(pfBuf, valR);
312 pfBuf += KNOB_SIMD_WIDTH;
313 _simd_store_ps(pfBuf, valG);
314 pfBuf += KNOB_SIMD_WIDTH;
315 _simd_store_ps(pfBuf, valB);
316 pfBuf += KNOB_SIMD_WIDTH;
317 _simd_store_ps(pfBuf, valA);
318 pfBuf += KNOB_SIMD_WIDTH;
319 }
320 }
321 }
322 }
323
324 void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
325 {
326 // Load clear color into SIMD register...
327 float *pClearData = (float*)(pHotTile->clearData);
328 simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
329
330 float *pfBuf = (float*)pHotTile->pBuffer;
331 uint32_t numSamples = pHotTile->numSamples;
332
333 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
334 {
335 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
336 {
337 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
338 {
339 _simd_store_ps(pfBuf, valZ);
340 pfBuf += KNOB_SIMD_WIDTH;
341 }
342 }
343 }
344 }
345
346 void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
347 {
348 // convert from F32 to U8.
349 uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
350 //broadcast 32x into __m256i...
351 simdscalari valS = _simd_set1_epi8(clearVal);
352
353 simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
354 uint32_t numSamples = pHotTile->numSamples;
355
356 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
357 {
358 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
359 {
360 // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
361 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
362 {
363 _simd_store_si(pBuf, valS);
364 pBuf += 1;
365 }
366 }
367 }
368 }
369
370 #endif
371 //////////////////////////////////////////////////////////////////////////
372 /// @brief InitializeHotTiles
373 /// for draw calls, we initialize the active hot tiles and perform deferred
374 /// load on them if tile is in invalid state. we do this in the outer thread
375 /// loop instead of inside the draw routine itself mainly for performance,
376 /// to avoid unnecessary setup every triangle
377 /// @todo support deferred clear
378 /// @param pCreateInfo - pointer to creation info.
379 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID)
380 {
381 const API_STATE& state = GetApiState(pDC);
382 HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
383
384 uint32_t x, y;
385 MacroTileMgr::getTileIndices(macroID, x, y);
386 x *= KNOB_MACROTILE_X_DIM;
387 y *= KNOB_MACROTILE_Y_DIM;
388
389 uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
390
391 // check RT if enabled
392 unsigned long rtSlot = 0;
393 uint32_t colorHottileEnableMask = state.colorHottileEnable;
394 while (_BitScanForward(&rtSlot, colorHottileEnableMask))
395 {
396 HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
397
398 if (pHotTile->state == HOTTILE_INVALID)
399 {
400 RDTSC_BEGIN(BELoadTiles, pDC->drawId);
401 // invalid hottile before draw requires a load from surface before we can draw to it
402 pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
403 pHotTile->state = HOTTILE_DIRTY;
404 RDTSC_END(BELoadTiles, 0);
405 }
406 else if (pHotTile->state == HOTTILE_CLEAR)
407 {
408 RDTSC_BEGIN(BELoadTiles, pDC->drawId);
409 // Clear the tile.
410 ClearColorHotTile(pHotTile);
411 pHotTile->state = HOTTILE_DIRTY;
412 RDTSC_END(BELoadTiles, 0);
413 }
414 colorHottileEnableMask &= ~(1 << rtSlot);
415 }
416
417 // check depth if enabled
418 if (state.depthHottileEnable)
419 {
420 HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
421 if (pHotTile->state == HOTTILE_INVALID)
422 {
423 RDTSC_BEGIN(BELoadTiles, pDC->drawId);
424 // invalid hottile before draw requires a load from surface before we can draw to it
425 pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
426 pHotTile->state = HOTTILE_DIRTY;
427 RDTSC_END(BELoadTiles, 0);
428 }
429 else if (pHotTile->state == HOTTILE_CLEAR)
430 {
431 RDTSC_BEGIN(BELoadTiles, pDC->drawId);
432 // Clear the tile.
433 ClearDepthHotTile(pHotTile);
434 pHotTile->state = HOTTILE_DIRTY;
435 RDTSC_END(BELoadTiles, 0);
436 }
437 }
438
439 // check stencil if enabled
440 if (state.stencilHottileEnable)
441 {
442 HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
443 if (pHotTile->state == HOTTILE_INVALID)
444 {
445 RDTSC_BEGIN(BELoadTiles, pDC->drawId);
446 // invalid hottile before draw requires a load from surface before we can draw to it
447 pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
448 pHotTile->state = HOTTILE_DIRTY;
449 RDTSC_END(BELoadTiles, 0);
450 }
451 else if (pHotTile->state == HOTTILE_CLEAR)
452 {
453 RDTSC_BEGIN(BELoadTiles, pDC->drawId);
454 // Clear the tile.
455 ClearStencilHotTile(pHotTile);
456 pHotTile->state = HOTTILE_DIRTY;
457 RDTSC_END(BELoadTiles, 0);
458 }
459 }
460 }