swr: [rasterizer core] Arena optimizations - preparing for global allocator.
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / tilemgr.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file tilemgr.cpp
24 *
25 * @brief Implementation for Macro Tile Manager which provides the facilities
26 * for threads to work on a macro tile.
27 *
28 ******************************************************************************/
29 #include <unordered_map>
30
31 #include "fifo.hpp"
32 #include "core/tilemgr.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
35
// Builds a macro tile id by shifting x left by 16 bits and OR-ing in y.
// Both arguments and the shift are parenthesized so that compound argument
// expressions (e.g. `a | b` or a conditional) expand with the intended precedence.
#define TILE_ID(x,y) (((x) << 16) | (y))
37
38 // override new/delete for alignment
39 void *MacroTileMgr::operator new(size_t size)
40 {
41 return _aligned_malloc(size, 64);
42 }
43
44 void MacroTileMgr::operator delete(void *p)
45 {
46 _aligned_free(p);
47 }
48
49 void* DispatchQueue::operator new(size_t size)
50 {
51 return _aligned_malloc(size, 64);
52 }
53
54 void DispatchQueue::operator delete(void *p)
55 {
56 _aligned_free(p);
57 }
58
59 MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
60 {
61 }
62
63 void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
64 {
65 // Should not enqueue more then what we have backing for in the hot tile manager.
66 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
67 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
68
69 if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
70 {
71 return;
72 }
73
74 uint32_t id = TILE_ID(x, y);
75
76 MacroTileQueue &tile = mTiles[id];
77 tile.mWorkItemsFE++;
78
79 if (tile.mWorkItemsFE == 1)
80 {
81 tile.clear(mArena);
82 mDirtyTiles.push_back(id);
83 }
84
85 mWorkItemsProduced++;
86 tile.enqueue_try_nosync(mArena, pWork);
87 }
88
89 void MacroTileMgr::markTileComplete(uint32_t id)
90 {
91 SWR_ASSERT(mTiles.find(id) != mTiles.end());
92 MacroTileQueue &tile = mTiles[id];
93 uint32_t numTiles = tile.mWorkItemsFE;
94 InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
95
96 _ReadWriteBarrier();
97 tile.mWorkItemsBE += numTiles;
98 SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);
99
100 // clear out tile, but defer fifo clear until the next DC first queues to it.
101 // this prevents worker threads from constantly locking a completed macro tile
102 tile.mWorkItemsFE = 0;
103 tile.mWorkItemsBE = 0;
104 }
105
106 HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
107 uint32_t renderTargetArrayIndex)
108 {
109 uint32_t x, y;
110 MacroTileMgr::getTileIndices(macroID, x, y);
111
112 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
113 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
114
115 HotTileSet &tile = mHotTiles[x][y];
116 HOTTILE& hotTile = tile.Attachment[attachment];
117 if (hotTile.pBuffer == NULL)
118 {
119 if (create)
120 {
121 uint32_t size = numSamples * mHotTileSize[attachment];
122 hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
123 hotTile.state = HOTTILE_INVALID;
124 hotTile.numSamples = numSamples;
125 hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
126 }
127 else
128 {
129 return NULL;
130 }
131 }
132 else
133 {
134 // free the old tile and create a new one with enough space to hold all samples
135 if (numSamples > hotTile.numSamples)
136 {
137 // tile should be either uninitialized or resolved if we're deleting and switching to a
138 // new sample count
139 SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
140 (hotTile.state == HOTTILE_RESOLVED) ||
141 (hotTile.state == HOTTILE_CLEAR));
142 _aligned_free(hotTile.pBuffer);
143
144 uint32_t size = numSamples * mHotTileSize[attachment];
145 hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
146 hotTile.state = HOTTILE_INVALID;
147 hotTile.numSamples = numSamples;
148 }
149
150 // if requested render target array index isn't currently loaded, need to store out the current hottile
151 // and load the requested array slice
152 if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
153 {
154 SWR_FORMAT format;
155 switch (attachment)
156 {
157 case SWR_ATTACHMENT_COLOR0:
158 case SWR_ATTACHMENT_COLOR1:
159 case SWR_ATTACHMENT_COLOR2:
160 case SWR_ATTACHMENT_COLOR3:
161 case SWR_ATTACHMENT_COLOR4:
162 case SWR_ATTACHMENT_COLOR5:
163 case SWR_ATTACHMENT_COLOR6:
164 case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
165 case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
166 case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
167 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
168 }
169
170 if (hotTile.state == HOTTILE_DIRTY)
171 {
172 pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
173 x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
174 }
175
176 pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
177 x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
178
179 hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
180 hotTile.state = HOTTILE_DIRTY;
181 }
182 }
183 return &tile.Attachment[attachment];
184 }
185
186 HOTTILE* HotTileMgr::GetHotTileNoLoad(
187 SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
188 SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
189 {
190 uint32_t x, y;
191 MacroTileMgr::getTileIndices(macroID, x, y);
192
193 SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
194 SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
195
196 HotTileSet &tile = mHotTiles[x][y];
197 HOTTILE& hotTile = tile.Attachment[attachment];
198 if (hotTile.pBuffer == NULL)
199 {
200 if (create)
201 {
202 uint32_t size = numSamples * mHotTileSize[attachment];
203 hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
204 hotTile.state = HOTTILE_INVALID;
205 hotTile.numSamples = numSamples;
206 hotTile.renderTargetArrayIndex = 0;
207 }
208 else
209 {
210 return NULL;
211 }
212 }
213
214 return &hotTile;
215 }
216
217 void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
218 {
219 // Load clear color into SIMD register...
220 float *pClearData = (float*)(pHotTile->clearData);
221 simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
222 simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
223 simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
224 simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
225
226 float *pfBuf = (float*)pHotTile->pBuffer;
227 uint32_t numSamples = pHotTile->numSamples;
228
229 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
230 {
231 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
232 {
233 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
234 {
235 _simd_store_ps(pfBuf, valR);
236 pfBuf += KNOB_SIMD_WIDTH;
237 _simd_store_ps(pfBuf, valG);
238 pfBuf += KNOB_SIMD_WIDTH;
239 _simd_store_ps(pfBuf, valB);
240 pfBuf += KNOB_SIMD_WIDTH;
241 _simd_store_ps(pfBuf, valA);
242 pfBuf += KNOB_SIMD_WIDTH;
243 }
244 }
245 }
246 }
247
248 void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
249 {
250 // Load clear color into SIMD register...
251 float *pClearData = (float*)(pHotTile->clearData);
252 simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
253
254 float *pfBuf = (float*)pHotTile->pBuffer;
255 uint32_t numSamples = pHotTile->numSamples;
256
257 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
258 {
259 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
260 {
261 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
262 {
263 _simd_store_ps(pfBuf, valZ);
264 pfBuf += KNOB_SIMD_WIDTH;
265 }
266 }
267 }
268 }
269
270 void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
271 {
272 // convert from F32 to U8.
273 uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
274 //broadcast 32x into __m256i...
275 simdscalari valS = _simd_set1_epi8(clearVal);
276
277 simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
278 uint32_t numSamples = pHotTile->numSamples;
279
280 for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
281 {
282 for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
283 {
284 // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
285 for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
286 {
287 _simd_store_si(pBuf, valS);
288 pBuf += 1;
289 }
290 }
291 }
292 }
293
294 //////////////////////////////////////////////////////////////////////////
295 /// @brief InitializeHotTiles
296 /// for draw calls, we initialize the active hot tiles and perform deferred
297 /// load on them if tile is in invalid state. we do this in the outer thread
298 /// loop instead of inside the draw routine itself mainly for performance,
299 /// to avoid unnecessary setup every triangle
300 /// @todo support deferred clear
301 /// @param pCreateInfo - pointer to creation info.
302 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
303 {
304 const API_STATE& state = GetApiState(pDC);
305 HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
306
307 uint32_t x, y;
308 MacroTileMgr::getTileIndices(macroID, x, y);
309 x *= KNOB_MACROTILE_X_DIM;
310 y *= KNOB_MACROTILE_Y_DIM;
311
312 uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
313
314 // check RT if enabled
315 unsigned long rtSlot = 0;
316 uint32_t colorHottileEnableMask = state.colorHottileEnable;
317 while (_BitScanForward(&rtSlot, colorHottileEnableMask))
318 {
319 HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
320
321 if (pHotTile->state == HOTTILE_INVALID)
322 {
323 RDTSC_START(BELoadTiles);
324 // invalid hottile before draw requires a load from surface before we can draw to it
325 pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
326 pHotTile->state = HOTTILE_DIRTY;
327 RDTSC_STOP(BELoadTiles, 0, 0);
328 }
329 else if (pHotTile->state == HOTTILE_CLEAR)
330 {
331 RDTSC_START(BELoadTiles);
332 // Clear the tile.
333 ClearColorHotTile(pHotTile);
334 pHotTile->state = HOTTILE_DIRTY;
335 RDTSC_STOP(BELoadTiles, 0, 0);
336 }
337 colorHottileEnableMask &= ~(1 << rtSlot);
338 }
339
340 // check depth if enabled
341 if (state.depthHottileEnable)
342 {
343 HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
344 if (pHotTile->state == HOTTILE_INVALID)
345 {
346 RDTSC_START(BELoadTiles);
347 // invalid hottile before draw requires a load from surface before we can draw to it
348 pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
349 pHotTile->state = HOTTILE_DIRTY;
350 RDTSC_STOP(BELoadTiles, 0, 0);
351 }
352 else if (pHotTile->state == HOTTILE_CLEAR)
353 {
354 RDTSC_START(BELoadTiles);
355 // Clear the tile.
356 ClearDepthHotTile(pHotTile);
357 pHotTile->state = HOTTILE_DIRTY;
358 RDTSC_STOP(BELoadTiles, 0, 0);
359 }
360 }
361
362 // check stencil if enabled
363 if (state.stencilHottileEnable)
364 {
365 HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
366 if (pHotTile->state == HOTTILE_INVALID)
367 {
368 RDTSC_START(BELoadTiles);
369 // invalid hottile before draw requires a load from surface before we can draw to it
370 pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
371 pHotTile->state = HOTTILE_DIRTY;
372 RDTSC_STOP(BELoadTiles, 0, 0);
373 }
374 else if (pHotTile->state == HOTTILE_CLEAR)
375 {
376 RDTSC_START(BELoadTiles);
377 // Clear the tile.
378 ClearStencilHotTile(pHotTile);
379 pHotTile->state = HOTTILE_DIRTY;
380 RDTSC_STOP(BELoadTiles, 0, 0);
381 }
382 }
383 }