1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for Macro Tile Manager which provides the facilities
26 * for threads to work on an macro tile.
28 ******************************************************************************/
29 #include <unordered_map>
32 #include "core/tilemgr.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
36 #define TILE_ID(x,y) ((x << 16 | y))
38 MacroTileMgr::MacroTileMgr(CachingArena
& arena
) : mArena(arena
)
42 void MacroTileMgr::enqueue(uint32_t x
, uint32_t y
, BE_WORK
*pWork
)
44 // Should not enqueue more then what we have backing for in the hot tile manager.
45 SWR_ASSERT(x
< KNOB_NUM_HOT_TILES_X
);
46 SWR_ASSERT(y
< KNOB_NUM_HOT_TILES_Y
);
48 if ((x
& ~(KNOB_NUM_HOT_TILES_X
-1)) | (y
& ~(KNOB_NUM_HOT_TILES_Y
-1)))
53 uint32_t id
= TILE_ID(x
, y
);
55 MacroTileQueue
&tile
= mTiles
[id
];
59 if (tile
.mWorkItemsFE
== 1)
62 mDirtyTiles
.push_back(&tile
);
66 tile
.enqueue_try_nosync(mArena
, pWork
);
69 void MacroTileMgr::markTileComplete(uint32_t id
)
71 SWR_ASSERT(mTiles
.find(id
) != mTiles
.end());
72 MacroTileQueue
&tile
= mTiles
[id
];
73 uint32_t numTiles
= tile
.mWorkItemsFE
;
74 InterlockedExchangeAdd(&mWorkItemsConsumed
, numTiles
);
77 tile
.mWorkItemsBE
+= numTiles
;
78 SWR_ASSERT(tile
.mWorkItemsFE
== tile
.mWorkItemsBE
);
80 // clear out tile, but defer fifo clear until the next DC first queues to it.
81 // this prevents worker threads from constantly locking a completed macro tile
82 tile
.mWorkItemsFE
= 0;
83 tile
.mWorkItemsBE
= 0;
86 HOTTILE
* HotTileMgr::GetHotTile(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t macroID
, SWR_RENDERTARGET_ATTACHMENT attachment
, bool create
, uint32_t numSamples
,
87 uint32_t renderTargetArrayIndex
)
90 MacroTileMgr::getTileIndices(macroID
, x
, y
);
92 SWR_ASSERT(x
< KNOB_NUM_HOT_TILES_X
);
93 SWR_ASSERT(y
< KNOB_NUM_HOT_TILES_Y
);
95 HotTileSet
&tile
= mHotTiles
[x
][y
];
96 HOTTILE
& hotTile
= tile
.Attachment
[attachment
];
97 if (hotTile
.pBuffer
== NULL
)
101 uint32_t size
= numSamples
* mHotTileSize
[attachment
];
102 uint32_t numaNode
= ((x
^ y
) & pContext
->threadPool
.numaMask
);
103 hotTile
.pBuffer
= (uint8_t*)AllocHotTileMem(size
, KNOB_SIMD_WIDTH
* 4, numaNode
);
104 hotTile
.state
= HOTTILE_INVALID
;
105 hotTile
.numSamples
= numSamples
;
106 hotTile
.renderTargetArrayIndex
= renderTargetArrayIndex
;
115 // free the old tile and create a new one with enough space to hold all samples
116 if (numSamples
> hotTile
.numSamples
)
118 // tile should be either uninitialized or resolved if we're deleting and switching to a
120 SWR_ASSERT((hotTile
.state
== HOTTILE_INVALID
) ||
121 (hotTile
.state
== HOTTILE_RESOLVED
) ||
122 (hotTile
.state
== HOTTILE_CLEAR
));
123 FreeHotTileMem(hotTile
.pBuffer
);
125 uint32_t size
= numSamples
* mHotTileSize
[attachment
];
126 uint32_t numaNode
= ((x
^ y
) & pContext
->threadPool
.numaMask
);
127 hotTile
.pBuffer
= (uint8_t*)AllocHotTileMem(size
, KNOB_SIMD_WIDTH
* 4, numaNode
);
128 hotTile
.state
= HOTTILE_INVALID
;
129 hotTile
.numSamples
= numSamples
;
132 // if requested render target array index isn't currently loaded, need to store out the current hottile
133 // and load the requested array slice
134 if (renderTargetArrayIndex
!= hotTile
.renderTargetArrayIndex
)
139 case SWR_ATTACHMENT_COLOR0
:
140 case SWR_ATTACHMENT_COLOR1
:
141 case SWR_ATTACHMENT_COLOR2
:
142 case SWR_ATTACHMENT_COLOR3
:
143 case SWR_ATTACHMENT_COLOR4
:
144 case SWR_ATTACHMENT_COLOR5
:
145 case SWR_ATTACHMENT_COLOR6
:
146 case SWR_ATTACHMENT_COLOR7
: format
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
147 case SWR_ATTACHMENT_DEPTH
: format
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
148 case SWR_ATTACHMENT_STENCIL
: format
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
149 default: SWR_INVALID("Unknown attachment: %d", attachment
); format
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
152 if (hotTile
.state
== HOTTILE_CLEAR
)
154 if (attachment
== SWR_ATTACHMENT_STENCIL
)
155 ClearStencilHotTile(&hotTile
);
156 else if (attachment
== SWR_ATTACHMENT_DEPTH
)
157 ClearDepthHotTile(&hotTile
);
159 ClearColorHotTile(&hotTile
);
161 hotTile
.state
= HOTTILE_DIRTY
;
164 if (hotTile
.state
== HOTTILE_DIRTY
)
166 pContext
->pfnStoreTile(GetPrivateState(pDC
), format
, attachment
,
167 x
* KNOB_MACROTILE_X_DIM
, y
* KNOB_MACROTILE_Y_DIM
, hotTile
.renderTargetArrayIndex
, hotTile
.pBuffer
);
170 pContext
->pfnLoadTile(GetPrivateState(pDC
), format
, attachment
,
171 x
* KNOB_MACROTILE_X_DIM
, y
* KNOB_MACROTILE_Y_DIM
, renderTargetArrayIndex
, hotTile
.pBuffer
);
173 hotTile
.renderTargetArrayIndex
= renderTargetArrayIndex
;
174 hotTile
.state
= HOTTILE_DIRTY
;
177 return &tile
.Attachment
[attachment
];
180 HOTTILE
* HotTileMgr::GetHotTileNoLoad(
181 SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t macroID
,
182 SWR_RENDERTARGET_ATTACHMENT attachment
, bool create
, uint32_t numSamples
)
185 MacroTileMgr::getTileIndices(macroID
, x
, y
);
187 SWR_ASSERT(x
< KNOB_NUM_HOT_TILES_X
);
188 SWR_ASSERT(y
< KNOB_NUM_HOT_TILES_Y
);
190 HotTileSet
&tile
= mHotTiles
[x
][y
];
191 HOTTILE
& hotTile
= tile
.Attachment
[attachment
];
192 if (hotTile
.pBuffer
== NULL
)
196 uint32_t size
= numSamples
* mHotTileSize
[attachment
];
197 hotTile
.pBuffer
= (uint8_t*)AlignedMalloc(size
, KNOB_SIMD_WIDTH
* 4);
198 hotTile
.state
= HOTTILE_INVALID
;
199 hotTile
.numSamples
= numSamples
;
200 hotTile
.renderTargetArrayIndex
= 0;
211 #if USE_8x2_TILE_BACKEND
212 void HotTileMgr::ClearColorHotTile(const HOTTILE
* pHotTile
) // clear a macro tile from float4 clear data.
214 // Load clear color into SIMD register...
215 float *pClearData
= (float *)(pHotTile
->clearData
);
216 simd16scalar valR
= _simd16_broadcast_ss(&pClearData
[0]);
217 simd16scalar valG
= _simd16_broadcast_ss(&pClearData
[1]);
218 simd16scalar valB
= _simd16_broadcast_ss(&pClearData
[2]);
219 simd16scalar valA
= _simd16_broadcast_ss(&pClearData
[3]);
221 float *pfBuf
= (float *)pHotTile
->pBuffer
;
222 uint32_t numSamples
= pHotTile
->numSamples
;
224 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
226 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
228 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD16_TILE_X_DIM
* SIMD16_TILE_Y_DIM
)
230 _simd16_store_ps(pfBuf
, valR
);
231 pfBuf
+= KNOB_SIMD16_WIDTH
;
233 _simd16_store_ps(pfBuf
, valG
);
234 pfBuf
+= KNOB_SIMD16_WIDTH
;
236 _simd16_store_ps(pfBuf
, valB
);
237 pfBuf
+= KNOB_SIMD16_WIDTH
;
239 _simd16_store_ps(pfBuf
, valA
);
240 pfBuf
+= KNOB_SIMD16_WIDTH
;
246 void HotTileMgr::ClearDepthHotTile(const HOTTILE
* pHotTile
) // clear a macro tile from float4 clear data.
248 // Load clear color into SIMD register...
249 float *pClearData
= (float *)(pHotTile
->clearData
);
250 simd16scalar valZ
= _simd16_broadcast_ss(&pClearData
[0]);
252 float *pfBuf
= (float *)pHotTile
->pBuffer
;
253 uint32_t numSamples
= pHotTile
->numSamples
;
255 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
257 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
259 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD16_TILE_X_DIM
* SIMD16_TILE_Y_DIM
)
261 _simd16_store_ps(pfBuf
, valZ
);
262 pfBuf
+= KNOB_SIMD16_WIDTH
;
268 void HotTileMgr::ClearStencilHotTile(const HOTTILE
* pHotTile
)
270 // convert from F32 to U8.
271 uint8_t clearVal
= (uint8_t)(pHotTile
->clearData
[0]);
272 //broadcast 32x into __m256i...
273 simd16scalari valS
= _simd16_set1_epi8(clearVal
);
275 simd16scalari
*pBuf
= (simd16scalari
*)pHotTile
->pBuffer
;
276 uint32_t numSamples
= pHotTile
->numSamples
;
278 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
280 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
282 // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
283 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD16_TILE_X_DIM
* SIMD16_TILE_Y_DIM
* 4)
285 _simd16_store_si(pBuf
, valS
);
293 void HotTileMgr::ClearColorHotTile(const HOTTILE
* pHotTile
) // clear a macro tile from float4 clear data.
295 // Load clear color into SIMD register...
296 float *pClearData
= (float*)(pHotTile
->clearData
);
297 simdscalar valR
= _simd_broadcast_ss(&pClearData
[0]);
298 simdscalar valG
= _simd_broadcast_ss(&pClearData
[1]);
299 simdscalar valB
= _simd_broadcast_ss(&pClearData
[2]);
300 simdscalar valA
= _simd_broadcast_ss(&pClearData
[3]);
302 float *pfBuf
= (float*)pHotTile
->pBuffer
;
303 uint32_t numSamples
= pHotTile
->numSamples
;
305 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
307 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
309 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD_TILE_X_DIM
* SIMD_TILE_Y_DIM
) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
311 _simd_store_ps(pfBuf
, valR
);
312 pfBuf
+= KNOB_SIMD_WIDTH
;
313 _simd_store_ps(pfBuf
, valG
);
314 pfBuf
+= KNOB_SIMD_WIDTH
;
315 _simd_store_ps(pfBuf
, valB
);
316 pfBuf
+= KNOB_SIMD_WIDTH
;
317 _simd_store_ps(pfBuf
, valA
);
318 pfBuf
+= KNOB_SIMD_WIDTH
;
324 void HotTileMgr::ClearDepthHotTile(const HOTTILE
* pHotTile
) // clear a macro tile from float4 clear data.
326 // Load clear color into SIMD register...
327 float *pClearData
= (float*)(pHotTile
->clearData
);
328 simdscalar valZ
= _simd_broadcast_ss(&pClearData
[0]);
330 float *pfBuf
= (float*)pHotTile
->pBuffer
;
331 uint32_t numSamples
= pHotTile
->numSamples
;
333 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
335 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
337 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD_TILE_X_DIM
* SIMD_TILE_Y_DIM
)
339 _simd_store_ps(pfBuf
, valZ
);
340 pfBuf
+= KNOB_SIMD_WIDTH
;
346 void HotTileMgr::ClearStencilHotTile(const HOTTILE
* pHotTile
)
348 // convert from F32 to U8.
349 uint8_t clearVal
= (uint8_t)(pHotTile
->clearData
[0]);
350 //broadcast 32x into __m256i...
351 simdscalari valS
= _simd_set1_epi8(clearVal
);
353 simdscalari
* pBuf
= (simdscalari
*)pHotTile
->pBuffer
;
354 uint32_t numSamples
= pHotTile
->numSamples
;
356 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
358 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
360 // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
361 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD_TILE_X_DIM
* SIMD_TILE_Y_DIM
* 4)
363 _simd_store_si(pBuf
, valS
);
371 //////////////////////////////////////////////////////////////////////////
372 /// @brief InitializeHotTiles
373 /// for draw calls, we initialize the active hot tiles and perform deferred
374 /// load on them if tile is in invalid state. we do this in the outer thread
375 /// loop instead of inside the draw routine itself mainly for performance,
376 /// to avoid unnecessary setup every triangle
377 /// @todo support deferred clear
378 /// @param pCreateInfo - pointer to creation info.
379 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t macroID
)
381 const API_STATE
& state
= GetApiState(pDC
);
384 MacroTileMgr::getTileIndices(macroID
, x
, y
);
385 x
*= KNOB_MACROTILE_X_DIM
;
386 y
*= KNOB_MACROTILE_Y_DIM
;
388 uint32_t numSamples
= GetNumSamples(state
.rastState
.sampleCount
);
390 // check RT if enabled
391 unsigned long rtSlot
= 0;
392 uint32_t colorHottileEnableMask
= state
.colorHottileEnable
;
393 while (_BitScanForward(&rtSlot
, colorHottileEnableMask
))
395 HOTTILE
* pHotTile
= GetHotTile(pContext
, pDC
, macroID
, (SWR_RENDERTARGET_ATTACHMENT
)(SWR_ATTACHMENT_COLOR0
+ rtSlot
), true, numSamples
);
397 if (pHotTile
->state
== HOTTILE_INVALID
)
399 AR_BEGIN(BELoadTiles
, pDC
->drawId
);
400 // invalid hottile before draw requires a load from surface before we can draw to it
401 pContext
->pfnLoadTile(GetPrivateState(pDC
), KNOB_COLOR_HOT_TILE_FORMAT
, (SWR_RENDERTARGET_ATTACHMENT
)(SWR_ATTACHMENT_COLOR0
+ rtSlot
), x
, y
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
402 pHotTile
->state
= HOTTILE_DIRTY
;
403 AR_END(BELoadTiles
, 0);
405 else if (pHotTile
->state
== HOTTILE_CLEAR
)
407 AR_BEGIN(BELoadTiles
, pDC
->drawId
);
409 ClearColorHotTile(pHotTile
);
410 pHotTile
->state
= HOTTILE_DIRTY
;
411 AR_END(BELoadTiles
, 0);
413 colorHottileEnableMask
&= ~(1 << rtSlot
);
416 // check depth if enabled
417 if (state
.depthHottileEnable
)
419 HOTTILE
* pHotTile
= GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
420 if (pHotTile
->state
== HOTTILE_INVALID
)
422 AR_BEGIN(BELoadTiles
, pDC
->drawId
);
423 // invalid hottile before draw requires a load from surface before we can draw to it
424 pContext
->pfnLoadTile(GetPrivateState(pDC
), KNOB_DEPTH_HOT_TILE_FORMAT
, SWR_ATTACHMENT_DEPTH
, x
, y
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
425 pHotTile
->state
= HOTTILE_DIRTY
;
426 AR_END(BELoadTiles
, 0);
428 else if (pHotTile
->state
== HOTTILE_CLEAR
)
430 AR_BEGIN(BELoadTiles
, pDC
->drawId
);
432 ClearDepthHotTile(pHotTile
);
433 pHotTile
->state
= HOTTILE_DIRTY
;
434 AR_END(BELoadTiles
, 0);
438 // check stencil if enabled
439 if (state
.stencilHottileEnable
)
441 HOTTILE
* pHotTile
= GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
442 if (pHotTile
->state
== HOTTILE_INVALID
)
444 AR_BEGIN(BELoadTiles
, pDC
->drawId
);
445 // invalid hottile before draw requires a load from surface before we can draw to it
446 pContext
->pfnLoadTile(GetPrivateState(pDC
), KNOB_STENCIL_HOT_TILE_FORMAT
, SWR_ATTACHMENT_STENCIL
, x
, y
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
447 pHotTile
->state
= HOTTILE_DIRTY
;
448 AR_END(BELoadTiles
, 0);
450 else if (pHotTile
->state
== HOTTILE_CLEAR
)
452 AR_BEGIN(BELoadTiles
, pDC
->drawId
);
454 ClearStencilHotTile(pHotTile
);
455 pHotTile
->state
= HOTTILE_DIRTY
;
456 AR_END(BELoadTiles
, 0);