pContext->driverType = pCreateInfo->driver;
pContext->privateStateSize = pCreateInfo->privateStateSize;
- pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
- memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
- pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
- memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);
+ pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+ pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
pContext->numSubContexts = pCreateInfo->maxSubContexts;
if (pContext->numSubContexts > 1)
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
{
pContext->dcRing[dc].pArena = new Arena();
- pContext->dcRing[dc].inUse = false;
pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
}
- pContext->nextDrawId = 1;
- pContext->DrawEnqueued = 1;
-
// State setup AFTER context is fully initialized
SetupDefaultState(pContext);
_aligned_free(pContext->pScratch[i]);
}
- _aligned_free(pContext->dcRing);
- _aligned_free(pContext->dsRing);
_aligned_free(pContext->subCtxSave);
delete(pContext->pHotTileMgr);
pContext->FifosNotEmpty.notify_all();
}
-bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC)
+template<bool IsDraw>
+void QueueWork(SWR_CONTEXT *pContext)
{
- // For single thread nothing should still be drawing.
- if (KNOB_SINGLE_THREADED) { return false; }
-
- if (pDC->isCompute)
+ if (IsDraw)
{
- if (pDC->doneCompute)
- {
- pDC->inUse = false;
- return false;
- }
+ // Each worker thread looks at a DC for both FE and BE work at different times, so we
+ // multiply threadsDone by 2. When the threadsDone counter reaches 0, all workers
+ // have moved past this DC (i.e. each worker has checked this DC for both FE and BE work
+ // and moved on once all work was done).
+ pContext->pCurDrawContext->threadsDone =
+ pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
}
-
- // Check if backend work is done. First make sure all triangles have been binned.
- if (pDC->doneFE == true)
+ else
{
- // ensure workers have all moved passed this draw
- if (pDC->threadsDoneFE != pContext->NumWorkerThreads)
- {
- return true;
- }
-
- if (pDC->threadsDoneBE != pContext->NumWorkerThreads)
- {
- return true;
- }
-
- pDC->inUse = false; // all work is done.
+ pContext->pCurDrawContext->threadsDone =
+ pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
}
- return pDC->inUse;
-}
-
-void QueueDraw(SWR_CONTEXT *pContext)
-{
- SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
- pContext->pCurDrawContext->inUse = true;
-
_ReadWriteBarrier();
{
std::unique_lock<std::mutex> lock(pContext->WaitLock);
- pContext->DrawEnqueued++;
+ pContext->dcRing.Enqueue();
}
if (KNOB_SINGLE_THREADED)
uint32_t mxcsr = _mm_getcsr();
_mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
- std::unordered_set<uint32_t> lockedTiles;
- uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
- WorkOnFifoFE(pContext, 0, curDraw[0], 0);
- WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+ if (IsDraw)
+ {
+ std::unordered_set<uint32_t> lockedTiles;
+ uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+ WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+ WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+ }
+ else
+ {
+ uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+ WorkOnCompute(pContext, 0, curDispatch);
+ }
+
+ // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
+ if (!pContext->dcRing.IsEmpty())
+ {
+ pContext->dcRing.Dequeue();
+ }
// restore csr
_mm_setcsr(mxcsr);
pContext->pCurDrawContext = nullptr;
}
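To make the producer-side ordering explicit, here is a minimal standalone model (hypothetical names, std::atomic in place of the SWR OS shims) of what QueueWork does before handing the DC to workers: size the completion count first, then publish the enqueue.

#include <atomic>
#include <cstdint>

struct ModelDC
{
    std::atomic<int64_t> threadsDone{0};
};

// numWorkers == 0 models KNOB_SINGLE_THREADED, where the API thread does the work itself.
inline void ModelQueueWork(ModelDC& dc, uint32_t numWorkers, bool isDraw)
{
    // A draw is visited twice per worker (once for FE, once for BE); a dispatch only once.
    const int64_t visits = isDraw ? 2 : 1;
    dc.threadsDone.store(numWorkers ? int64_t(numWorkers) * visits : visits,
                         std::memory_order_release);
    // The real code then advances the dcRing head under WaitLock and, when single
    // threaded, runs the FE/BE (or compute) work inline itself; otherwise the worker
    // threads are woken, as in the previous QueueDraw/QueueDispatch paths.
}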
-///@todo Combine this with QueueDraw
-void QueueDispatch(SWR_CONTEXT *pContext)
+INLINE void QueueDraw(SWR_CONTEXT* pContext)
{
- SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
- pContext->pCurDrawContext->inUse = true;
-
- _ReadWriteBarrier();
- {
- std::unique_lock<std::mutex> lock(pContext->WaitLock);
- pContext->DrawEnqueued++;
- }
-
- if (KNOB_SINGLE_THREADED)
- {
- // flush denormals to 0
- uint32_t mxcsr = _mm_getcsr();
- _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
-
- uint64_t curDispatch = pContext->pCurDrawContext->drawId;
- WorkOnCompute(pContext, 0, curDispatch);
-
- // restore csr
- _mm_setcsr(mxcsr);
- }
- else
- {
- RDTSC_START(APIDrawWakeAllThreads);
- WakeAllThreads(pContext);
- RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
- }
+ QueueWork<true>(pContext);
+}
- // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
- pContext->pPrevDrawContext = pContext->pCurDrawContext;
- pContext->pCurDrawContext = nullptr;
+INLINE void QueueDispatch(SWR_CONTEXT* pContext)
+{
+ QueueWork<false>(pContext);
}
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
// If current draw context is null then need to obtain a new draw context to use from ring.
if (pContext->pCurDrawContext == nullptr)
{
- uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;
-
- DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
- pContext->pCurDrawContext = pCurDrawContext;
-
- // Need to wait until this draw context is available to use.
- while (StillDrawing(pContext, pCurDrawContext))
+ // Need to wait for a free entry.
+ while (pContext->dcRing.IsFull())
{
_mm_pause();
}
+ uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+ DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
+ pContext->pCurDrawContext = pCurDrawContext;
+
// Assign next available entry in DS ring to this DC.
uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
pCurDrawContext->pState = &pContext->dsRing[dsIndex];
pCurDrawContext->pArena->Reset();
pCurDrawContext->pContext = pContext;
pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
- pCurDrawContext->inUse = false;
- pCurDrawContext->doneCompute = false;
pCurDrawContext->doneFE = false;
pCurDrawContext->FeLock = 0;
- pCurDrawContext->threadsDoneFE = 0;
- pCurDrawContext->threadsDoneBE = 0;
+ pCurDrawContext->threadsDone = 0;
pCurDrawContext->pTileMgr->initialize();
// Assign unique drawId for this DC
- pCurDrawContext->drawId = pContext->nextDrawId++;
+ pCurDrawContext->drawId = pContext->dcRing.GetHead();
}
else
{
SWR_CONTEXT *pContext = GetContext(hContext);
RDTSC_START(APIWaitForIdle);
- // Wait for all work to complete.
- for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
- {
- DRAW_CONTEXT *pDC = &pContext->dcRing[dc];
- while (StillDrawing(pContext, pDC))
- {
- _mm_pause();
- }
+ while (!pContext->dcRing.IsEmpty())
+ {
+ _mm_pause();
}
+
RDTSC_STOP(APIWaitForIdle, 1, 0);
}
#include "core/knobs.h"
#include "common/simdintrin.h"
#include "core/threads.h"
+#include "ringbuffer.h"
// x.8 fixed point precision values
#define FIXED_POINT_SHIFT 8
FE_WORK FeWork;
volatile OSALIGNLINE(uint32_t) FeLock;
- volatile OSALIGNLINE(bool) inUse;
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
-
- // Have all worker threads moved past draw in DC ring?
- volatile OSALIGNLINE(uint32_t) threadsDoneFE;
- volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+ volatile OSALIGNLINE(int64_t) threadsDone;
uint64_t dependency;
MacroTileMgr* pTileMgr;
// The following fields are valid if isCompute is true.
- volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute)
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
DRAW_STATE* pState;
// 3. State - When an applications sets state after draw
// a. Same as step 1.
// b. State is copied from prev draw context to current.
- DRAW_CONTEXT* dcRing;
+ RingBuffer<DRAW_CONTEXT> dcRing;
DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
// These split draws all have identical state. So instead of storing the state directly
// in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
// to reference a single entry in the DS ring.
- DRAW_STATE* dsRing;
+ RingBuffer<DRAW_STATE> dsRing;
uint32_t curStateId; // Current index to the next available entry in the DS ring.
std::condition_variable FifosNotEmpty;
std::mutex WaitLock;
- // Draw Contexts will get a unique drawId generated from this
- uint64_t nextDrawId;
-
- // most recent draw id enqueued by the API thread
- // written by api thread, read by multiple workers
- OSALIGNLINE(volatile uint64_t) DrawEnqueued;
-
DRIVER_TYPE driverType;
uint32_t privateStateSize;
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file ringbuffer.h
+*
+* @brief RingBuffer
+* The RingBuffer class manages all aspects of the ring buffer, including
+* the head/tail indices and the backing storage.
+*
+******************************************************************************/
+#pragma once
+
+template<typename T>
+class RingBuffer
+{
+public:
+ RingBuffer()
+ : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0)
+ {
+ }
+
+ ~RingBuffer()
+ {
+ Destroy();
+ }
+
+ void Init(uint32_t numEntries)
+ {
+ SWR_ASSERT(numEntries > 0);
+ mNumEntries = numEntries;
+ mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64);
+ SWR_ASSERT(mpRingBuffer != nullptr);
+ memset(mpRingBuffer, 0, sizeof(T)*numEntries);
+ }
+
+ void Destroy()
+ {
+ _aligned_free(mpRingBuffer);
+ mpRingBuffer = nullptr;
+ }
+
+ T& operator[](const uint32_t index)
+ {
+ SWR_ASSERT(index < mNumEntries);
+ return mpRingBuffer[index];
+ }
+
+ INLINE void Enqueue()
+ {
+ mRingHead++; // There's only one producer.
+ }
+
+ INLINE void Dequeue()
+ {
+ InterlockedIncrement64((volatile int64_t*)&mRingTail); // There are multiple consumers.
+ }
+
+ INLINE bool IsEmpty()
+ {
+ return (GetHead() == GetTail());
+ }
+
+ INLINE bool IsFull()
+ {
+ ///@note We don't handle the wrap case because we use 64-bit indices.
+ /// It would take 11 million years to wrap at 50,000 DCs per second.
+ /// With 32-bit indices it would take only about 23 hours to wrap.
+ uint64_t numEnqueued = GetHead() - GetTail();
+ SWR_ASSERT(numEnqueued <= mNumEntries);
+
+ return (numEnqueued == mNumEntries);
+ }
+
+ INLINE volatile uint64_t GetTail() { return mRingTail; }
+ INLINE volatile uint64_t GetHead() { return mRingHead; }
+
+private:
+ T* mpRingBuffer;
+ uint32_t mNumEntries;
+
+ OSALIGNLINE(volatile uint64_t) mRingHead; // Producer counter, advanced by Enqueue()
+ OSALIGNLINE(volatile uint64_t) mRingTail; // Consumer counter, advanced by Dequeue()
+};
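For reference, the same single-producer / multi-consumer protocol can be exercised in isolation. The sketch below is a hypothetical standalone analogue using std::atomic rather than the SWR OS shims: only the API thread advances the head, workers advance the tail, and occupancy is always head minus tail.

#include <atomic>
#include <cassert>
#include <cstdint>

class CounterRing
{
public:
    explicit CounterRing(uint64_t numEntries) : mNumEntries(numEntries) {}

    // Single producer, so a plain read-modify-write of the head is safe.
    void Enqueue() { mHead.store(mHead.load(std::memory_order_relaxed) + 1,
                                 std::memory_order_release); }
    // Multiple consumers, so the tail needs an atomic increment.
    void Dequeue() { mTail.fetch_add(1, std::memory_order_release); }

    bool     IsEmpty() const { return Head() == Tail(); }
    bool     IsFull()  const { return Head() - Tail() == mNumEntries; }
    uint64_t Head()    const { return mHead.load(std::memory_order_acquire); }
    uint64_t Tail()    const { return mTail.load(std::memory_order_acquire); }

private:
    const uint64_t mNumEntries;
    std::atomic<uint64_t> mHead{0}; // producer counter
    std::atomic<uint64_t> mTail{0}; // consumer counter
};

int main()
{
    CounterRing ring(4);
    for (int i = 0; i < 4; ++i) ring.Enqueue();
    assert(ring.IsFull());          // producer must wait here (GetDrawContext spins on IsFull)
    ring.Dequeue();                 // a worker retires the oldest entry
    assert(!ring.IsFull() && !ring.IsEmpty());
    return 0;
}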
INLINE
uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
- //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
- //return result;
- return pContext->DrawEnqueued;
+ return pContext->dcRing.GetHead();
}
INLINE
}
}
+INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+{
+ int64_t result = InterlockedDecrement64(&pDC->threadsDone);
+
+ if (result == 0)
+ {
+ _ReadWriteBarrier();
+
+ pContext->dcRing.Dequeue(); // Remove from tail
+ }
+}
+
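As a worked example of the retirement count (assuming, hypothetically, four worker threads): a draw starts at threadsDone = 4 * 2 = 8 because each worker visits the DC once for FE work and once for BE work, and only the caller that decrements the counter to zero dequeues the DC. A small standalone check of that invariant:

#include <atomic>
#include <cassert>
#include <cstdint>

int main()
{
    const int64_t numWorkers = 4;                       // hypothetical worker count
    std::atomic<int64_t> threadsDone{numWorkers * 2};   // a draw: one FE visit + one BE visit per worker

    int retired = 0;
    for (int64_t i = 0; i < numWorkers * 2; ++i)
    {
        // Mirrors InterlockedDecrement64(&pDC->threadsDone) == 0 in CompleteDrawContext.
        if (threadsDone.fetch_sub(1) - 1 == 0)
        {
            ++retired;                                  // only the last visitor dequeues the DC
        }
    }
    assert(retired == 1);                               // exactly one worker retires the draw
    return 0;
}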
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
{
// increment our current draw id to the first incomplete draw
if (isWorkComplete)
{
curDrawBE++;
- InterlockedIncrement(&pDC->threadsDoneBE);
+ CompleteDrawContext(pContext, pDC);
}
else
{
{
// We can increment the current BE and safely move to next draw since we know this draw is complete.
curDrawBE++;
- InterlockedIncrement(&pDC->threadsDoneBE);
+ CompleteDrawContext(pContext, pDC);
lastRetiredDraw++;
DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
{
+ CompleteDrawContext(pContext, pDC);
curDrawFE++;
- InterlockedIncrement(&pDC->threadsDoneFE);
}
else
{
// Is there any work remaining?
if (queue.getNumQueued() > 0)
{
- bool lastToComplete = false;
-
uint32_t threadGroupId = 0;
while (queue.getWork(threadGroupId))
{
ProcessComputeBE(pDC, workerId, threadGroupId);
- lastToComplete = queue.finishedWork();
- }
-
- _ReadWriteBarrier();
-
- if (lastToComplete)
- {
- SWR_ASSERT(queue.isWorkComplete() == true);
- pDC->doneCompute = true;
+ queue.finishedWork();
}
}
}
// the worker can safely increment its oldestDraw counter and move on to the next draw.
std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
- auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };
+ auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
- uint64_t curDrawBE = 1;
- uint64_t curDrawFE = 1;
+ uint64_t curDrawBE = 0;
+ uint64_t curDrawFE = 0;
while (pContext->threadPool.inThreadShutdown == false)
{