swr/rast: Consolidate archrast Draw events
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
index e67ede2610f12d3f3a2b4bb2a2de6548f331bc12..cb98cbe7eeafbad5cf3d2c9a770edeb4443c0eab 100644 (file)
@@ -43,7 +43,6 @@
 #include "core/clip.h"
 #include "core/utils.h"
 
-#include "common/simdintrin.h"
 #include "common/os.h"
 
 static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y };
@@ -73,16 +72,21 @@ HANDLE SwrCreateContext(
     memset(pContextMem, 0, sizeof(SWR_CONTEXT));
     SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
 
-    pContext->driverType = pCreateInfo->driver;
     pContext->privateStateSize = pCreateInfo->privateStateSize;
 
-    pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
-    pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+    pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
+    if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
+    {
+        pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
+    }
 
-    pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-    pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
+    pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
 
-    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+    pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
+
+    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
         new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
@@ -91,16 +95,32 @@ HANDLE SwrCreateContext(
         pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
 
-    pContext->threadInfo.MAX_WORKER_THREADS        = KNOB_MAX_WORKER_THREADS;
-    pContext->threadInfo.MAX_NUMA_NODES            = KNOB_MAX_NUMA_NODES;
-    pContext->threadInfo.MAX_CORES_PER_NUMA_NODE   = KNOB_MAX_CORES_PER_NUMA_NODE;
-    pContext->threadInfo.MAX_THREADS_PER_CORE      = KNOB_MAX_THREADS_PER_CORE;
-    pContext->threadInfo.SINGLE_THREADED           = KNOB_SINGLE_THREADED;
-
     if (pCreateInfo->pThreadInfo)
     {
         pContext->threadInfo = *pCreateInfo->pThreadInfo;
     }
+    else
+    {
+        pContext->threadInfo.MAX_WORKER_THREADS         = KNOB_MAX_WORKER_THREADS;
+        pContext->threadInfo.BASE_NUMA_NODE             = KNOB_BASE_NUMA_NODE;
+        pContext->threadInfo.BASE_CORE                  = KNOB_BASE_CORE;
+        pContext->threadInfo.BASE_THREAD                = KNOB_BASE_THREAD;
+        pContext->threadInfo.MAX_NUMA_NODES             = KNOB_MAX_NUMA_NODES;
+        pContext->threadInfo.MAX_CORES_PER_NUMA_NODE    = KNOB_MAX_CORES_PER_NUMA_NODE;
+        pContext->threadInfo.MAX_THREADS_PER_CORE       = KNOB_MAX_THREADS_PER_CORE;
+        pContext->threadInfo.SINGLE_THREADED            = KNOB_SINGLE_THREADED;
+    }
+
+    if (pCreateInfo->pApiThreadInfo)
+    {
+        pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
+    }
+    else
+    {
+        pContext->apiThreadInfo.bindAPIThread0          = true;
+        pContext->apiThreadInfo.numAPIReservedThreads   = 1;
+        pContext->apiThreadInfo.numAPIThreadsPerCore    = 1;
+    }
 
     memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
     memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
@@ -109,14 +129,18 @@ HANDLE SwrCreateContext(
 
     CreateThreadPool(pContext, &pContext->threadPool);
 
+    if (pContext->apiThreadInfo.bindAPIThread0)
+    {
+        BindApiThread(pContext, 0);
+    }
+
     pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
-    pContext->pStats = new SWR_STATS[pContext->NumWorkerThreads];
+    pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
 
 #if defined(KNOB_ENABLE_AR)
     // Setup ArchRast thread contexts which includes +1 for API thread.
     pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1];
-    pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext();
-    _AR_EVENT(pContext->pArContext[pContext->NumWorkerThreads], ThreadStartApiEvent());
+    pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
 #endif
 
     // Allocate scratch space for workers.
@@ -136,20 +160,21 @@ HANDLE SwrCreateContext(
 
 #if defined(KNOB_ENABLE_AR)
         // Initialize worker thread context for ArchRast.
-        pContext->pArContext[i] = ArchRast::CreateThreadContext();
-        _AR_EVENT(pContext->pArContext[i], ThreadStartWorkerEvent());
+        pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
 #endif
     }
 
+#if defined(KNOB_ENABLE_AR)
+    // cache the API thread event manager, for use with sim layer
+    pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads];
+#endif
+
     // State setup AFTER context is fully initialized
     SetupDefaultState(pContext);
 
     // initialize hot tile manager
     pContext->pHotTileMgr = new HotTileMgr();
 
-    // initialize function pointer tables
-    InitClearTilesTable();
-
     // initialize callback functions
     pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
     pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
@@ -157,6 +182,7 @@ HANDLE SwrCreateContext(
     pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
     pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
     pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
+    
 
     // pass pointer to bucket manager back to caller
 #ifdef KNOB_ENABLE_RDTSC
@@ -179,7 +205,7 @@ template<bool IsDraw>
 void QueueWork(SWR_CONTEXT *pContext)
 {
     DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
-    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+    uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
 
     if (IsDraw)
     {
@@ -195,7 +221,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 
     if (IsDraw)
     {
-        InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE);
+        InterlockedIncrement(&pContext->drawsOutstandingFE);
     }
 
     _ReadWriteBarrier();
@@ -230,9 +256,9 @@ void QueueWork(SWR_CONTEXT *pContext)
     }
     else
     {
-        AR_API_BEGIN(APIDrawWakeAllThreads, pDC->drawId);
+        RDTSC_BEGIN(APIDrawWakeAllThreads, pDC->drawId);
         WakeAllThreads(pContext);
-        AR_API_END(APIDrawWakeAllThreads, 1);
+        RDTSC_END(APIDrawWakeAllThreads, 1);
     }
 
     // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
@@ -252,7 +278,7 @@ INLINE void QueueDispatch(SWR_CONTEXT* pContext)
 
 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
 {
-    AR_API_BEGIN(APIGetDrawContext, 0);
+    RDTSC_BEGIN(APIGetDrawContext, 0);
     // If current draw context is null then need to obtain a new draw context to use from ring.
     if (pContext->pCurDrawContext == nullptr)
     {
@@ -263,7 +289,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         }
 
         uint64_t curDraw = pContext->dcRing.GetHead();
-        uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+        uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
 
         if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
             (curDraw - pContext->lastDrawChecked) > 0x10000)
@@ -279,7 +305,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         pContext->pCurDrawContext = pCurDrawContext;
 
         // Assign next available entry in DS ring to this DC.
-        uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
+        uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
 
         // Copy previous state to current state.
@@ -317,7 +343,10 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
 
         SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
 
+        // Reset dependency
         pCurDrawContext->dependent = false;
+        pCurDrawContext->dependentFE = false;
+
         pCurDrawContext->pContext = pContext;
         pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
 
@@ -338,7 +367,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
     }
 
-    AR_API_END(APIGetDrawContext, 0);
+    RDTSC_END(APIGetDrawContext, 0);
     return pContext->pCurDrawContext;
 }
 
@@ -364,9 +393,9 @@ void SwrDestroyContext(HANDLE hContext)
     DestroyThreadPool(pContext, &pContext->threadPool);
 
     // free the fifos
-    for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
+    for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
     {
-        delete[] pContext->dcRing[i].dynState.pStats;
+        AlignedFree(pContext->dcRing[i].dynState.pStats);
         delete pContext->dcRing[i].pArena;
         delete pContext->dsRing[i].pArena;
         pContext->pMacroTileManagerArray[i].~MacroTileMgr();
@@ -391,8 +420,7 @@ void SwrDestroyContext(HANDLE hContext)
     }
 
     delete[] pContext->ppScratch;
-    delete[] pContext->pArContext;
-    delete[] pContext->pStats;
+    AlignedFree(pContext->pStats);
 
     delete(pContext->pHotTileMgr);
 
@@ -400,6 +428,12 @@ void SwrDestroyContext(HANDLE hContext)
     AlignedFree(GetContext(hContext));
 }
 
+void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    BindApiThread(pContext, apiThreadId);
+}
+
 void SWR_API SwrSaveState(
     HANDLE hContext,
     void* pOutputStateBlock,
@@ -443,7 +477,7 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APISync, 0);
+    RDTSC_BEGIN(APISync, 0);
 
     pDC->FeWork.type = SYNC;
     pDC->FeWork.pfnWork = ProcessSync;
@@ -454,38 +488,48 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint
     pDC->retireCallback.userData2 = userData2;
     pDC->retireCallback.userData3 = userData3;
 
+    AR_API_EVENT(SwrSyncEvent(pDC->drawId));
+
     //enqueue
     QueueDraw(pContext);
 
-    AR_API_END(APISync, 1);
+    RDTSC_END(APISync, 1);
+}
+
+void SwrStallBE(HANDLE hContext)
+{
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    pDC->dependent = true;
 }
 
 void SwrWaitForIdle(HANDLE hContext)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
 
-    AR_API_BEGIN(APIWaitForIdle, 0);
+    RDTSC_BEGIN(APIWaitForIdle, 0);
 
     while (!pContext->dcRing.IsEmpty())
     {
         _mm_pause();
     }
 
-    AR_API_END(APIWaitForIdle, 1);
+    RDTSC_END(APIWaitForIdle, 1);
 }
 
 void SwrWaitForIdleFE(HANDLE hContext)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
 
-    AR_API_BEGIN(APIWaitForIdle, 0);
+    RDTSC_BEGIN(APIWaitForIdle, 0);
 
     while (pContext->drawsOutstandingFE > 0)
     {
         _mm_pause();
     }
 
-    AR_API_END(APIWaitForIdle, 1);
+    RDTSC_END(APIWaitForIdle, 1);
 }
 
 void SwrSetVertexBuffers(
@@ -590,12 +634,16 @@ void SwrSetCsFunc(
     HANDLE hContext,
     PFN_CS_FUNC pfnCsFunc,
     uint32_t totalThreadsInGroup,
-    uint32_t totalSpillFillSize)
+    uint32_t totalSpillFillSize,
+    uint32_t scratchSpaceSizePerInstance,
+    uint32_t numInstances)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
     pState->pfnCsFunc = pfnCsFunc;
     pState->totalThreadsInGroup = totalThreadsInGroup;
     pState->totalSpillFillSize = totalSpillFillSize;
+    pState->scratchSpaceSize = scratchSpaceSizePerInstance;
+    pState->scratchSpaceNumInstances = numInstances;
 }
 
 void SwrSetTsState(
@@ -678,7 +726,7 @@ void SwrSetBlendFunc(
 // update guardband multipliers for the viewport
 void updateGuardbands(API_STATE *pState)
 {
-    uint32_t numGbs = pState->gsState.emitsRenderTargetArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
 
     for(uint32_t i = 0; i < numGbs; ++i)
     {
@@ -713,43 +761,8 @@ void SwrSetViewports(
     API_STATE* pState = GetDrawState(pContext);
 
     memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
-
-    if (pMatrices != nullptr)
-    {
-        // @todo Faster to copy portions of the SOA or just copy all of it?
-        memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
-    }
-    else
-    {
-        // Compute default viewport transform.
-        for (uint32_t i = 0; i < numViewports; ++i)
-        {
-            if (pContext->driverType == DX)
-            {
-                pState->vpMatrices.m00[i] = pState->vp[i].width / 2.0f;
-                pState->vpMatrices.m11[i] = -pState->vp[i].height / 2.0f;
-                pState->vpMatrices.m22[i] = pState->vp[i].maxZ - pState->vp[i].minZ;
-                pState->vpMatrices.m30[i] = pState->vp[i].x + pState->vpMatrices.m00[i];
-                pState->vpMatrices.m31[i] = pState->vp[i].y - pState->vpMatrices.m11[i];
-                pState->vpMatrices.m32[i] = pState->vp[i].minZ;
-            }
-            else
-            {
-                // Standard, with the exception that Y is inverted.
-                pState->vpMatrices.m00[i] = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
-                pState->vpMatrices.m11[i] = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
-                pState->vpMatrices.m22[i] = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
-                pState->vpMatrices.m30[i] = pState->vp[i].x + pState->vpMatrices.m00[i];
-                pState->vpMatrices.m31[i] = pState->vp[i].height + pState->vpMatrices.m11[i];
-                pState->vpMatrices.m32[i] = pState->vp[i].minZ + pState->vpMatrices.m22[i];
-
-                // Now that the matrix is calculated, clip the view coords to screen size.
-                // OpenGL allows for -ve x,y in the viewport.
-                pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
-                pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
-            }
-        }
-    }
+    // @todo Faster to copy portions of the SOA or just copy all of it?
+    memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
 
     updateGuardbands(pState);
 }
@@ -769,7 +782,7 @@ void SwrSetScissorRects(
 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
 {
     API_STATE *pState = &pDC->pState->state;
-    uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
     pState->scissorsTileAligned = true;
 
     for (uint32_t index = 0; index < numScissors; ++index)
@@ -799,7 +812,7 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
         tileAligned  = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
         tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
         tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
-        tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_Y_DIM) == 0;
+        tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;
 
         pState->scissorsTileAligned &= tileAligned;
 
@@ -815,18 +828,15 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
     }
 }
 
+
 // templated backend function tables
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
+
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
     DRAW_STATE* pState = pDC->pState;
     const SWR_RASTSTATE &rastState = pState->state.rastState;
     const SWR_PS_STATE &psState = pState->state.psState;
     BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
-    const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
 
     // setup backend
     if (psState.pfnPixelShader == nullptr)
@@ -835,10 +845,10 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     }
     else
     {
-        const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0;
+        const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
+        const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
         const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
-        const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
-
+        const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
         
         // select backend function
@@ -849,7 +859,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             {
                 // always need to generate I & J per sample for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
+                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern][psState.inputCoverage]
+                                                                [centroid][forcedSampleCount][canEarlyZ]
+                    ;
             }
             else
             {
@@ -859,7 +871,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             }
             break;
         case SWR_SHADING_RATE_SAMPLE:
-            SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
+            SWR_ASSERT(rastState.bIsCenterPattern != true);
             // always need to generate I & J per sample for Z interpolation
             barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
             backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
@@ -869,13 +881,22 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             break;
         }
     }
-    
+
+    SWR_ASSERT(backendFuncs.pfnBackend);
+
     PFN_PROCESS_PRIMS pfnBinner;
+#if USE_SIMD16_FRONTEND
+    PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
+#endif
     switch (pState->state.topology)
     {
     case TOP_POINT_LIST:
         pState->pfnProcessPrims = ClipPoints;
         pfnBinner = BinPoints;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
+        pfnBinner_simd16 = BinPoints_simd16;
+#endif
         break;
     case TOP_LINE_LIST:
     case TOP_LINE_STRIP:
@@ -884,19 +905,32 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     case TOP_LISTSTRIP_ADJ:
         pState->pfnProcessPrims = ClipLines;
         pfnBinner = BinLines;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipLines_simd16;
+        pfnBinner_simd16 = BinLines_simd16;
+#endif
         break;
     default:
         pState->pfnProcessPrims = ClipTriangles;
         pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
+        pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
+#endif
         break;
     };
 
-    // disable clipper if viewport transform is disabled
+
+    // Disable clipper if viewport transform is disabled
     if (pState->state.frontendState.vpTransformDisable)
     {
         pState->pfnProcessPrims = pfnBinner;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
+#endif
     }
 
+    // Disable rasterizer and backend if no pixel, no depth/stencil, and no attributes
     if ((pState->state.psState.pfnPixelShader == nullptr) &&
         (pState->state.depthStencilState.depthTestEnable == FALSE) &&
         (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
@@ -905,13 +939,20 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
         (pState->state.backendState.numAttributes == 0))
     {
         pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = nullptr;
+#endif
     }
 
     if (pState->state.soState.rasterizerDisable == true)
     {
         pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = nullptr;
+#endif
     }
 
+
     // set up the frontend attribute count
     pState->state.feNumAttributes = 0;
     const SWR_BACKEND_STATE& backendState = pState->state.backendState;
@@ -947,9 +988,11 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
     pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
                                            !pState->state.depthStencilState.depthWriteEnable &&
+                                           !pState->state.depthBoundsState.depthBoundsTestEnable &&
                                            pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && 
                                         (pState->state.depthStencilState.depthTestEnable || 
-                                         pState->state.depthStencilState.depthWriteEnable)) ? true : false;
+                                         pState->state.depthStencilState.depthWriteEnable ||
+                                         pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false;
 
     pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
                                              !pState->state.depthStencilState.stencilWriteEnable &&
@@ -961,20 +1004,31 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
                                           (pState->state.depthStencilState.stencilTestEnable  ||
                                            pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
 
-    uint32_t numRTs = pState->state.psState.numRenderTargets;
-    pState->state.colorHottileEnable = 0;
+
+    uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
+
+    // Disable hottile for surfaces with no writes
     if (psState.pfnPixelShader != nullptr)
     {
-        for (uint32_t rt = 0; rt < numRTs; ++rt)
+        DWORD rt;
+        uint32_t rtMask = pState->state.psState.renderTargetMask;
+        while (_BitScanForward(&rt, rtMask))
         {
-            pState->state.colorHottileEnable |=  
-                (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
+            rtMask &= ~(1 << rt);
+
+            if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
+                pState->state.blendState.renderTarget[rt].writeDisableRed &&
+                pState->state.blendState.renderTarget[rt].writeDisableGreen &&
+                pState->state.blendState.renderTarget[rt].writeDisableBlue)
+            {
+                hotTileEnable &= ~(1 << rt);
+            }
         }
     }
 
+    pState->state.colorHottileEnable = hotTileEnable;
+
+
     // Setup depth quantization function
     if (pState->state.depthHottileEnable)
     {
@@ -984,7 +1038,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
         case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
         case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
         case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
-        default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
+        default: SWR_INVALID("Unsupported depth format for depth quantiztion.");
             pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
         }
     }
@@ -1008,6 +1062,8 @@ void InitDraw(
         SetupMacroTileScissors(pDC);
         SetupPipeline(pDC);
     }
+    
+
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1112,8 +1168,8 @@ void DrawInstanced(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APIDraw, pDC->drawId);
-    AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertices, startVertex, numInstances, startInstance));
+    RDTSC_BEGIN(APIDraw, pDC->drawId);
+    AR_API_EVENT(DrawInstancedEvent(pDC->drawId, ArchRast::Instanced, topology, numVertices, startVertex, numInstances, startInstance));
 
     uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
@@ -1165,6 +1221,8 @@ void DrawInstanced(
         //enqueue DC
         QueueDraw(pContext);
 
+        AR_API_EVENT(DrawInstancedSplitEvent(pDC->drawId, ArchRast::InstancedSplit));
+
         remainingVerts -= numVertsForDraw;
         draw++;
     }
@@ -1173,7 +1231,7 @@ void DrawInstanced(
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
 
-    AR_API_END(APIDraw, numVertices * numInstances);
+    RDTSC_END(APIDraw, numVertices * numInstances);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1238,8 +1296,8 @@ void DrawIndexedInstance(
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
     API_STATE* pState = &pDC->pState->state;
 
-    AR_API_BEGIN(APIDrawIndexed, pDC->drawId);
-    AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance));
+    RDTSC_BEGIN(APIDrawIndexed, pDC->drawId);
+    AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, ArchRast::IndexedInstancedSplit, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance));
 
     uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
@@ -1252,7 +1310,7 @@ void DrawIndexedInstance(
     case R16_UINT: indexSize = sizeof(uint16_t); break;
     case R8_UINT: indexSize = sizeof(uint8_t); break;
     default:
-        SWR_ASSERT(0);
+        SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format);
     }
 
     int draw = 0;
@@ -1281,6 +1339,7 @@ void DrawIndexedInstance(
 
         // When breaking up draw, we need to obtain new draw context for each iteration.
         bool isSplitDraw = (draw > 0) ? true : false;
+
         pDC = GetDrawContext(pContext, isSplitDraw);
         InitDraw(pDC, isSplitDraw);
 
@@ -1307,16 +1366,18 @@ void DrawIndexedInstance(
         //enqueue DC
         QueueDraw(pContext);
 
+        AR_API_EVENT(DrawIndexedInstancedSplitEvent(pDC->drawId, ArchRast::IndexedInstancedSplit));
+
         pIB += maxIndicesPerDraw * indexSize;
         remainingIndices -= numIndicesForDraw;
         draw++;
     }
 
-    // restore culling state
+    // Restore culling state
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
-
-    AR_API_END(APIDrawIndexed, numIndices * numInstances);
+    RDTSC_END(APIDrawIndexed, numIndices * numInstances);
 }
 
 
@@ -1389,6 +1450,8 @@ void SWR_API SwrInvalidateTiles(
 
     //enqueue
     QueueDraw(pContext);
+
+    AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId));
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1422,6 +1485,8 @@ void SWR_API SwrDiscardRect(
 
     //enqueue
     QueueDraw(pContext);
+
+    AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId));
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1444,7 +1509,7 @@ void SwrDispatch(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APIDispatch, pDC->drawId);
+    RDTSC_BEGIN(APIDispatch, pDC->drawId);
     AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
     pDC->isCompute = true;      // This is a compute context.
 
@@ -1455,12 +1520,12 @@ void SwrDispatch(
     pTaskData->threadGroupCountZ = threadGroupCountZ;
 
     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
-    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+    uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
     pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
     pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
 
     QueueDispatch(pContext);
-    AR_API_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ);
+    RDTSC_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ);
 }
 
 // Deswizzles, converts and stores current contents of the hot tiles to surface
@@ -1479,7 +1544,7 @@ void SWR_API SwrStoreTiles(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APIStoreTiles, pDC->drawId);
+    RDTSC_BEGIN(APIStoreTiles, pDC->drawId);
 
     pDC->FeWork.type = STORETILES;
     pDC->FeWork.pfnWork = ProcessStoreTiles;
@@ -1491,20 +1556,24 @@ void SWR_API SwrStoreTiles(
     //enqueue
     QueueDraw(pContext);
 
-    AR_API_END(APIStoreTiles, 1);
+    AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
+
+    RDTSC_END(APIStoreTiles, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
 /// @param hContext - Handle passed back from SwrCreateContext
-/// @param clearMask - combination of SWR_CLEAR_COLOR / SWR_CLEAR_DEPTH / SWR_CLEAR_STENCIL flags (or SWR_CLEAR_NONE)
+/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
+/// @param renderTargetArrayIndex - the RT array index to clear
 /// @param clearColor - color use for clearing render targets
 /// @param z - depth value use for clearing depth buffer
 /// @param stencil - stencil value used for clearing stencil buffer
 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
 void SWR_API SwrClearRenderTarget(
     HANDLE hContext,
-    uint32_t clearMask,
+    uint32_t attachmentMask,
+    uint32_t renderTargetArrayIndex,
     const float clearColor[4],
     float z,
     uint8_t stencil,
@@ -1518,17 +1587,14 @@ void SWR_API SwrClearRenderTarget(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APIClearRenderTarget, pDC->drawId);
-
-    CLEAR_FLAGS flags;
-    flags.bits = 0;
-    flags.mask = clearMask;
+    RDTSC_BEGIN(APIClearRenderTarget, pDC->drawId);
 
     pDC->FeWork.type = CLEAR;
     pDC->FeWork.pfnWork = ProcessClear;
     pDC->FeWork.desc.clear.rect = clearRect;
     pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
-    pDC->FeWork.desc.clear.flags = flags;
+    pDC->FeWork.desc.clear.attachmentMask = attachmentMask;
+    pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex;
     pDC->FeWork.desc.clear.clearDepth = z;
     pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
     pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
@@ -1539,7 +1605,7 @@ void SWR_API SwrClearRenderTarget(
     // enqueue draw
     QueueDraw(pContext);
 
-    AR_API_END(APIClearRenderTarget, 1);
+    RDTSC_END(APIClearRenderTarget, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1586,14 +1652,28 @@ VOID* SwrAllocDrawContextMemory(
 /// @brief Enables stats counting
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param enable - If true then counts are incremented.
-void SwrEnableStats(
+void SwrEnableStatsFE(
+    HANDLE hContext,
+    bool enable)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    pDC->pState->state.enableStatsFE = enable;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Enables stats counting
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param enable - If true then counts are incremented.
+void SwrEnableStatsBE(
     HANDLE hContext,
     bool enable)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    pDC->pState->state.enableStats = enable;
+    pDC->pState->state.enableStatsBE = enable;
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1604,9 +1684,84 @@ void SWR_API SwrEndFrame(
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    (void)pDC; // var used
 
     RDTSC_ENDFRAME();
     AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId));
 
     pContext->frameCount++;
 }
+
+void InitSimLoadTilesTable();
+void InitSimStoreTilesTable();
+void InitSimClearTilesTable();
+
+void InitClearTilesTable();
+void InitBackendFuncTables();
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Initialize swr backend and memory internal tables
+void SwrInit()
+{
+    InitSimLoadTilesTable();
+    InitSimStoreTilesTable();
+    InitSimClearTilesTable();
+
+    InitClearTilesTable();
+    InitBackendFuncTables();
+    InitRasterizerFunctions();
+}
+
+void SwrGetInterface(SWR_INTERFACE &out_funcs)
+{
+    out_funcs.pfnSwrCreateContext = SwrCreateContext;
+    out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
+    out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
+    out_funcs.pfnSwrSaveState = SwrSaveState;
+    out_funcs.pfnSwrRestoreState = SwrRestoreState;
+    out_funcs.pfnSwrSync = SwrSync;
+    out_funcs.pfnSwrStallBE = SwrStallBE;
+    out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
+    out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
+    out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
+    out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer;
+    out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc;
+    out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc;
+    out_funcs.pfnSwrSetSoState = SwrSetSoState;
+    out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers;
+    out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc;
+    out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState;
+    out_funcs.pfnSwrSetGsState = SwrSetGsState;
+    out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc;
+    out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc;
+    out_funcs.pfnSwrSetTsState = SwrSetTsState;
+    out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc;
+    out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc;
+    out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState;
+    out_funcs.pfnSwrSetBackendState = SwrSetBackendState;
+    out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState;
+    out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState;
+    out_funcs.pfnSwrSetBlendState = SwrSetBlendState;
+    out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc;
+    out_funcs.pfnSwrDraw = SwrDraw;
+    out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced;
+    out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed;
+    out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced;
+    out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles;
+    out_funcs.pfnSwrDiscardRect = SwrDiscardRect;
+    out_funcs.pfnSwrDispatch = SwrDispatch;
+    out_funcs.pfnSwrStoreTiles = SwrStoreTiles;
+    out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget;
+    out_funcs.pfnSwrSetRastState = SwrSetRastState;
+    out_funcs.pfnSwrSetViewports = SwrSetViewports;
+    out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects;
+    out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
+    out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
+    out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE;
+    out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE;
+    out_funcs.pfnSwrEndFrame = SwrEndFrame;
+    out_funcs.pfnSwrInit = SwrInit;
+    out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile;
+    out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface;
+    out_funcs.pfnSwrStoreHotTileClear = SwrStoreHotTileClear;
+}