swr/rast: Consolidate archrast Draw events
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
index d6aa80d678f615acbe157247920300ecbd8aa7af..cb98cbe7eeafbad5cf3d2c9a770edeb4443c0eab 100644 (file)
 #include "core/clip.h"
 #include "core/utils.h"
 
-#include "common/simdintrin.h"
 #include "common/os.h"
 
+static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y };
+
 void SetupDefaultState(SWR_CONTEXT *pContext);
 
 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
@@ -53,6 +54,11 @@ static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
     return (SWR_CONTEXT*)hContext;
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Wake every thread currently blocked on the context's
+///        FifosNotEmpty condition variable.
+/// @param pContext - pointer to the SWR context owning the condition variable.
+void WakeAllThreads(SWR_CONTEXT *pContext)
+{
+    pContext->FifosNotEmpty.notify_all();
+}
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
@@ -66,27 +72,21 @@ HANDLE SwrCreateContext(
     memset(pContextMem, 0, sizeof(SWR_CONTEXT));
     SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
 
-    pContext->driverType = pCreateInfo->driver;
     pContext->privateStateSize = pCreateInfo->privateStateSize;
 
-    pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
-    pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
-
-    pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-    pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-
-    pContext->threadInfo.MAX_WORKER_THREADS        = KNOB_MAX_WORKER_THREADS;
-    pContext->threadInfo.MAX_NUMA_NODES            = KNOB_MAX_NUMA_NODES;
-    pContext->threadInfo.MAX_CORES_PER_NUMA_NODE   = KNOB_MAX_CORES_PER_NUMA_NODE;
-    pContext->threadInfo.MAX_THREADS_PER_CORE      = KNOB_MAX_THREADS_PER_CORE;
-    pContext->threadInfo.SINGLE_THREADED           = KNOB_SINGLE_THREADED;
-
-    if (pCreateInfo->pThreadInfo)
+    pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
+    if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
     {
-        pContext->threadInfo = *pCreateInfo->pThreadInfo;
+        pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
     }
 
-    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+    pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
+    pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
+
+    pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
+
+    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
         new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
@@ -95,24 +95,54 @@ HANDLE SwrCreateContext(
         pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
 
-    if (!pContext->threadInfo.SINGLE_THREADED)
+    if (pCreateInfo->pThreadInfo)
     {
-        memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
-        memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
-        new (&pContext->WaitLock) std::mutex();
-        new (&pContext->FifosNotEmpty) std::condition_variable();
+        pContext->threadInfo = *pCreateInfo->pThreadInfo;
+    }
+    else
+    {
+        pContext->threadInfo.MAX_WORKER_THREADS         = KNOB_MAX_WORKER_THREADS;
+        pContext->threadInfo.BASE_NUMA_NODE             = KNOB_BASE_NUMA_NODE;
+        pContext->threadInfo.BASE_CORE                  = KNOB_BASE_CORE;
+        pContext->threadInfo.BASE_THREAD                = KNOB_BASE_THREAD;
+        pContext->threadInfo.MAX_NUMA_NODES             = KNOB_MAX_NUMA_NODES;
+        pContext->threadInfo.MAX_CORES_PER_NUMA_NODE    = KNOB_MAX_CORES_PER_NUMA_NODE;
+        pContext->threadInfo.MAX_THREADS_PER_CORE       = KNOB_MAX_THREADS_PER_CORE;
+        pContext->threadInfo.SINGLE_THREADED            = KNOB_SINGLE_THREADED;
+    }
 
-        CreateThreadPool(pContext, &pContext->threadPool);
+    if (pCreateInfo->pApiThreadInfo)
+    {
+        pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
+    }
+    else
+    {
+        pContext->apiThreadInfo.bindAPIThread0          = true;
+        pContext->apiThreadInfo.numAPIReservedThreads   = 1;
+        pContext->apiThreadInfo.numAPIThreadsPerCore    = 1;
     }
 
-    // Calling createThreadPool() above can set SINGLE_THREADED
-    if (pContext->threadInfo.SINGLE_THREADED)
+    memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
+    memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
+    new (&pContext->WaitLock) std::mutex();
+    new (&pContext->FifosNotEmpty) std::condition_variable();
+
+    CreateThreadPool(pContext, &pContext->threadPool);
+
+    if (pContext->apiThreadInfo.bindAPIThread0)
     {
-        pContext->NumWorkerThreads = 1;
-        pContext->NumFEThreads = 1;
-        pContext->NumBEThreads = 1;
+        BindApiThread(pContext, 0);
     }
 
+    pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
+    pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
+
+#if defined(KNOB_ENABLE_AR)
+    // Setup ArchRast thread contexts which includes +1 for API thread.
+    pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1];
+    pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
+#endif
+
     // Allocate scratch space for workers.
     ///@note We could lazily allocate this, but it's a rather small amount of memory.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
@@ -120,24 +150,31 @@ HANDLE SwrCreateContext(
 #if defined(_WIN32)
         uint32_t numaNode = pContext->threadPool.pThreadData ?
             pContext->threadPool.pThreadData[i].numaId : 0;
-        pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
+        pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(
             GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
             MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
             numaNode);
 #else
-        pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+        pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+#endif
+
+#if defined(KNOB_ENABLE_AR)
+        // Initialize worker thread context for ArchRast.
+        pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
 #endif
     }
 
+#if defined(KNOB_ENABLE_AR)
+    // cache the API thread event manager, for use with sim layer
+    pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads];
+#endif
+
     // State setup AFTER context is fully initialized
     SetupDefaultState(pContext);
 
     // initialize hot tile manager
     pContext->pHotTileMgr = new HotTileMgr();
 
-    // initialize function pointer tables
-    InitClearTilesTable();
-
     // initialize callback functions
     pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
     pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
@@ -145,6 +182,7 @@ HANDLE SwrCreateContext(
     pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
     pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
     pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
+    
 
     // pass pointer to bucket manager back to caller
 #ifdef KNOB_ENABLE_RDTSC
@@ -153,40 +191,9 @@ HANDLE SwrCreateContext(
 
     pCreateInfo->contextSaveSize = sizeof(API_STATE);
 
-    return (HANDLE)pContext;
-}
-
-void SwrDestroyContext(HANDLE hContext)
-{
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DestroyThreadPool(pContext, &pContext->threadPool);
-
-    // free the fifos
-    for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
-    {
-        delete pContext->dcRing[i].pArena;
-        delete pContext->dsRing[i].pArena;
-        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
-        pContext->pDispatchQueueArray[i].~DispatchQueue();
-    }
-
-    AlignedFree(pContext->pDispatchQueueArray);
-    AlignedFree(pContext->pMacroTileManagerArray);
-
-    // Free scratch space.
-    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
-    {
-#if defined(_WIN32)
-        VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
-#else
-        AlignedFree(pContext->pScratch[i]);
-#endif
-    }
-
-    delete(pContext->pHotTileMgr);
+    StartThreadPool(pContext, &pContext->threadPool);
 
-    pContext->~SWR_CONTEXT();
-    AlignedFree(GetContext(hContext));
+    return (HANDLE)pContext;
 }
 
 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
@@ -194,18 +201,11 @@ void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
     memcpy(&dst.state, &src.state, sizeof(API_STATE));
 }
 
-void WakeAllThreads(SWR_CONTEXT *pContext)
-{
-    pContext->FifosNotEmpty.notify_all();
-}
-
-static TileSet gSingleThreadLockedTiles;
-
 template<bool IsDraw>
 void QueueWork(SWR_CONTEXT *pContext)
 {
     DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
-    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+    uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
 
     if (IsDraw)
     {
@@ -221,7 +221,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 
     if (IsDraw)
     {
-        InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE);
+        InterlockedIncrement(&pContext->drawsOutstandingFE);
     }
 
     _ReadWriteBarrier();
@@ -240,7 +240,7 @@ void QueueWork(SWR_CONTEXT *pContext)
         {
             uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
             WorkOnFifoFE(pContext, 0, curDraw[0]);
-            WorkOnFifoBE(pContext, 0, curDraw[1], gSingleThreadLockedTiles, 0, 0);
+            WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
         }
         else
         {
@@ -256,9 +256,9 @@ void QueueWork(SWR_CONTEXT *pContext)
     }
     else
     {
-        RDTSC_START(APIDrawWakeAllThreads);
+        RDTSC_BEGIN(APIDrawWakeAllThreads, pDC->drawId);
         WakeAllThreads(pContext);
-        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
+        RDTSC_END(APIDrawWakeAllThreads, 1);
     }
 
     // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
@@ -278,7 +278,7 @@ INLINE void QueueDispatch(SWR_CONTEXT* pContext)
 
 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
 {
-    RDTSC_START(APIGetDrawContext);
+    RDTSC_BEGIN(APIGetDrawContext, 0);
     // If current draw context is null then need to obtain a new draw context to use from ring.
     if (pContext->pCurDrawContext == nullptr)
     {
@@ -289,25 +289,23 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         }
 
         uint64_t curDraw = pContext->dcRing.GetHead();
-        uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+        uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
 
-        static uint64_t lastDrawChecked;
-        static uint32_t lastFrameChecked;
-        if ((pContext->frameCount - lastFrameChecked) > 2 ||
-            (curDraw - lastDrawChecked) > 0x10000)
+        if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
+            (curDraw - pContext->lastDrawChecked) > 0x10000)
         {
             // Take this opportunity to clean-up old arena allocations
             pContext->cachingArenaAllocator.FreeOldBlocks();
 
-            lastFrameChecked = pContext->frameCount;
-            lastDrawChecked = curDraw;
+            pContext->lastFrameChecked = pContext->frameCount;
+            pContext->lastDrawChecked = curDraw;
         }
 
         DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
         pContext->pCurDrawContext = pCurDrawContext;
 
         // Assign next available entry in DS ring to this DC.
-        uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
+        uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
 
         // Copy previous state to current state.
@@ -345,7 +343,10 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
 
         SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
 
+        // Reset dependency
         pCurDrawContext->dependent = false;
+        pCurDrawContext->dependentFE = false;
+
         pCurDrawContext->pContext = pContext;
         pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
 
@@ -354,7 +355,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         pCurDrawContext->threadsDone = 0;
         pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
 
-        memset(&pCurDrawContext->dynState, 0, sizeof(pCurDrawContext->dynState));
+        pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads);
 
         // Assign unique drawId for this DC
         pCurDrawContext->drawId = pContext->dcRing.GetHead();
@@ -366,7 +367,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
     }
 
-    RDTSC_STOP(APIGetDrawContext, 0, 0);
+    RDTSC_END(APIGetDrawContext, 0);
     return pContext->pCurDrawContext;
 }
 
@@ -378,6 +379,61 @@ API_STATE* GetDrawState(SWR_CONTEXT *pContext)
     return &pDC->pState->state;
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Destroy SWR Context.
+/// @param hContext - Handle passed back from SwrCreateContext
+///
+/// Queues a SHUTDOWN work item, destroys the thread pool, then releases
+/// the per-draw ring resources, per-worker scratch space, the stats buffer,
+/// the ArchRast thread contexts (when KNOB_ENABLE_AR is defined), the hot
+/// tile manager and finally the context memory itself.
+void SwrDestroyContext(HANDLE hContext)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    // Submit one last work item of type SHUTDOWN through the regular draw
+    // queue before the thread pool is destroyed.
+    pDC->FeWork.type = SHUTDOWN;
+    pDC->FeWork.pfnWork = ProcessShutdown;
+
+    //enqueue
+    QueueDraw(pContext);
+
+    DestroyThreadPool(pContext, &pContext->threadPool);
+
+    // free the fifos
+    for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
+    {
+        AlignedFree(pContext->dcRing[i].dynState.pStats);
+        delete pContext->dcRing[i].pArena;
+        delete pContext->dsRing[i].pArena;
+        // These arrays were built with placement new on AlignedMalloc'd
+        // storage, so the destructors have to be invoked explicitly.
+        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+        pContext->pDispatchQueueArray[i].~DispatchQueue();
+    }
+
+    AlignedFree(pContext->pDispatchQueueArray);
+    AlignedFree(pContext->pMacroTileManagerArray);
+
+    // Free scratch space.
+    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
+    {
+#if defined(_WIN32)
+        VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
+#else
+        AlignedFree(pContext->ppScratch[i]);
+#endif
+
+#if defined(KNOB_ENABLE_AR)
+        ArchRast::DestroyThreadContext(pContext->pArContext[i]);
+#endif
+    }
+
+#if defined(KNOB_ENABLE_AR)
+    // SwrCreateContext allocates NumWorkerThreads + 1 ArchRast contexts; the
+    // extra slot at index NumWorkerThreads belongs to the API thread and is
+    // not covered by the worker loop above. Release it and the handle array
+    // so neither is leaked.
+    ArchRast::DestroyThreadContext(pContext->pArContext[pContext->NumWorkerThreads]);
+    delete[] pContext->pArContext;
+#endif
+
+    delete[] pContext->ppScratch;
+    AlignedFree(pContext->pStats);
+
+    delete(pContext->pHotTileMgr);
+
+    // The context was constructed with placement new on aligned storage:
+    // run the destructor by hand, then free the raw memory.
+    pContext->~SWR_CONTEXT();
+    AlignedFree(GetContext(hContext));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Public entry point that forwards an API thread binding request
+///        to BindApiThread for the given context.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param apiThreadId - passed through unchanged to BindApiThread.
+void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    BindApiThread(pContext, apiThreadId);
+}
+
 void SWR_API SwrSaveState(
     HANDLE hContext,
     void* pOutputStateBlock,
@@ -408,17 +464,21 @@ void SetupDefaultState(SWR_CONTEXT *pContext)
 
     pState->rastState.cullMode = SWR_CULLMODE_NONE;
     pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
+
+    pState->depthBoundsState.depthBoundsTestEnable = false;
+    pState->depthBoundsState.depthBoundsTestMinValue = 0.0f;
+    pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
 }
 
 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
 {
-    RDTSC_START(APISync);
-
     SWR_ASSERT(pfnFunc != nullptr);
 
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    RDTSC_BEGIN(APISync, 0);
+
     pDC->FeWork.type = SYNC;
     pDC->FeWork.pfnWork = ProcessSync;
 
@@ -428,38 +488,48 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint
     pDC->retireCallback.userData2 = userData2;
     pDC->retireCallback.userData3 = userData3;
 
+    AR_API_EVENT(SwrSyncEvent(pDC->drawId));
+
     //enqueue
     QueueDraw(pContext);
 
-    RDTSC_STOP(APISync, 1, 0);
+    RDTSC_END(APISync, 1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Flag the current draw context as dependent.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @note NOTE(review): presumably the flag makes backend work for this draw
+///       wait on earlier draws - confirm against the threading code.
+void SwrStallBE(HANDLE hContext)
+{
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    // Obtains (or creates) the current draw context.
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    pDC->dependent = true;
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Block the calling thread until the draw context ring is empty.
+/// @param hContext - Handle passed back from SwrCreateContext
 void SwrWaitForIdle(HANDLE hContext)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
 
-    RDTSC_START(APIWaitForIdle);
+    RDTSC_BEGIN(APIWaitForIdle, 0);
 
+    // Busy-wait (no sleep or condition variable), issuing a pause hint on
+    // each iteration until dcRing reports empty.
     while (!pContext->dcRing.IsEmpty())
     {
         _mm_pause();
     }
 
-    RDTSC_STOP(APIWaitForIdle, 1, 0);
+    RDTSC_END(APIWaitForIdle, 1);
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Block the calling thread until drawsOutstandingFE reaches zero.
+/// @param hContext - Handle passed back from SwrCreateContext
 void SwrWaitForIdleFE(HANDLE hContext)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
 
+    // NOTE(review): this reuses the APIWaitForIdle bucket rather than an
+    // FE-specific one - confirm that is intended.
-    RDTSC_START(APIWaitForIdle);
+    RDTSC_BEGIN(APIWaitForIdle, 0);
 
+    // Busy-wait with a pause hint while any draws are still counted in
+    // drawsOutstandingFE.
     while (pContext->drawsOutstandingFE > 0)
     {
         _mm_pause();
     }
 
-    RDTSC_STOP(APIWaitForIdle, 1, 0);
+    RDTSC_END(APIWaitForIdle, 1);
 }
 
 void SwrSetVertexBuffers(
@@ -564,12 +634,16 @@ void SwrSetCsFunc(
     HANDLE hContext,
     PFN_CS_FUNC pfnCsFunc,
     uint32_t totalThreadsInGroup,
-    uint32_t totalSpillFillSize)
+    uint32_t totalSpillFillSize,
+    uint32_t scratchSpaceSizePerInstance,
+    uint32_t numInstances)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
     pState->pfnCsFunc = pfnCsFunc;
     pState->totalThreadsInGroup = totalThreadsInGroup;
     pState->totalSpillFillSize = totalSpillFillSize;
+    pState->scratchSpaceSize = scratchSpaceSizePerInstance;
+    pState->scratchSpaceNumInstances = numInstances;
 }
 
 void SwrSetTsState(
@@ -614,6 +688,15 @@ void SwrSetBackendState(
     pState->backendState = *pBEState;
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Copy the caller's depth bounds state into the current draw state.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pDBState - depth bounds state to copy; dereferenced
+///                   unconditionally, so it must not be null.
+void SwrSetDepthBoundsState(
+    HANDLE hContext,
+    SWR_DEPTH_BOUNDS_STATE *pDBState)
+{
+    API_STATE* pState = GetDrawState(GetContext(hContext));
+
+    pState->depthBoundsState = *pDBState;
+}
+
 void SwrSetPixelShaderState(
     HANDLE hContext,
     SWR_PS_STATE *pPSState)
@@ -641,13 +724,18 @@ void SwrSetBlendFunc(
 }
 
 // update guardband multipliers for the viewport
-void updateGuardband(API_STATE *pState)
+/// @brief Recompute the guardband multipliers from the viewport dimensions.
+/// @param pState - API state whose gbState arrays are written from vp[].
+void updateGuardbands(API_STATE *pState)
 {
-    // guardband center is viewport center
-    pState->gbState.left    = KNOB_GUARDBAND_WIDTH  / pState->vp[0].width;
-    pState->gbState.right   = KNOB_GUARDBAND_WIDTH  / pState->vp[0].width;
-    pState->gbState.top     = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
-    pState->gbState.bottom  = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
+    // Only entry 0 is updated unless the backend state says the viewport
+    // array index is read, in which case every viewport slot is updated.
+    uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+
+    for(uint32_t i = 0; i < numGbs; ++i)
+    {
+        // guardband center is viewport center
+        // left/right get the same width-based value, top/bottom the same
+        // height-based value.
+        pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
+        pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
+        pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
+        pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
+    }
 }
 
 void SwrSetRastState(
@@ -673,115 +761,82 @@ void SwrSetViewports(
     API_STATE* pState = GetDrawState(pContext);
 
     memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
+    // @todo Faster to copy portions of the SOA or just copy all of it?
+    memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
 
-    if (pMatrices != nullptr)
-    {
-        //memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
-        // @todo Faster to copy portions of the SOA or just copy all of it?
-        memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
-    }
-    else
-    {
-        // Compute default viewport transform.
-        for (uint32_t i = 0; i < numViewports; ++i)
-        {
-            if (pContext->driverType == DX)
-            {
-                pState->vpMatrices.m00[i] = pState->vp[i].width / 2.0f;
-                pState->vpMatrices.m11[i] = -pState->vp[i].height / 2.0f;
-                pState->vpMatrices.m22[i] = pState->vp[i].maxZ - pState->vp[i].minZ;
-                pState->vpMatrices.m30[i] = pState->vp[i].x + pState->vpMatrices.m00[i];
-                pState->vpMatrices.m31[i] = pState->vp[i].y - pState->vpMatrices.m11[i];
-                pState->vpMatrices.m32[i] = pState->vp[i].minZ;
-            }
-            else
-            {
-                // Standard, with the exception that Y is inverted.
-                pState->vpMatrices.m00[i] = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
-                pState->vpMatrices.m11[i] = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
-                pState->vpMatrices.m22[i] = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
-                pState->vpMatrices.m30[i] = pState->vp[i].x + pState->vpMatrices.m00[i];
-                pState->vpMatrices.m31[i] = pState->vp[i].height + pState->vpMatrices.m11[i];
-                pState->vpMatrices.m32[i] = pState->vp[i].minZ + pState->vpMatrices.m22[i];
-
-                // Now that the matrix is calculated, clip the view coords to screen size.
-                // OpenGL allows for -ve x,y in the viewport.
-                pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
-                pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
-            }
-        }
-    }
-
-    updateGuardband(pState);
+    updateGuardbands(pState);
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Copy caller-supplied scissor rectangles into the current draw state.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param numScissors - number of rects to copy; asserted to be no more
+///                      than KNOB_NUM_VIEWPORTS_SCISSORS.
+/// @param pScissors - source array holding at least numScissors rects.
 void SwrSetScissorRects(
     HANDLE hContext,
     uint32_t numScissors,
-    const BBOX* pScissors)
+    const SWR_RECT* pScissors)
 {
     SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
         "Invalid number of scissor rects.");
 
     API_STATE* pState = GetDrawState(GetContext(hContext));
-    memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
+    // The element size is taken from the pointer's own type, so the copy
+    // length follows the parameter type.
+    memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0]));
 };
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Derive the fixed-point scissor rectangles for a draw.
+/// @param pDC - draw context whose API state is read and updated.
+///
+/// For each active scissor slot, takes either the scissor rect (when
+/// scissoring is enabled) or the viewport extents, clamps the result against
+/// g_MaxScissorRect, scales it by FIXED_POINT_SCALE and makes the max edges
+/// inclusive. scissorsTileAligned ends up true only if every processed rect
+/// is aligned to KNOB_TILE_X_DIM / KNOB_TILE_Y_DIM on all four edges.
 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
 {
     API_STATE *pState = &pDC->pState->state;
-    uint32_t left, right, top, bottom;
+    // Only slot 0 is processed unless the backend state says the viewport
+    // array index is read.
+    uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    // Start optimistic; cleared below by the first non-aligned rect.
+    pState->scissorsTileAligned = true;
 
-    // Set up scissor dimensions based on scissor or viewport
-    if (pState->rastState.scissorEnable)
+    for (uint32_t index = 0; index < numScissors; ++index)
     {
-        // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
-        left = pState->scissorRects[0].left;
-        right = pState->scissorRects[0].right;
-        top = pState->scissorRects[0].top;
-        bottom = pState->scissorRects[0].bottom;
-    }
-    else
-    {
-        // the vp width and height must be added to origin un-rounded then the result round to -inf.
-        // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
-        left = (int32_t)pState->vp[0].x;
-        right = (int32_t)(pState->vp[0].x + pState->vp[0].width);
-        top = (int32_t)pState->vp[0].y;
-        bottom = (int32_t)(pState->vp[0].y + pState->vp[0].height);
-    }
+        SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index];
 
-    right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
-    bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
+        // Set up scissor dimensions based on scissor or viewport
+        if (pState->rastState.scissorEnable)
+        {
+            scissorInFixedPoint = pState->scissorRects[index];
+        }
+        else
+        {
+            // the vp width and height must be added to origin un-rounded then the result round to -inf.
+            // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
+            scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
+            scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
+            scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
+            scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
+        }
 
-    if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
-    {
-        pState->scissorInFixedPoint.left = 0;
-        pState->scissorInFixedPoint.right = 0;
-        pState->scissorInFixedPoint.top = 0;
-        pState->scissorInFixedPoint.bottom = 0;
-    }
-    else
-    {
-        pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
-        pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
-        pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
-        pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
+        // Clamp to max rect
+        scissorInFixedPoint &= g_MaxScissorRect;
+
+        // Test for tile alignment
+        // (done on the still-unscaled values, before the fixed-point multiply below)
+        bool tileAligned;
+        tileAligned  = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
+        tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
+        tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
+        tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;
+
+        pState->scissorsTileAligned &= tileAligned;
+
+        // Scale to fixed point
+        scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
+        scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
+        scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
+        scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
+
+        // Make scissor inclusive
+        // (subtracts one fixed-point unit from the max edges after scaling)
+        scissorInFixedPoint.xmax -= 1;
+        scissorInFixedPoint.ymax -= 1;
     }
 }
 
+
 // templated backend function tables
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
+
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
     DRAW_STATE* pState = pDC->pState;
     const SWR_RASTSTATE &rastState = pState->state.rastState;
     const SWR_PS_STATE &psState = pState->state.psState;
     BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
-    const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
 
     // setup backend
     if (psState.pfnPixelShader == nullptr)
@@ -790,10 +845,10 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     }
     else
     {
-        const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0;
+        const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
+        const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
         const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
-        const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
-     
+        const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
         
         // select backend function
@@ -804,7 +859,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             {
                 // always need to generate I & J per sample for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
+                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern][psState.inputCoverage]
+                                                                [centroid][forcedSampleCount][canEarlyZ]
+                    ;
             }
             else
             {
@@ -814,7 +871,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             }
             break;
         case SWR_SHADING_RATE_SAMPLE:
-            SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
+            SWR_ASSERT(rastState.bIsCenterPattern != true);
             // always need to generate I & J per sample for Z interpolation
             barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
             backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
@@ -824,13 +881,22 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             break;
         }
     }
-    
+
+    SWR_ASSERT(backendFuncs.pfnBackend);
+
     PFN_PROCESS_PRIMS pfnBinner;
+#if USE_SIMD16_FRONTEND
+    PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
+#endif
     switch (pState->state.topology)
     {
     case TOP_POINT_LIST:
         pState->pfnProcessPrims = ClipPoints;
         pfnBinner = BinPoints;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
+        pfnBinner_simd16 = BinPoints_simd16;
+#endif
         break;
     case TOP_LINE_LIST:
     case TOP_LINE_STRIP:
@@ -839,19 +905,32 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     case TOP_LISTSTRIP_ADJ:
         pState->pfnProcessPrims = ClipLines;
         pfnBinner = BinLines;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipLines_simd16;
+        pfnBinner_simd16 = BinLines_simd16;
+#endif
         break;
     default:
         pState->pfnProcessPrims = ClipTriangles;
         pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
+        pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
+#endif
         break;
     };
 
-    // disable clipper if viewport transform is disabled
+
+    // Disable clipper if viewport transform is disabled
     if (pState->state.frontendState.vpTransformDisable)
     {
         pState->pfnProcessPrims = pfnBinner;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
+#endif
     }
 
+    // Disable rasterizer and backend if no pixel, no depth/stencil, and no attributes
     if ((pState->state.psState.pfnPixelShader == nullptr) &&
         (pState->state.depthStencilState.depthTestEnable == FALSE) &&
         (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
@@ -860,13 +939,20 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
         (pState->state.backendState.numAttributes == 0))
     {
         pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = nullptr;
+#endif
     }
 
     if (pState->state.soState.rasterizerDisable == true)
     {
         pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = nullptr;
+#endif
     }
 
+
     // set up the frontend attribute count
     pState->state.feNumAttributes = 0;
     const SWR_BACKEND_STATE& backendState = pState->state.backendState;
@@ -902,9 +988,11 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
     pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
                                            !pState->state.depthStencilState.depthWriteEnable &&
+                                           !pState->state.depthBoundsState.depthBoundsTestEnable &&
                                            pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && 
                                         (pState->state.depthStencilState.depthTestEnable || 
-                                         pState->state.depthStencilState.depthWriteEnable)) ? true : false;
+                                         pState->state.depthStencilState.depthWriteEnable ||
+                                         pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false;
 
     pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
                                              !pState->state.depthStencilState.stencilWriteEnable &&
@@ -916,20 +1004,31 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
                                           (pState->state.depthStencilState.stencilTestEnable  ||
                                            pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
 
-    uint32_t numRTs = pState->state.psState.numRenderTargets;
-    pState->state.colorHottileEnable = 0;
+
+    uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
+
+    // Disable hottile for surfaces with no writes
     if (psState.pfnPixelShader != nullptr)
     {
-        for (uint32_t rt = 0; rt < numRTs; ++rt)
+        DWORD rt;
+        uint32_t rtMask = pState->state.psState.renderTargetMask;
+        while (_BitScanForward(&rt, rtMask))
         {
-            pState->state.colorHottileEnable |=  
-                (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
-                 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
+            rtMask &= ~(1 << rt);
+
+            if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
+                pState->state.blendState.renderTarget[rt].writeDisableRed &&
+                pState->state.blendState.renderTarget[rt].writeDisableGreen &&
+                pState->state.blendState.renderTarget[rt].writeDisableBlue)
+            {
+                hotTileEnable &= ~(1 << rt);
+            }
         }
     }
 
+    pState->state.colorHottileEnable = hotTileEnable;
+
+
     // Setup depth quantization function
     if (pState->state.depthHottileEnable)
     {
@@ -939,7 +1038,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
         case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
         case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
         case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
-        default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
+        default: SWR_INVALID("Unsupported depth format for depth quantiztion.");
             pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
         }
     }
@@ -963,6 +1062,8 @@ void InitDraw(
         SetupMacroTileScissors(pDC);
         SetupPipeline(pDC);
     }
+    
+
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1064,11 +1165,12 @@ void DrawInstanced(
         return;
     }
 
-    RDTSC_START(APIDraw);
-
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    RDTSC_BEGIN(APIDraw, pDC->drawId);
+    AR_API_EVENT(DrawInstancedEvent(pDC->drawId, ArchRast::Instanced, topology, numVertices, startVertex, numInstances, startInstance));
+
     uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
     uint32_t remainingVerts = numVertices;
@@ -1084,6 +1186,10 @@ void DrawInstanced(
         pState->rastState.cullMode = SWR_CULLMODE_NONE;
         pState->forceFront = true;
     }
+    else if (topology == TOP_RECT_LIST)
+    {
+        pState->rastState.cullMode = SWR_CULLMODE_NONE;
+    }
 
     int draw = 0;
     while (remainingVerts)
@@ -1115,6 +1221,8 @@ void DrawInstanced(
         //enqueue DC
         QueueDraw(pContext);
 
+        AR_API_EVENT(DrawInstancedSplitEvent(pDC->drawId, ArchRast::InstancedSplit));
+
         remainingVerts -= numVertsForDraw;
         draw++;
     }
@@ -1123,7 +1231,7 @@ void DrawInstanced(
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
 
-    RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
+    RDTSC_END(APIDraw, numVertices * numInstances);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1184,12 +1292,13 @@ void DrawIndexedInstance(
         return;
     }
 
-    RDTSC_START(APIDrawIndexed);
-
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
     API_STATE* pState = &pDC->pState->state;
 
+    RDTSC_BEGIN(APIDrawIndexed, pDC->drawId);
+    AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, ArchRast::IndexedInstancedSplit, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance));
+
     uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
     uint32_t remainingIndices = numIndices;
@@ -1201,7 +1310,7 @@ void DrawIndexedInstance(
     case R16_UINT: indexSize = sizeof(uint16_t); break;
     case R8_UINT: indexSize = sizeof(uint8_t); break;
     default:
-        SWR_ASSERT(0);
+        SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format);
     }
 
     int draw = 0;
@@ -1218,6 +1327,10 @@ void DrawIndexedInstance(
         pState->rastState.cullMode = SWR_CULLMODE_NONE;
         pState->forceFront = true;
     }
+    else if (topology == TOP_RECT_LIST)
+    {
+        pState->rastState.cullMode = SWR_CULLMODE_NONE;
+    }
 
     while (remainingIndices)
     {
@@ -1226,6 +1339,7 @@ void DrawIndexedInstance(
 
         // When breaking up draw, we need to obtain new draw context for each iteration.
         bool isSplitDraw = (draw > 0) ? true : false;
+
         pDC = GetDrawContext(pContext, isSplitDraw);
         InitDraw(pDC, isSplitDraw);
 
@@ -1252,16 +1366,18 @@ void DrawIndexedInstance(
         //enqueue DC
         QueueDraw(pContext);
 
+        AR_API_EVENT(DrawIndexedInstancedSplitEvent(pDC->drawId, ArchRast::IndexedInstancedSplit));
+
         pIB += maxIndicesPerDraw * indexSize;
         remainingIndices -= numIndicesForDraw;
         draw++;
     }
 
-    // restore culling state
+    // Restore culling state
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
-
-    RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
+    RDTSC_END(APIDrawIndexed, numIndices * numInstances);
 }
 
 
@@ -1308,9 +1424,12 @@ void SwrDrawIndexedInstanced(
 /// @brief SwrInvalidateTiles
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
-void SwrInvalidateTiles(
+/// @param invalidateRect - The pixel-coordinate rectangle to invalidate.  This will be expanded to
+///                         be hottile size-aligned.
+void SWR_API SwrInvalidateTiles(
     HANDLE hContext,
-    uint32_t attachmentMask)
+    uint32_t attachmentMask,
+    const SWR_RECT& invalidateRect)
 {
     if (KNOB_TOSS_DRAW)
     {
@@ -1323,24 +1442,28 @@ void SwrInvalidateTiles(
     pDC->FeWork.type = DISCARDINVALIDATETILES;
     pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
     pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
-    memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
+    pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect;
+    pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
     pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
     pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
     pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
 
     //enqueue
     QueueDraw(pContext);
+
+    AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId));
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrDiscardRect
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
-/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
-void SwrDiscardRect(
+/// @param rect - The pixel-coordinate rectangle to discard.  Only fully-covered hottiles will be
+///               discarded.
+void SWR_API SwrDiscardRect(
     HANDLE hContext,
     uint32_t attachmentMask,
-    SWR_RECT rect)
+    const SWR_RECT& rect)
 {
     if (KNOB_TOSS_DRAW)
     {
@@ -1355,12 +1478,15 @@ void SwrDiscardRect(
     pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
     pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
     pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
+    pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
     pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
     pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
     pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
 
     //enqueue
     QueueDraw(pContext);
+
+    AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId));
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1380,10 +1506,11 @@ void SwrDispatch(
         return;
     }
 
-    RDTSC_START(APIDispatch);
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    RDTSC_BEGIN(APIDispatch, pDC->drawId);
+    AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
     pDC->isCompute = true;      // This is a compute context.
 
     COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
@@ -1393,70 +1520,81 @@ void SwrDispatch(
     pTaskData->threadGroupCountZ = threadGroupCountZ;
 
     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
-    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+    uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
     pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
-    pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
+    pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
 
     QueueDispatch(pContext);
-    RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
+    RDTSC_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ);
 }
 
 // Deswizzles, converts and stores current contents of the hot tiles to surface
 // described by pState
-void SwrStoreTiles(
+void SWR_API SwrStoreTiles(
     HANDLE hContext,
-    SWR_RENDERTARGET_ATTACHMENT attachment,
-    SWR_TILE_STATE postStoreTileState)
+    uint32_t attachmentMask,
+    SWR_TILE_STATE postStoreTileState,
+    const SWR_RECT& storeRect)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    RDTSC_START(APIStoreTiles);
-
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    SetupMacroTileScissors(pDC);
+    RDTSC_BEGIN(APIStoreTiles, pDC->drawId);
 
     pDC->FeWork.type = STORETILES;
     pDC->FeWork.pfnWork = ProcessStoreTiles;
-    pDC->FeWork.desc.storeTiles.attachment = attachment;
+    pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask;
     pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
+    pDC->FeWork.desc.storeTiles.rect = storeRect;
+    pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect;
 
     //enqueue
     QueueDraw(pContext);
 
-    RDTSC_STOP(APIStoreTiles, 0, 0);
+    AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
+
+    RDTSC_END(APIStoreTiles, 1);
 }
 
-void SwrClearRenderTarget(
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
+/// @param renderTargetArrayIndex - the RT array index to clear
+/// @param clearColor - color used for clearing render targets
+/// @param z - depth value used for clearing the depth buffer
+/// @param stencil - stencil value used for clearing stencil buffer
+/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
+void SWR_API SwrClearRenderTarget(
     HANDLE hContext,
-    uint32_t clearMask,
+    uint32_t attachmentMask,
+    uint32_t renderTargetArrayIndex,
     const float clearColor[4],
     float z,
-    uint8_t stencil)
+    uint8_t stencil,
+    const SWR_RECT& clearRect)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    RDTSC_START(APIClearRenderTarget);
-
     SWR_CONTEXT *pContext = GetContext(hContext);
-
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    SetupMacroTileScissors(pDC);
-
-    CLEAR_FLAGS flags;
-    flags.mask = clearMask;
+    RDTSC_BEGIN(APIClearRenderTarget, pDC->drawId);
 
     pDC->FeWork.type = CLEAR;
     pDC->FeWork.pfnWork = ProcessClear;
-    pDC->FeWork.desc.clear.flags = flags;
+    pDC->FeWork.desc.clear.rect = clearRect;
+    pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
+    pDC->FeWork.desc.clear.attachmentMask = attachmentMask;
+    pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex;
     pDC->FeWork.desc.clear.clearDepth = z;
     pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
     pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
@@ -1467,7 +1605,7 @@ void SwrClearRenderTarget(
     // enqueue draw
     QueueDraw(pContext);
 
-    RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
+    RDTSC_END(APIClearRenderTarget, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1514,14 +1652,28 @@ VOID* SwrAllocDrawContextMemory(
 /// @brief Enables stats counting
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param enable - If true then counts are incremented.
-void SwrEnableStats(
+void SwrEnableStatsFE(
     HANDLE hContext,
     bool enable)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    pDC->pState->state.enableStats = enable;
+    pDC->pState->state.enableStatsFE = enable;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Enables BE stats counting (sets enableStatsBE on the current draw state)
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param enable - If true then counts are incremented.
+void SwrEnableStatsBE(
+    HANDLE hContext,
+    bool enable)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    pDC->pState->state.enableStatsBE = enable;
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1530,7 +1682,86 @@ void SwrEnableStats(
 void SWR_API SwrEndFrame(
     HANDLE hContext)
 {
-    RDTSC_ENDFRAME();
     SWR_CONTEXT *pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    (void)pDC; // var used
+
+    RDTSC_ENDFRAME();
+    AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId));
+
     pContext->frameCount++;
 }
+
+void InitSimLoadTilesTable();
+void InitSimStoreTilesTable();
+void InitSimClearTilesTable();
+
+void InitClearTilesTable();
+void InitBackendFuncTables();
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Initialize swr backend and memory internal tables
+void SwrInit()
+{
+    InitSimLoadTilesTable();
+    InitSimStoreTilesTable();
+    InitSimClearTilesTable();
+
+    InitClearTilesTable();
+    InitBackendFuncTables();
+    InitRasterizerFunctions();
+}
+
+void SwrGetInterface(SWR_INTERFACE &out_funcs)
+{
+    out_funcs.pfnSwrCreateContext = SwrCreateContext;
+    out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
+    out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
+    out_funcs.pfnSwrSaveState = SwrSaveState;
+    out_funcs.pfnSwrRestoreState = SwrRestoreState;
+    out_funcs.pfnSwrSync = SwrSync;
+    out_funcs.pfnSwrStallBE = SwrStallBE;
+    out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
+    out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
+    out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
+    out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer;
+    out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc;
+    out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc;
+    out_funcs.pfnSwrSetSoState = SwrSetSoState;
+    out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers;
+    out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc;
+    out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState;
+    out_funcs.pfnSwrSetGsState = SwrSetGsState;
+    out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc;
+    out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc;
+    out_funcs.pfnSwrSetTsState = SwrSetTsState;
+    out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc;
+    out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc;
+    out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState;
+    out_funcs.pfnSwrSetBackendState = SwrSetBackendState;
+    out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState;
+    out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState;
+    out_funcs.pfnSwrSetBlendState = SwrSetBlendState;
+    out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc;
+    out_funcs.pfnSwrDraw = SwrDraw;
+    out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced;
+    out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed;
+    out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced;
+    out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles;
+    out_funcs.pfnSwrDiscardRect = SwrDiscardRect;
+    out_funcs.pfnSwrDispatch = SwrDispatch;
+    out_funcs.pfnSwrStoreTiles = SwrStoreTiles;
+    out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget;
+    out_funcs.pfnSwrSetRastState = SwrSetRastState;
+    out_funcs.pfnSwrSetViewports = SwrSetViewports;
+    out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects;
+    out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
+    out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
+    out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE;
+    out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE;
+    out_funcs.pfnSwrEndFrame = SwrEndFrame;
+    out_funcs.pfnSwrInit = SwrInit;
+    out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile;
+    out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface;
+    out_funcs.pfnSwrStoreHotTileClear = SwrStoreHotTileClear;
+}