X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fswr%2Frasterizer%2Fcore%2Fapi.cpp;h=8dc9ac24a745cd4a4e11352c5fd35174a2ff191d;hb=f253798205a3ce7f577867a96ce487bf20e10909;hp=6f9c4027719ea7fee60a4af9f805d99fefa1cc6d;hpb=eb6b2b340e685e977148784f72289c919ce6c74f;p=mesa.git diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 6f9c4027719..8dc9ac24a74 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -1,5 +1,5 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -43,9 +43,10 @@ #include "core/clip.h" #include "core/utils.h" -#include "common/simdintrin.h" #include "common/os.h" +static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y }; + void SetupDefaultState(SWR_CONTEXT *pContext); static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) @@ -53,6 +54,11 @@ static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) return (SWR_CONTEXT*)hContext; } +void WakeAllThreads(SWR_CONTEXT *pContext) +{ + pContext->FifosNotEmpty.notify_all(); +} + ////////////////////////////////////////////////////////////////////////// /// @brief Create SWR Context. /// @param pCreateInfo - pointer to creation info. @@ -66,7 +72,6 @@ HANDLE SwrCreateContext( memset(pContextMem, 0, sizeof(SWR_CONTEXT)); SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); - pContext->driverType = pCreateInfo->driver; pContext->privateStateSize = pCreateInfo->privateStateSize; pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); @@ -84,25 +89,33 @@ HANDLE SwrCreateContext( pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } - if (!KNOB_SINGLE_THREADED) - { - memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); - memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); - new (&pContext->WaitLock) std::mutex(); - new (&pContext->FifosNotEmpty) std::condition_variable(); - - CreateThreadPool(pContext, &pContext->threadPool); - } + pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS; + pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES; + pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE; + pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE; + pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED; - // Calling createThreadPool() above can set SINGLE_THREADED - if (KNOB_SINGLE_THREADED) + if (pCreateInfo->pThreadInfo) { - SET_KNOB(HYPERTHREADED_FE, false); - pContext->NumWorkerThreads = 1; - pContext->NumFEThreads = 1; - pContext->NumBEThreads = 1; + pContext->threadInfo = *pCreateInfo->pThreadInfo; } + memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); + memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); + new (&pContext->WaitLock) std::mutex(); + new (&pContext->FifosNotEmpty) std::condition_variable(); + + CreateThreadPool(pContext, &pContext->threadPool); + + pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads]; + pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64); + +#if defined(KNOB_ENABLE_AR) + // Setup ArchRast thread contexts which includes +1 for API thread. + pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1]; + pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API); +#endif + // Allocate scratch space for workers. ///@note We could lazily allocate this but its rather small amount of memory. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) @@ -110,12 +123,17 @@ HANDLE SwrCreateContext( #if defined(_WIN32) uint32_t numaNode = pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0; - pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma( + pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma( GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, numaNode); #else - pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); + pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); +#endif + +#if defined(KNOB_ENABLE_AR) + // Initialize worker thread context for ArchRast. + pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER); #endif } @@ -125,13 +143,14 @@ HANDLE SwrCreateContext( // initialize hot tile manager pContext->pHotTileMgr = new HotTileMgr(); - // initialize function pointer tables - InitClearTilesTable(); - - // initialize store tiles function + // initialize callback functions pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; pContext->pfnClearTile = pCreateInfo->pfnClearTile; + pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset; + pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats; + pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE; + // pass pointer to bucket manager back to caller #ifdef KNOB_ENABLE_RDTSC @@ -140,40 +159,9 @@ HANDLE SwrCreateContext( pCreateInfo->contextSaveSize = sizeof(API_STATE); - return (HANDLE)pContext; -} - -void SwrDestroyContext(HANDLE hContext) -{ - SWR_CONTEXT *pContext = GetContext(hContext); - DestroyThreadPool(pContext, &pContext->threadPool); - - // free the fifos - for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) - { - delete pContext->dcRing[i].pArena; - delete pContext->dsRing[i].pArena; - pContext->pMacroTileManagerArray[i].~MacroTileMgr(); - pContext->pDispatchQueueArray[i].~DispatchQueue(); - } - - AlignedFree(pContext->pDispatchQueueArray); - AlignedFree(pContext->pMacroTileManagerArray); - - // Free scratch space. - for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) - { -#if defined(_WIN32) - VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE); -#else - AlignedFree(pContext->pScratch[i]); -#endif - } + StartThreadPool(pContext, &pContext->threadPool); - delete(pContext->pHotTileMgr); - - pContext->~SWR_CONTEXT(); - AlignedFree(GetContext(hContext)); + return (HANDLE)pContext; } void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) @@ -181,13 +169,6 @@ void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) memcpy(&dst.state, &src.state, sizeof(API_STATE)); } -void WakeAllThreads(SWR_CONTEXT *pContext) -{ - pContext->FifosNotEmpty.notify_all(); -} - -static TileSet gSingleThreadLockedTiles; - template void QueueWork(SWR_CONTEXT *pContext) { @@ -206,13 +187,18 @@ void QueueWork(SWR_CONTEXT *pContext) // then moved on if all work is done.) pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads; + if (IsDraw) + { + InterlockedIncrement((volatile long*)&pContext->drawsOutstandingFE); + } + _ReadWriteBarrier(); { std::unique_lock lock(pContext->WaitLock); pContext->dcRing.Enqueue(); } - if (KNOB_SINGLE_THREADED) + if (pContext->threadInfo.SINGLE_THREADED) { // flush denormals to 0 uint32_t mxcsr = _mm_getcsr(); @@ -222,7 +208,7 @@ void QueueWork(SWR_CONTEXT *pContext) { uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; WorkOnFifoFE(pContext, 0, curDraw[0]); - WorkOnFifoBE(pContext, 0, curDraw[1], gSingleThreadLockedTiles, 0, 0); + WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0); } else { @@ -238,9 +224,9 @@ void QueueWork(SWR_CONTEXT *pContext) } else { - RDTSC_START(APIDrawWakeAllThreads); + AR_API_BEGIN(APIDrawWakeAllThreads, pDC->drawId); WakeAllThreads(pContext); - RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); + AR_API_END(APIDrawWakeAllThreads, 1); } // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. @@ -260,7 +246,7 @@ INLINE void QueueDispatch(SWR_CONTEXT* pContext) DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) { - RDTSC_START(APIGetDrawContext); + AR_API_BEGIN(APIGetDrawContext, 0); // If current draw context is null then need to obtain a new draw context to use from ring. if (pContext->pCurDrawContext == nullptr) { @@ -273,16 +259,14 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) uint64_t curDraw = pContext->dcRing.GetHead(); uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; - static uint64_t lastDrawChecked; - static uint32_t lastFrameChecked; - if ((pContext->frameCount - lastFrameChecked) > 2 || - (curDraw - lastDrawChecked) > 0x10000) + if ((pContext->frameCount - pContext->lastFrameChecked) > 2 || + (curDraw - pContext->lastDrawChecked) > 0x10000) { // Take this opportunity to clean-up old arena allocations pContext->cachingArenaAllocator.FreeOldBlocks(); - lastFrameChecked = pContext->frameCount; - lastDrawChecked = curDraw; + pContext->lastFrameChecked = pContext->frameCount; + pContext->lastDrawChecked = curDraw; } DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; @@ -327,13 +311,19 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true); + // Reset dependency pCurDrawContext->dependent = false; + pCurDrawContext->dependentFE = false; + pCurDrawContext->pContext = pContext; pCurDrawContext->isCompute = false; // Dispatch has to set this to true. pCurDrawContext->doneFE = false; pCurDrawContext->FeLock = 0; pCurDrawContext->threadsDone = 0; + pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr; + + pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads); // Assign unique drawId for this DC pCurDrawContext->drawId = pContext->dcRing.GetHead(); @@ -345,7 +335,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); } - RDTSC_STOP(APIGetDrawContext, 0, 0); + AR_API_END(APIGetDrawContext, 0); return pContext->pCurDrawContext; } @@ -357,6 +347,55 @@ API_STATE* GetDrawState(SWR_CONTEXT *pContext) return &pDC->pState->state; } +void SwrDestroyContext(HANDLE hContext) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + pDC->FeWork.type = SHUTDOWN; + pDC->FeWork.pfnWork = ProcessShutdown; + + //enqueue + QueueDraw(pContext); + + DestroyThreadPool(pContext, &pContext->threadPool); + + // free the fifos + for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) + { + AlignedFree(pContext->dcRing[i].dynState.pStats); + delete pContext->dcRing[i].pArena; + delete pContext->dsRing[i].pArena; + pContext->pMacroTileManagerArray[i].~MacroTileMgr(); + pContext->pDispatchQueueArray[i].~DispatchQueue(); + } + + AlignedFree(pContext->pDispatchQueueArray); + AlignedFree(pContext->pMacroTileManagerArray); + + // Free scratch space. + for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) + { +#if defined(_WIN32) + VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE); +#else + AlignedFree(pContext->ppScratch[i]); +#endif + +#if defined(KNOB_ENABLE_AR) + ArchRast::DestroyThreadContext(pContext->pArContext[i]); +#endif + } + + delete[] pContext->ppScratch; + AlignedFree(pContext->pStats); + + delete(pContext->pHotTileMgr); + + pContext->~SWR_CONTEXT(); + AlignedFree(GetContext(hContext)); +} + void SWR_API SwrSaveState( HANDLE hContext, void* pOutputStateBlock, @@ -387,45 +426,64 @@ void SetupDefaultState(SWR_CONTEXT *pContext) pState->rastState.cullMode = SWR_CULLMODE_NONE; pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; + + pState->depthBoundsState.depthBoundsTestEnable = false; + pState->depthBoundsState.depthBoundsTestMinValue = 0.0f; + pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f; } void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3) { - RDTSC_START(APISync); - SWR_ASSERT(pfnFunc != nullptr); SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); + AR_API_BEGIN(APISync, 0); + pDC->FeWork.type = SYNC; pDC->FeWork.pfnWork = ProcessSync; - pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc; - pDC->FeWork.desc.sync.userData = userData; - pDC->FeWork.desc.sync.userData2 = userData2; - pDC->FeWork.desc.sync.userData3 = userData3; - // cannot execute until all previous draws have completed - pDC->dependent = true; + // Setup callback function + pDC->retireCallback.pfnCallbackFunc = pfnFunc; + pDC->retireCallback.userData = userData; + pDC->retireCallback.userData2 = userData2; + pDC->retireCallback.userData3 = userData3; + + AR_API_EVENT(SwrSyncEvent(pDC->drawId)); //enqueue QueueDraw(pContext); - RDTSC_STOP(APISync, 1, 0); + AR_API_END(APISync, 1); } void SwrWaitForIdle(HANDLE hContext) { SWR_CONTEXT *pContext = GetContext(hContext); - RDTSC_START(APIWaitForIdle); + AR_API_BEGIN(APIWaitForIdle, 0); while (!pContext->dcRing.IsEmpty()) { _mm_pause(); } - RDTSC_STOP(APIWaitForIdle, 1, 0); + AR_API_END(APIWaitForIdle, 1); +} + +void SwrWaitForIdleFE(HANDLE hContext) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + + AR_API_BEGIN(APIWaitForIdle, 0); + + while (pContext->drawsOutstandingFE > 0) + { + _mm_pause(); + } + + AR_API_END(APIWaitForIdle, 1); } void SwrSetVertexBuffers( @@ -530,12 +588,16 @@ void SwrSetCsFunc( HANDLE hContext, PFN_CS_FUNC pfnCsFunc, uint32_t totalThreadsInGroup, - uint32_t totalSpillFillSize) + uint32_t totalSpillFillSize, + uint32_t scratchSpaceSizePerInstance, + uint32_t numInstances) { API_STATE* pState = GetDrawState(GetContext(hContext)); pState->pfnCsFunc = pfnCsFunc; pState->totalThreadsInGroup = totalThreadsInGroup; pState->totalSpillFillSize = totalSpillFillSize; + pState->scratchSpaceSize = scratchSpaceSizePerInstance; + pState->scratchSpaceNumInstances = numInstances; } void SwrSetTsState( @@ -580,6 +642,15 @@ void SwrSetBackendState( pState->backendState = *pBEState; } +void SwrSetDepthBoundsState( + HANDLE hContext, + SWR_DEPTH_BOUNDS_STATE *pDBState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->depthBoundsState = *pDBState; +} + void SwrSetPixelShaderState( HANDLE hContext, SWR_PS_STATE *pPSState) @@ -606,39 +677,19 @@ void SwrSetBlendFunc( pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; } -void SwrSetLinkage( - HANDLE hContext, - uint32_t mask, - const uint8_t* pMap) +// update guardband multipliers for the viewport +void updateGuardbands(API_STATE *pState) { - API_STATE* pState = GetDrawState(GetContext(hContext)); - - static const uint8_t IDENTITY_MAP[] = - { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - }; - static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap), - "Update for new value of MAX_ATTRIBUTES"); - - pState->linkageMask = mask; - pState->linkageCount = _mm_popcnt_u32(mask); + uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; - if (!pMap) + for(uint32_t i = 0; i < numGbs; ++i) { - pMap = IDENTITY_MAP; + // guardband center is viewport center + pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; + pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width; + pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; + pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height; } - memcpy(pState->linkageMap, pMap, pState->linkageCount); -} - -// update guardband multipliers for the viewport -void updateGuardband(API_STATE *pState) -{ - // guardband center is viewport center - pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; - pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; - pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; - pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; } void SwrSetRastState( @@ -655,7 +706,7 @@ void SwrSetViewports( HANDLE hContext, uint32_t numViewports, const SWR_VIEWPORT* pViewports, - const SWR_VIEWPORT_MATRIX* pMatrices) + const SWR_VIEWPORT_MATRICES* pMatrices) { SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports."); @@ -664,111 +715,82 @@ void SwrSetViewports( API_STATE* pState = GetDrawState(pContext); memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); + // @todo Faster to copy portions of the SOA or just copy all of it? + memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES)); - if (pMatrices != nullptr) - { - memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports); - } - else - { - // Compute default viewport transform. - for (uint32_t i = 0; i < numViewports; ++i) - { - if (pContext->driverType == DX) - { - pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f; - pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f; - pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ; - pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; - pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11; - pState->vpMatrix[i].m32 = pState->vp[i].minZ; - } - else - { - // Standard, with the exception that Y is inverted. - pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f; - pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f; - pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f; - pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; - pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11; - pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22; - - // Now that the matrix is calculated, clip the view coords to screen size. - // OpenGL allows for -ve x,y in the viewport. - pState->vp[i].x = std::max(pState->vp[i].x, 0.0f); - pState->vp[i].y = std::max(pState->vp[i].y, 0.0f); - } - } - } - - updateGuardband(pState); + updateGuardbands(pState); } void SwrSetScissorRects( HANDLE hContext, uint32_t numScissors, - const BBOX* pScissors) + const SWR_RECT* pScissors) { SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects."); API_STATE* pState = GetDrawState(GetContext(hContext)); - memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX)); + memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0])); }; void SetupMacroTileScissors(DRAW_CONTEXT *pDC) { API_STATE *pState = &pDC->pState->state; - uint32_t left, right, top, bottom; + uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; + pState->scissorsTileAligned = true; - // Set up scissor dimensions based on scissor or viewport - if (pState->rastState.scissorEnable) + for (uint32_t index = 0; index < numScissors; ++index) { - // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges - left = pState->scissorRects[0].left; - right = pState->scissorRects[0].right; - top = pState->scissorRects[0].top; - bottom = pState->scissorRects[0].bottom; - } - else - { - left = (int32_t)pState->vp[0].x; - right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width; - top = (int32_t)pState->vp[0].y; - bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height; - } + SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index]; + + // Set up scissor dimensions based on scissor or viewport + if (pState->rastState.scissorEnable) + { + scissorInFixedPoint = pState->scissorRects[index]; + } + else + { + // the vp width and height must be added to origin un-rounded then the result round to -inf. + // The cast to int works for rounding assuming all [left, right, top, bottom] are positive. + scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x; + scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width); + scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y; + scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height); + } - right = std::min(right, KNOB_MAX_SCISSOR_X); - bottom = std::min(bottom, KNOB_MAX_SCISSOR_Y); + // Clamp to max rect + scissorInFixedPoint &= g_MaxScissorRect; - if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y) - { - pState->scissorInFixedPoint.left = 0; - pState->scissorInFixedPoint.right = 0; - pState->scissorInFixedPoint.top = 0; - pState->scissorInFixedPoint.bottom = 0; - } - else - { - pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE; - pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1; - pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE; - pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; + // Test for tile alignment + bool tileAligned; + tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0; + tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0; + tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0; + tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0; + + pState->scissorsTileAligned &= tileAligned; + + // Scale to fixed point + scissorInFixedPoint.xmin *= FIXED_POINT_SCALE; + scissorInFixedPoint.xmax *= FIXED_POINT_SCALE; + scissorInFixedPoint.ymin *= FIXED_POINT_SCALE; + scissorInFixedPoint.ymax *= FIXED_POINT_SCALE; + + // Make scissor inclusive + scissorInFixedPoint.xmax -= 1; + scissorInFixedPoint.ymax -= 1; } } + // templated backend function tables -extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2]; -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2][2]; -extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; + void SetupPipeline(DRAW_CONTEXT *pDC) { DRAW_STATE* pState = pDC->pState; const SWR_RASTSTATE &rastState = pState->state.rastState; const SWR_PS_STATE &psState = pState->state.psState; BACKEND_FUNCS& backendFuncs = pState->backendFuncs; - const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0; // setup backend if (psState.pfnPixelShader == nullptr) @@ -777,11 +799,10 @@ void SetupPipeline(DRAW_CONTEXT *pDC) } else { - const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0; + const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0; + const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0; const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; - const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0; - const uint32_t inputCoverage = (psState.inputCoverage != SWR_INPUT_COVERAGE_NONE); - + const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0; SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; // select backend function @@ -792,33 +813,44 @@ void SetupPipeline(DRAW_CONTEXT *pDC) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][inputCoverage][centroid][forcedSampleCount][canEarlyZ]; + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern][psState.inputCoverage] + [centroid][forcedSampleCount][canEarlyZ] + ; } else { // always need to generate I & J per pixel for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = gBackendSingleSample[inputCoverage][centroid][canEarlyZ]; + backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ]; } break; case SWR_SHADING_RATE_SAMPLE: - SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); + SWR_ASSERT(rastState.bIsCenterPattern != true); // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][inputCoverage][centroid][canEarlyZ]; + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ]; break; default: SWR_ASSERT(0 && "Invalid shading rate"); break; } } - + + SWR_ASSERT(backendFuncs.pfnBackend); + PFN_PROCESS_PRIMS pfnBinner; +#if USE_SIMD16_FRONTEND + PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16; +#endif switch (pState->state.topology) { case TOP_POINT_LIST: pState->pfnProcessPrims = ClipPoints; pfnBinner = BinPoints; +#if USE_SIMD16_FRONTEND + pState->pfnProcessPrims_simd16 = ClipPoints_simd16; + pfnBinner_simd16 = BinPoints_simd16; +#endif break; case TOP_LINE_LIST: case TOP_LINE_STRIP: @@ -827,17 +859,29 @@ void SetupPipeline(DRAW_CONTEXT *pDC) case TOP_LISTSTRIP_ADJ: pState->pfnProcessPrims = ClipLines; pfnBinner = BinLines; +#if USE_SIMD16_FRONTEND + pState->pfnProcessPrims_simd16 = ClipLines_simd16; + pfnBinner_simd16 = BinLines_simd16; +#endif break; default: pState->pfnProcessPrims = ClipTriangles; pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0)); +#if USE_SIMD16_FRONTEND + pState->pfnProcessPrims_simd16 = ClipTriangles_simd16; + pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0)); +#endif break; }; + // disable clipper if viewport transform is disabled if (pState->state.frontendState.vpTransformDisable) { pState->pfnProcessPrims = pfnBinner; +#if USE_SIMD16_FRONTEND + pState->pfnProcessPrims_simd16 = pfnBinner_simd16; +#endif } if ((pState->state.psState.pfnPixelShader == nullptr) && @@ -845,25 +889,51 @@ void SetupPipeline(DRAW_CONTEXT *pDC) (pState->state.depthStencilState.depthWriteEnable == FALSE) && (pState->state.depthStencilState.stencilTestEnable == FALSE) && (pState->state.depthStencilState.stencilWriteEnable == FALSE) && - (pState->state.linkageCount == 0)) + (pState->state.backendState.numAttributes == 0)) { pState->pfnProcessPrims = nullptr; - pState->state.linkageMask = 0; +#if USE_SIMD16_FRONTEND + pState->pfnProcessPrims_simd16 = nullptr; +#endif } if (pState->state.soState.rasterizerDisable == true) { pState->pfnProcessPrims = nullptr; - pState->state.linkageMask = 0; +#if USE_SIMD16_FRONTEND + pState->pfnProcessPrims_simd16 = nullptr; +#endif + } + + + // set up the frontend attribute count + pState->state.feNumAttributes = 0; + const SWR_BACKEND_STATE& backendState = pState->state.backendState; + if (backendState.swizzleEnable) + { + // attribute swizzling is enabled, iterate over the map and record the max attribute used + for (uint32_t i = 0; i < backendState.numAttributes; ++i) + { + pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1); + } + } + else + { + pState->state.feNumAttributes = pState->state.backendState.numAttributes; } - // set up the frontend attrib mask - pState->state.feAttribMask = pState->state.linkageMask; if (pState->state.soState.soEnable) { + uint32_t streamMasks = 0; for (uint32_t i = 0; i < 4; ++i) { - pState->state.feAttribMask |= pState->state.soState.streamMasks[i]; + streamMasks |= pState->state.soState.streamMasks[i]; + } + + DWORD maxAttrib; + if (_BitScanReverse(&maxAttrib, streamMasks)) + { + pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1)); } } @@ -871,9 +941,11 @@ void SetupPipeline(DRAW_CONTEXT *pDC) // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled. pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable && !pState->state.depthStencilState.depthWriteEnable && + !pState->state.depthBoundsState.depthBoundsTestEnable && pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && (pState->state.depthStencilState.depthTestEnable || - pState->state.depthStencilState.depthWriteEnable)) ? true : false; + pState->state.depthStencilState.depthWriteEnable || + pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false; pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable && !pState->state.depthStencilState.stencilWriteEnable && @@ -885,20 +957,31 @@ void SetupPipeline(DRAW_CONTEXT *pDC) (pState->state.depthStencilState.stencilTestEnable || pState->state.depthStencilState.stencilWriteEnable)) ? true : false; - uint32_t numRTs = pState->state.psState.numRenderTargets; - pState->state.colorHottileEnable = 0; + + uint32_t hotTileEnable = pState->state.psState.renderTargetMask; + + // Disable hottile for surfaces with no writes if (psState.pfnPixelShader != nullptr) { - for (uint32_t rt = 0; rt < numRTs; ++rt) + DWORD rt; + uint32_t rtMask = pState->state.psState.renderTargetMask; + while (_BitScanForward(&rt, rtMask)) { - pState->state.colorHottileEnable |= - (!pState->state.blendState.renderTarget[rt].writeDisableAlpha || - !pState->state.blendState.renderTarget[rt].writeDisableRed || - !pState->state.blendState.renderTarget[rt].writeDisableGreen || - !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; + rtMask &= ~(1 << rt); + + if (pState->state.blendState.renderTarget[rt].writeDisableAlpha && + pState->state.blendState.renderTarget[rt].writeDisableRed && + pState->state.blendState.renderTarget[rt].writeDisableGreen && + pState->state.blendState.renderTarget[rt].writeDisableBlue) + { + hotTileEnable &= ~(1 << rt); + } } } + pState->state.colorHottileEnable = hotTileEnable; + + // Setup depth quantization function if (pState->state.depthHottileEnable) { @@ -908,7 +991,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC) case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break; case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break; case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break; - default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion."); + default: SWR_INVALID("Unsupported depth format for depth quantiztion."); pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; } } @@ -932,6 +1015,8 @@ void InitDraw( SetupMacroTileScissors(pDC); SetupPipeline(pDC); } + + } ////////////////////////////////////////////////////////////////////////// @@ -1033,11 +1118,12 @@ void DrawInstanced( return; } - RDTSC_START(APIDraw); - SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); + AR_API_BEGIN(APIDraw, pDC->drawId); + AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertices, startVertex, numInstances, startInstance)); + uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); uint32_t remainingVerts = numVertices; @@ -1053,6 +1139,10 @@ void DrawInstanced( pState->rastState.cullMode = SWR_CULLMODE_NONE; pState->forceFront = true; } + else if (topology == TOP_RECT_LIST) + { + pState->rastState.cullMode = SWR_CULLMODE_NONE; + } int draw = 0; while (remainingVerts) @@ -1084,6 +1174,8 @@ void DrawInstanced( //enqueue DC QueueDraw(pContext); + AR_API_EVENT(DrawInstancedSplitEvent(pDC->drawId)); + remainingVerts -= numVertsForDraw; draw++; } @@ -1092,7 +1184,7 @@ void DrawInstanced( pDC = GetDrawContext(pContext); pDC->pState->state.rastState.cullMode = oldCullMode; - RDTSC_STOP(APIDraw, numVertices * numInstances, 0); + AR_API_END(APIDraw, numVertices * numInstances); } ////////////////////////////////////////////////////////////////////////// @@ -1153,12 +1245,13 @@ void DrawIndexedInstance( return; } - RDTSC_START(APIDrawIndexed); - SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); API_STATE* pState = &pDC->pState->state; + AR_API_BEGIN(APIDrawIndexed, pDC->drawId); + AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance)); + uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); uint32_t remainingIndices = numIndices; @@ -1170,7 +1263,7 @@ void DrawIndexedInstance( case R16_UINT: indexSize = sizeof(uint16_t); break; case R8_UINT: indexSize = sizeof(uint8_t); break; default: - SWR_ASSERT(0); + SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format); } int draw = 0; @@ -1187,6 +1280,10 @@ void DrawIndexedInstance( pState->rastState.cullMode = SWR_CULLMODE_NONE; pState->forceFront = true; } + else if (topology == TOP_RECT_LIST) + { + pState->rastState.cullMode = SWR_CULLMODE_NONE; + } while (remainingIndices) { @@ -1195,6 +1292,7 @@ void DrawIndexedInstance( // When breaking up draw, we need to obtain new draw context for each iteration. bool isSplitDraw = (draw > 0) ? true : false; + pDC = GetDrawContext(pContext, isSplitDraw); InitDraw(pDC, isSplitDraw); @@ -1221,16 +1319,18 @@ void DrawIndexedInstance( //enqueue DC QueueDraw(pContext); + AR_API_EVENT(DrawIndexedInstancedSplitEvent(pDC->drawId)); + pIB += maxIndicesPerDraw * indexSize; remainingIndices -= numIndicesForDraw; draw++; } - // restore culling state + // Restore culling state pDC = GetDrawContext(pContext); pDC->pState->state.rastState.cullMode = oldCullMode; - - RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0); + + AR_API_END(APIDrawIndexed, numIndices * numInstances); } @@ -1277,9 +1377,12 @@ void SwrDrawIndexedInstanced( /// @brief SwrInvalidateTiles /// @param hContext - Handle passed back from SwrCreateContext /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. -void SwrInvalidateTiles( +/// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to +/// be hottile size-aligned. +void SWR_API SwrInvalidateTiles( HANDLE hContext, - uint32_t attachmentMask) + uint32_t attachmentMask, + const SWR_RECT& invalidateRect) { if (KNOB_TOSS_DRAW) { @@ -1292,24 +1395,28 @@ void SwrInvalidateTiles( pDC->FeWork.type = DISCARDINVALIDATETILES; pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; - memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT)); + pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect; + pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false; pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; //enqueue QueueDraw(pContext); + + AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId)); } ////////////////////////////////////////////////////////////////////////// /// @brief SwrDiscardRect /// @param hContext - Handle passed back from SwrCreateContext /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. -/// @param rect - if rect is all zeros, the entire attachment surface will be discarded -void SwrDiscardRect( +/// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be +/// discarded. +void SWR_API SwrDiscardRect( HANDLE hContext, uint32_t attachmentMask, - SWR_RECT rect) + const SWR_RECT& rect) { if (KNOB_TOSS_DRAW) { @@ -1324,12 +1431,15 @@ void SwrDiscardRect( pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; pDC->FeWork.desc.discardInvalidateTiles.rect = rect; + pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect; pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true; pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; //enqueue QueueDraw(pContext); + + AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId)); } ////////////////////////////////////////////////////////////////////////// @@ -1349,10 +1459,11 @@ void SwrDispatch( return; } - RDTSC_START(APIDispatch); SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); + AR_API_BEGIN(APIDispatch, pDC->drawId); + AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ)); pDC->isCompute = true; // This is a compute context. COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); @@ -1364,68 +1475,79 @@ void SwrDispatch( uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; - pDC->pDispatch->initialize(totalThreadGroups, pTaskData); + pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE); QueueDispatch(pContext); - RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0); + AR_API_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ); } // Deswizzles, converts and stores current contents of the hot tiles to surface // described by pState -void SwrStoreTiles( +void SWR_API SwrStoreTiles( HANDLE hContext, - SWR_RENDERTARGET_ATTACHMENT attachment, - SWR_TILE_STATE postStoreTileState) + uint32_t attachmentMask, + SWR_TILE_STATE postStoreTileState, + const SWR_RECT& storeRect) { if (KNOB_TOSS_DRAW) { return; } - RDTSC_START(APIStoreTiles); - SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); - SetupMacroTileScissors(pDC); + AR_API_BEGIN(APIStoreTiles, pDC->drawId); pDC->FeWork.type = STORETILES; pDC->FeWork.pfnWork = ProcessStoreTiles; - pDC->FeWork.desc.storeTiles.attachment = attachment; + pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask; pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; + pDC->FeWork.desc.storeTiles.rect = storeRect; + pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect; //enqueue QueueDraw(pContext); - RDTSC_STOP(APIStoreTiles, 0, 0); + AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId)); + + AR_API_END(APIStoreTiles, 1); } -void SwrClearRenderTarget( +////////////////////////////////////////////////////////////////////////// +/// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil +/// @param hContext - Handle passed back from SwrCreateContext +/// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear +/// @param renderTargetArrayIndex - the RT array index to clear +/// @param clearColor - color use for clearing render targets +/// @param z - depth value use for clearing depth buffer +/// @param stencil - stencil value used for clearing stencil buffer +/// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers +void SWR_API SwrClearRenderTarget( HANDLE hContext, - uint32_t clearMask, + uint32_t attachmentMask, + uint32_t renderTargetArrayIndex, const float clearColor[4], float z, - uint8_t stencil) + uint8_t stencil, + const SWR_RECT& clearRect) { if (KNOB_TOSS_DRAW) { return; } - RDTSC_START(APIClearRenderTarget); - SWR_CONTEXT *pContext = GetContext(hContext); - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - SetupMacroTileScissors(pDC); - - CLEAR_FLAGS flags; - flags.mask = clearMask; + AR_API_BEGIN(APIClearRenderTarget, pDC->drawId); pDC->FeWork.type = CLEAR; pDC->FeWork.pfnWork = ProcessClear; - pDC->FeWork.desc.clear.flags = flags; + pDC->FeWork.desc.clear.rect = clearRect; + pDC->FeWork.desc.clear.rect &= g_MaxScissorRect; + pDC->FeWork.desc.clear.attachmentMask = attachmentMask; + pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex; pDC->FeWork.desc.clear.clearDepth = z; pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1]; @@ -1436,7 +1558,7 @@ void SwrClearRenderTarget( // enqueue draw QueueDraw(pContext); - RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId); + AR_API_END(APIClearRenderTarget, 1); } ////////////////////////////////////////////////////////////////////////// @@ -1480,44 +1602,31 @@ VOID* SwrAllocDrawContextMemory( } ////////////////////////////////////////////////////////////////////////// -/// @brief Returns pointer to SWR stats. -/// @note The counters are atomically incremented by multiple threads. -/// When calling this, you need to ensure all previous operations -/// have completed. -/// @todo If necessary, add a callback to avoid stalling the pipe to -/// sample the counters. +/// @brief Enables stats counting /// @param hContext - Handle passed back from SwrCreateContext -/// @param pStats - SWR will fill this out for caller. -void SwrGetStats( +/// @param enable - If true then counts are incremented. +void SwrEnableStatsFE( HANDLE hContext, - SWR_STATS* pStats) + bool enable) { SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); - pDC->FeWork.type = QUERYSTATS; - pDC->FeWork.pfnWork = ProcessQueryStats; - pDC->FeWork.desc.queryStats.pStats = pStats; - - // cannot execute until all previous draws have completed - pDC->dependent = true; - - //enqueue - QueueDraw(pContext); + pDC->pState->state.enableStatsFE = enable; } ////////////////////////////////////////////////////////////////////////// /// @brief Enables stats counting /// @param hContext - Handle passed back from SwrCreateContext /// @param enable - If true then counts are incremented. -void SwrEnableStats( +void SwrEnableStatsBE( HANDLE hContext, bool enable) { SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); - pDC->pState->state.enableStats = enable; + pDC->pState->state.enableStatsBE = enable; } ////////////////////////////////////////////////////////////////////////// @@ -1526,7 +1635,84 @@ void SwrEnableStats( void SWR_API SwrEndFrame( HANDLE hContext) { - RDTSC_ENDFRAME(); SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + (void)pDC; // var used + + RDTSC_ENDFRAME(); + AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId)); + pContext->frameCount++; } + +void InitSimLoadTilesTable(); +void InitSimStoreTilesTable(); +void InitSimClearTilesTable(); + +void InitClearTilesTable(); +void InitBackendFuncTables(); + +////////////////////////////////////////////////////////////////////////// +/// @brief Initialize swr backend and memory internal tables +void SwrInit() +{ + InitSimLoadTilesTable(); + InitSimStoreTilesTable(); + InitSimClearTilesTable(); + + InitClearTilesTable(); + InitBackendFuncTables(); + InitRasterizerFunctions(); +} + +void SwrGetInterface(SWR_INTERFACE &out_funcs) +{ + out_funcs.pfnSwrCreateContext = SwrCreateContext; + out_funcs.pfnSwrDestroyContext = SwrDestroyContext; + out_funcs.pfnSwrSaveState = SwrSaveState; + out_funcs.pfnSwrRestoreState = SwrRestoreState; + out_funcs.pfnSwrSync = SwrSync; + out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle; + out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE; + out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers; + out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer; + out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc; + out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc; + out_funcs.pfnSwrSetSoState = SwrSetSoState; + out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers; + out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc; + out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState; + out_funcs.pfnSwrSetGsState = SwrSetGsState; + out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc; + out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc; + out_funcs.pfnSwrSetTsState = SwrSetTsState; + out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc; + out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc; + out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState; + out_funcs.pfnSwrSetBackendState = SwrSetBackendState; + out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState; + out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState; + out_funcs.pfnSwrSetBlendState = SwrSetBlendState; + out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc; + out_funcs.pfnSwrDraw = SwrDraw; + out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced; + out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed; + out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced; + out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles; + out_funcs.pfnSwrDiscardRect = SwrDiscardRect; + out_funcs.pfnSwrDispatch = SwrDispatch; + out_funcs.pfnSwrStoreTiles = SwrStoreTiles; + out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget; + out_funcs.pfnSwrSetRastState = SwrSetRastState; + out_funcs.pfnSwrSetViewports = SwrSetViewports; + out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects; + out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState; + out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory; + out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE; + out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE; + out_funcs.pfnSwrEndFrame = SwrEndFrame; + out_funcs.pfnSwrInit = SwrInit; + out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile; + out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface; + out_funcs.pfnSwrStoreHotTileClear = SwrStoreHotTileClear; +}