/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file frontend.cpp
-*
-* @brief Implementation for Frontend which handles vertex processing,
-* primitive assembly, clipping, binning, etc.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file frontend.cpp
+ *
+ * @brief Implementation for Frontend which handles vertex processing,
+ * primitive assembly, clipping, binning, etc.
+ *
+ ******************************************************************************/
#include "api.h"
#include "frontend.h"
#include "backend.h"
#include "context.h"
#include "rdtsc_core.h"
-#include "rasterizer.h"
-#include "conservativeRast.h"
#include "utils.h"
#include "threads.h"
#include "pa.h"
#include "tilemgr.h"
#include "tessellator.h"
#include <limits>
+#include <iostream>
//////////////////////////////////////////////////////////////////////////
/// @brief Helper macro to generate a bitmask
static INLINE uint32_t GenMask(uint32_t numBits)
{
- SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
+ SWR_ASSERT(
+ numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
return ((1U << numBits) - 1);
}
-//////////////////////////////////////////////////////////////////////////
-/// @brief Offsets added to post-viewport vertex positions based on
-/// raster state.
-static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
-{
- _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
- _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
-};
-
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
-void ProcessSync(
- SWR_CONTEXT *pContext,
- DRAW_CONTEXT *pDC,
- uint32_t workerId,
- void *pUserData)
+void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
BE_WORK work;
- work.type = SYNC;
+ work.type = SYNC;
work.pfnWork = ProcessSyncBE;
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
+ MacroTileMgr* pTileMgr = pDC->pTileMgr;
pTileMgr->enqueue(0, 0, &work);
}
//////////////////////////////////////////////////////////////////////////
-/// @brief FE handler for SwrGetStats.
+/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
-/// @param pUserData - Pointer to user data passed back to stats callback.
-/// @todo This should go away when we switch this to use compute threading.
-void ProcessQueryStats(
- SWR_CONTEXT *pContext,
- DRAW_CONTEXT *pDC,
- uint32_t workerId,
- void *pUserData)
+/// @param pUserData - Pointer to user data passed back to the shutdown callback.
+void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
- QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
BE_WORK work;
- work.type = QUERYSTATS;
- work.pfnWork = ProcessQueryStatsBE;
- work.desc.queryStats = *pQueryStats;
+ work.type = SHUTDOWN;
+ work.pfnWork = ProcessShutdownBE;
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
- pTileMgr->enqueue(0, 0, &work);
+ MacroTileMgr* pTileMgr = pDC->pTileMgr;
+ // Enqueue at least 1 work item for each worker thread
+ // account for number of numa nodes
+ uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
+
+ for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
+ {
+ for (uint32_t n = 0; n < numNumaNodes; ++n)
+ {
+ pTileMgr->enqueue(i, n, &work);
+ }
+ }
}
//////////////////////////////////////////////////////////////////////////
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
-void ProcessClear(
- SWR_CONTEXT *pContext,
- DRAW_CONTEXT *pDC,
- uint32_t workerId,
- void *pUserData)
+void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
- CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
- const API_STATE& state = GetApiState(pDC);
+ CLEAR_DESC* pDesc = (CLEAR_DESC*)pUserData;
+ MacroTileMgr* pTileMgr = pDC->pTileMgr;
// queue a clear to each macro tile
- // compute macro tile bounds for the current scissor/viewport
- uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
- uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
- uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
- uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
+ // compute macro tile bounds for the specified rect
+ uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+ uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
BE_WORK work;
- work.type = CLEAR;
- work.pfnWork = ProcessClearBE;
- work.desc.clear = *pClear;
+ work.type = CLEAR;
+ work.pfnWork = ProcessClearBE;
+ work.desc.clear = *pDesc;
- for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
+ for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
{
- for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
+ for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
{
pTileMgr->enqueue(x, y, &work);
}
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
-void ProcessStoreTiles(
- SWR_CONTEXT *pContext,
- DRAW_CONTEXT *pDC,
- uint32_t workerId,
- void *pUserData)
+void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
- RDTSC_START(FEProcessStoreTiles);
- STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
- const API_STATE& state = GetApiState(pDC);
+ RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
+ MacroTileMgr* pTileMgr = pDC->pTileMgr;
+ STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
// queue a store to each macro tile
- // compute macro tile bounds for the current render target
- const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
- const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
-
- uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
- uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+ // compute macro tile bounds for the specified rect
+ uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+ uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
// store tiles
BE_WORK work;
- work.type = STORETILES;
- work.pfnWork = ProcessStoreTileBE;
- work.desc.storeTiles = *pStore;
+ work.type = STORETILES;
+ work.pfnWork = ProcessStoreTilesBE;
+ work.desc.storeTiles = *pDesc;
- for (uint32_t x = 0; x < numMacroTilesX; ++x)
+ for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
{
- for (uint32_t y = 0; y < numMacroTilesY; ++y)
+ for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
{
pTileMgr->enqueue(x, y, &work);
}
}
- RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
+ RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
}
//////////////////////////////////////////////////////////////////////////
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
-void ProcessDiscardInvalidateTiles(
- SWR_CONTEXT *pContext,
- DRAW_CONTEXT *pDC,
- uint32_t workerId,
- void *pUserData)
+void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
+ DRAW_CONTEXT* pDC,
+ uint32_t workerId,
+ void* pUserData)
{
- RDTSC_START(FEProcessInvalidateTiles);
- DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
- SWR_RECT rect;
-
- if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
- {
- // Valid rect
- rect = pInv->rect;
- }
- else
- {
- // Use viewport dimensions
- const API_STATE& state = GetApiState(pDC);
-
- rect.left = (uint32_t)state.vp[0].x;
- rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
- rect.top = (uint32_t)state.vp[0].y;
- rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
- }
-
- // queue a store to each macro tile
- // compute macro tile bounds for the current render target
- uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
- uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
-
- // Setup region assuming full tiles
- uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
- uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
+ RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
+ DISCARD_INVALIDATE_TILES_DESC* pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
+ MacroTileMgr* pTileMgr = pDC->pTileMgr;
- uint32_t macroTileEndX = rect.right / macroWidth;
- uint32_t macroTileEndY = rect.bottom / macroHeight;
+ // compute macro tile bounds for the specified rect
+ uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
+ uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
+ uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
+ uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;
- if (pInv->fullTilesOnly == false)
+ if (pDesc->fullTilesOnly == false)
{
// include partial tiles
- macroTileStartX = rect.left / macroWidth;
- macroTileStartY = rect.top / macroHeight;
-
- macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
- macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
+ macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
+ macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
+ macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
+ macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
}
- SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
- SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
+ SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
+ SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);
- macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
- macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
+ macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
+ macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);
// load tiles
BE_WORK work;
- work.type = DISCARDINVALIDATETILES;
- work.pfnWork = ProcessDiscardInvalidateTilesBE;
- work.desc.discardInvalidateTiles = *pInv;
+ work.type = DISCARDINVALIDATETILES;
+ work.pfnWork = ProcessDiscardInvalidateTilesBE;
+ work.desc.discardInvalidateTiles = *pDesc;
- for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
+ for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
{
- for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
+ for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
{
pTileMgr->enqueue(x, y, &work);
}
}
- RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
+ RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
}
//////////////////////////////////////////////////////////////////////////
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
-uint32_t GetNumPrims(
- PRIMITIVE_TOPOLOGY mode,
- uint32_t numPrims)
+uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
switch (mode)
{
- case TOP_POINT_LIST: return numPrims;
- case TOP_TRIANGLE_LIST: return numPrims / 3;
- case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
- case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
- case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
- case TOP_QUAD_LIST: return numPrims / 4;
- case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
- case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
- case TOP_LINE_LIST: return numPrims / 2;
- case TOP_LINE_LOOP: return numPrims;
- case TOP_RECT_LIST: return numPrims / 3;
- case TOP_LINE_LIST_ADJ: return numPrims / 4;
- case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
- case TOP_TRI_LIST_ADJ: return numPrims / 6;
- case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
+ case TOP_POINT_LIST:
+ return numPrims;
+ case TOP_TRIANGLE_LIST:
+ return numPrims / 3;
+ case TOP_TRIANGLE_STRIP:
+ return numPrims < 3 ? 0 : numPrims - 2;
+ case TOP_TRIANGLE_FAN:
+ return numPrims < 3 ? 0 : numPrims - 2;
+ case TOP_TRIANGLE_DISC:
+ return numPrims < 2 ? 0 : numPrims - 1;
+ case TOP_QUAD_LIST:
+ return numPrims / 4;
+ case TOP_QUAD_STRIP:
+ return numPrims < 4 ? 0 : (numPrims - 2) / 2;
+ case TOP_LINE_STRIP:
+ return numPrims < 2 ? 0 : numPrims - 1;
+ case TOP_LINE_LIST:
+ return numPrims / 2;
+ case TOP_LINE_LOOP:
+ return numPrims;
+ case TOP_RECT_LIST:
+ return numPrims / 3;
+ case TOP_LINE_LIST_ADJ:
+ return numPrims / 4;
+ case TOP_LISTSTRIP_ADJ:
+ return numPrims < 3 ? 0 : numPrims - 3;
+ case TOP_TRI_LIST_ADJ:
+ return numPrims / 6;
+ case TOP_TRI_STRIP_ADJ:
+ return numPrims < 4 ? 0 : (numPrims / 2) - 2;
case TOP_PATCHLIST_1:
case TOP_PATCHLIST_2:
case TOP_TRI_STRIP_REVERSE:
case TOP_PATCHLIST_BASE:
case TOP_UNKNOWN:
- SWR_ASSERT(false, "Unsupported topology: %d", mode);
+ SWR_INVALID("Unsupported topology: %d", mode);
return 0;
}
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
-uint32_t GetNumVerts(
- PRIMITIVE_TOPOLOGY mode,
- uint32_t numPrims)
+uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
switch (mode)
{
- case TOP_POINT_LIST: return numPrims;
- case TOP_TRIANGLE_LIST: return numPrims * 3;
- case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
- case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
- case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
- case TOP_QUAD_LIST: return numPrims * 4;
- case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
- case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
- case TOP_LINE_LIST: return numPrims * 2;
- case TOP_LINE_LOOP: return numPrims;
- case TOP_RECT_LIST: return numPrims * 3;
- case TOP_LINE_LIST_ADJ: return numPrims * 4;
- case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
- case TOP_TRI_LIST_ADJ: return numPrims * 6;
- case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
+ case TOP_POINT_LIST:
+ return numPrims;
+ case TOP_TRIANGLE_LIST:
+ return numPrims * 3;
+ case TOP_TRIANGLE_STRIP:
+ return numPrims ? numPrims + 2 : 0;
+ case TOP_TRIANGLE_FAN:
+ return numPrims ? numPrims + 2 : 0;
+ case TOP_TRIANGLE_DISC:
+ return numPrims ? numPrims + 1 : 0;
+ case TOP_QUAD_LIST:
+ return numPrims * 4;
+ case TOP_QUAD_STRIP:
+ return numPrims ? numPrims * 2 + 2 : 0;
+ case TOP_LINE_STRIP:
+ return numPrims ? numPrims + 1 : 0;
+ case TOP_LINE_LIST:
+ return numPrims * 2;
+ case TOP_LINE_LOOP:
+ return numPrims;
+ case TOP_RECT_LIST:
+ return numPrims * 3;
+ case TOP_LINE_LIST_ADJ:
+ return numPrims * 4;
+ case TOP_LISTSTRIP_ADJ:
+ return numPrims ? numPrims + 3 : 0;
+ case TOP_TRI_LIST_ADJ:
+ return numPrims * 6;
+ case TOP_TRI_STRIP_ADJ:
+ return numPrims ? (numPrims + 2) * 2 : 0;
case TOP_PATCHLIST_1:
case TOP_PATCHLIST_2:
case TOP_TRI_STRIP_REVERSE:
case TOP_PATCHLIST_BASE:
case TOP_UNKNOWN:
- SWR_ASSERT(false, "Unsupported topology: %d", mode);
+ SWR_INVALID("Unsupported topology: %d", mode);
return 0;
}
numVerts = topology - TOP_PATCHLIST_BASE;
break;
default:
- SWR_ASSERT(false, "Unsupported topology: %d", topology);
+ SWR_INVALID("Unsupported topology: %d", topology);
break;
}
switch (topology)
{
case TOP_LISTSTRIP_ADJ:
- case TOP_LINE_LIST_ADJ: numVerts = 4; break;
+ case TOP_LINE_LIST_ADJ:
+ numVerts = 4;
+ break;
case TOP_TRI_STRIP_ADJ:
- case TOP_TRI_LIST_ADJ: numVerts = 6; break;
- default: break;
+ case TOP_TRI_LIST_ADJ:
+ numVerts = 6;
+ break;
+ default:
+ break;
}
}
/// @param numWorkItems - Number of items being worked on by a SIMD.
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
- uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
+ uint32_t numActive =
+ (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
+ uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
+ return _simd_castps_si(_simd_vmask_ps(mask));
+}
+
+static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
+{
+ uint32_t numActive =
+ (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
- return _simd_castps_si(vMask(mask));
+ return _simd16_castps_si(_simd16_vmask_ps(mask));
}
//////////////////////////////////////////////////////////////////////////
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
- DRAW_CONTEXT* pDC,
- PA_STATE& pa,
- uint32_t workerId,
- uint32_t* pPrimData,
- uint32_t streamIndex)
+ DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
{
- RDTSC_START(FEStreamout);
-
- SWR_CONTEXT* pContext = pDC->pContext;
+ RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);
- const API_STATE& state = GetApiState(pDC);
- const SWR_STREAMOUT_STATE &soState = state.soState;
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_STREAMOUT_STATE& soState = state.soState;
uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
- // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
- uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
+ // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
+ // vertex.
+ uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
- SWR_STREAMOUT_CONTEXT soContext = { 0 };
+ SWR_STREAMOUT_CONTEXT soContext = {0};
// Setup buffer state pointers.
for (uint32_t i = 0; i < 4; ++i)
}
uint32_t numPrims = pa.NumPrims();
+
for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
{
- DWORD slot = 0;
- uint32_t soMask = soState.streamMasks[streamIndex];
+ DWORD slot = 0;
+ uint64_t soMask = soState.streamMasks[streamIndex];
// Write all entries into primitive data buffer for SOS.
- while (_BitScanForward(&slot, soMask))
+ while (_BitScanForward64(&slot, soMask))
{
- __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
- uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
+ simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
+ uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
pa.AssembleSingle(paSlot, primIndex, attrib);
// Attribute offset is relative offset from start of vertex.
// Store each vertex's attrib at appropriate locations in pPrimData buffer.
for (uint32_t v = 0; v < soVertsPerPrim; ++v)
{
- uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
+ uint32_t* pPrimDataAttrib =
+ pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
_mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
}
- soMask &= ~(1 << slot);
+
+ soMask &= ~(uint64_t(1) << slot);
}
- // Update pPrimData pointer
+ // Update pPrimData pointer
soContext.pPrimData = pPrimData;
// Call SOS
- SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
- state.pfnSoFunc[streamIndex](soContext);
+ SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
+ "Trying to execute uninitialized streamout jit function.");
+ state.pfnSoFunc[streamIndex](GetPrivateState(pDC), soContext);
}
// Update SO write offset. The driver provides memory for the update.
{
if (state.soBuffer[i].pWriteOffset)
{
- *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
+ bool nullTileAccessed = false;
+ void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite(
+ GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed);
+ *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
}
if (state.soBuffer[i].soWriteEnable)
}
}
- UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
- UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+ UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
+ UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+
+ RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
+}
+
+#if USE_SIMD16_FRONTEND
+//////////////////////////////////////////////////////////////////////////
+/// Is value an even number (a multiple of two)
+///
+template <typename T>
+INLINE static bool IsEven(T value)
+{
+ return (value & 1) == 0;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Round up value to an even number (a multiple of two)
+///
+template <typename T>
+INLINE static T RoundUpEven(T value)
+{
+ return (value + 1) & ~1;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Round down value to an even number (a multiple of two)
+///
+template <typename T>
+INLINE static T RoundDownEven(T value)
+{
+ return value & ~1;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
+///
+/// vertexCount is in terms of the source simdvertexes and must be even
+///
+/// attribCount will limit the vector copies to those attribs specified
+///
+/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
+///
+void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16,
+ const simdvertex* vertex,
+ uint32_t vertexCount,
+ uint32_t attribCount)
+{
+ SWR_ASSERT(vertex);
+ SWR_ASSERT(vertex_simd16);
+ SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
+
+ simd16vertex temp;
+
+ for (uint32_t i = 0; i < vertexCount; i += 2)
+ {
+ for (uint32_t j = 0; j < attribCount; j += 1)
+ {
+ for (uint32_t k = 0; k < 4; k += 1)
+ {
+ temp.attrib[j][k] =
+ _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
+
+ if ((i + 1) < vertexCount)
+ {
+ temp.attrib[j][k] =
+ _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
+ }
+ }
+ }
- RDTSC_STOP(FEStreamout, 1, 0);
+ for (uint32_t j = 0; j < attribCount; j += 1)
+ {
+ vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
+ }
+ }
}
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
/// the start of the SIMD. The max index represents how much work
-/// items are remaining. If there is less then a SIMD's left of work
+/// items are remaining. If there is less than a SIMD's worth of work
/// then return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
-static INLINE uint32_t GetNumInvocations(
- uint32_t curIndex,
- uint32_t maxIndex)
+static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
{
uint32_t remainder = (maxIndex - curIndex);
+#if USE_SIMD16_FRONTEND
+ return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
+#else
return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
+#endif
}
//////////////////////////////////////////////////////////////////////////
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
-void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
+void ProcessStreamIdBuffer(uint32_t stream,
+ uint8_t* pStreamIdBase,
+ uint32_t numEmittedVerts,
+ uint8_t* pCutBuffer)
{
SWR_ASSERT(stream < MAX_SO_STREAMS);
- uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
+ uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
for (uint32_t b = 0; b < numOutputBytes; ++b)
{
- uint8_t curInputByte = pStreamIdBase[2*b];
- uint8_t outByte = 0;
+ uint8_t curInputByte = pStreamIdBase[2 * b];
+ uint8_t outByte = 0;
for (uint32_t i = 0; i < 4; ++i)
{
if ((curInputByte & 0x3) != stream)
}
curInputByte >>= 2;
}
-
+
*pCutBuffer++ = outByte;
}
}
-THREAD SWR_GS_CONTEXT tlsGsContext;
+// Buffers that are allocated if GS is enabled
+struct GsBuffers
+{
+ uint8_t* pGsIn;
+ uint8_t* pGsOut[KNOB_SIMD_WIDTH];
+ uint8_t* pGsTransposed;
+ void* pStreamCutBuffer;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
+/// assembler
+/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
+/// @param numVerts - Number of vertices outputted by the GS
+/// @param numAttribs - Number of attributes per vertex
+template <typename SIMD_T, uint32_t SimdWidth>
+void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
+{
+ uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
+ uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
+
+ OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
+
+ for (uint32_t i = 0; i < SimdWidth; ++i)
+ {
+ gatherOffsets[i] = srcVertexStride * i;
+ }
+ auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);
+
+ uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+ uint32_t remainingVerts = numVerts;
+
+ for (uint32_t s = 0; s < numSimd; ++s)
+ {
+ uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
+ uint8_t* pDstBase = pDst + s * dstVertexStride;
+
+ // Compute mask to prevent src overflow
+ uint32_t mask = std::min(remainingVerts, SimdWidth);
+ mask = GenMask(mask);
+ auto vMask = SIMD_T::vmask_ps(mask);
+ auto viMask = SIMD_T::castps_si(vMask);
+
+ for (uint32_t a = 0; a < numAttribs; ++a)
+ {
+ auto attribGatherX = SIMD_T::mask_i32gather_ps(
+ SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
+ auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+ (const float*)(pSrcBase + sizeof(float)),
+ vGatherOffsets,
+ vMask);
+ auto attribGatherZ =
+ SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+ (const float*)(pSrcBase + sizeof(float) * 2),
+ vGatherOffsets,
+ vMask);
+ auto attribGatherW =
+ SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+ (const float*)(pSrcBase + sizeof(float) * 3),
+ vGatherOffsets,
+ vMask);
+
+ SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
+ SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
+ SIMD_T::maskstore_ps(
+ (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
+ SIMD_T::maskstore_ps(
+ (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
+
+ pSrcBase += sizeof(float) * 4;
+ pDstBase += sizeof(Float<SIMD_T>) * 4;
+ }
+ remainingVerts -= SimdWidth;
+ }
+}
+
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
-template <
- typename HasStreamOutT,
- typename HasRastT>
-static void GeometryShaderStage(
- DRAW_CONTEXT *pDC,
- uint32_t workerId,
- PA_STATE& pa,
- void* pGsOut,
- void* pCutBuffer,
- void* pStreamCutBuffer,
- uint32_t* pSoPrimData,
- simdscalari primID)
+template <typename HasStreamOutT, typename HasRastT>
+static void GeometryShaderStage(DRAW_CONTEXT* pDC,
+ uint32_t workerId,
+ PA_STATE& pa,
+ GsBuffers* pGsBuffers,
+ uint32_t* pSoPrimData,
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims_simd8,
+#endif
+ simdscalari const& primID)
{
- RDTSC_START(FEGeometryShader);
+ RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);
- SWR_CONTEXT* pContext = pDC->pContext;
+ void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
- const API_STATE& state = GetApiState(pDC);
+ const API_STATE& state = GetApiState(pDC);
const SWR_GS_STATE* pState = &state.gsState;
+ SWR_GS_CONTEXT gsContext;
- SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
- SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
+ static uint8_t sNullBuffer[128] = {0};
- tlsGsContext.pStream = (uint8_t*)pGsOut;
- tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
- tlsGsContext.PrimitiveID = primID;
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+ {
+ gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
+ }
+ gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
+ gsContext.PrimitiveID = primID;
- uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
- simdvector attrib[MAX_ATTRIBUTES];
+ uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
+ simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
// assemble all attributes for the input primitive
+ gsContext.inputVertStride = pState->inputVertStride;
for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
- pa.Assemble(attribSlot, attrib);
+ uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
+ uint32_t attribSlot = pState->vertexAttribOffset + slot;
+ pa.Assemble(srcAttribSlot, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
+ gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
}
}
-
+
// assemble position
pa.Assemble(VERTEX_POSITION_SLOT, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
- }
-
- const uint32_t vertexStride = sizeof(simdvertex);
- const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
- const uint32_t inputPrimStride = numSimdBatches * vertexStride;
- const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
- uint32_t cutPrimStride;
- uint32_t cutInstanceStride;
-
- if (pState->isSingleStream)
- {
- cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
- cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
- }
- else
- {
- cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
- cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
+ gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
}
// record valid prims from the frontend to avoid over binning the newly generated
// prims from the GS
- uint32_t numInputPrims = pa.NumPrims();
+#if USE_SIMD16_FRONTEND
+ uint32_t numInputPrims = numPrims_simd8;
+#else
+ uint32_t numInputPrims = pa.NumPrims();
+#endif
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
- tlsGsContext.InstanceID = instance;
- tlsGsContext.mask = GenerateMask(numInputPrims);
+ gsContext.InstanceID = instance;
+ gsContext.mask = GenerateMask(numInputPrims);
// execute the geometry shader
- state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
+ state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
+ AR_EVENT(GSStats((HANDLE)&gsContext.stats));
- tlsGsContext.pStream += instanceStride;
- tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+ {
+ gsContext.pStreams[i] += pState->allocationSize;
+ }
}
// set up new binner and state for the GS output topology
- PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+ if (HasRastT::value)
+ {
+ switch (pState->outputTopology)
+ {
+ case TOP_RECT_LIST:
+ pfnClipFunc = ClipRectangles_simd16;
+ break;
+ case TOP_TRIANGLE_STRIP:
+ pfnClipFunc = ClipTriangles_simd16;
+ break;
+ case TOP_LINE_STRIP:
+ pfnClipFunc = ClipLines_simd16;
+ break;
+ case TOP_POINT_LIST:
+ pfnClipFunc = ClipPoints_simd16;
+ break;
+ default:
+ SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+ }
+ }
+
+#else
+ PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
if (HasRastT::value)
{
switch (pState->outputTopology)
{
- case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
- case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
- case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
- default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
+ case TOP_RECT_LIST:
+ pfnClipFunc = ClipRectangles;
+ break;
+ case TOP_TRIANGLE_STRIP:
+ pfnClipFunc = ClipTriangles;
+ break;
+ case TOP_LINE_STRIP:
+ pfnClipFunc = ClipLines;
+ break;
+ case TOP_POINT_LIST:
+ pfnClipFunc = ClipPoints;
+ break;
+ default:
+ SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
}
}
+#endif
// foreach input prim:
// - setup a new PA based on the emitted verts for that prim
// - loop over the new verts, calling PA to assemble each prim
- uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
uint32_t* pPrimitiveId = (uint32_t*)&primID;
uint32_t totalPrimsGenerated = 0;
for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
{
- uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
- uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
+ uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];
+
+ // Vertex count is either emitted by shader or static
+ uint32_t vertexCount = 0;
+ if (pState->staticVertexCount)
+ {
+ vertexCount = pState->staticVertexCount;
+ }
+ else
+ {
+            // If emitted in shader, it should be stored in the first dword of the output buffer
+ vertexCount = *(uint32_t*)pInstanceBase;
+ }
+
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
- uint32_t numEmittedVerts = pVertexCount[inputPrim];
+ uint32_t numEmittedVerts = vertexCount;
if (numEmittedVerts == 0)
{
continue;
}
- uint8_t* pBase = pInstanceBase + instance * instanceStride;
- uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
-
+ uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
+ uint8_t* pCutBase =
+ pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
+ uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
+
+#if USE_SIMD16_FRONTEND
+ TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
+ pVertexBaseAOS,
+ vertexCount,
+ pState->outputVertexSize);
+#else
+ TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
+ pVertexBaseAOS,
+ vertexCount,
+ pState->outputVertexSize);
+#endif
+
uint32_t numAttribs = state.feNumAttributes;
for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
{
- bool processCutVerts = false;
-
- uint8_t* pCutBuffer = pCutBase;
+ bool processCutVerts = false;
+ uint8_t* pCutBuffer = pCutBase;
// assign default stream ID, only relevant when GS is outputting a single stream
uint32_t streamID = 0;
if (pState->isSingleStream)
{
processCutVerts = true;
- streamID = pState->singleStreamID;
- if (streamID != stream) continue;
+ streamID = pState->singleStreamID;
+ if (streamID != stream)
+ continue;
}
else
{
}
// multi-stream output, need to translate StreamID buffer to a cut buffer
- ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
- pCutBuffer = (uint8_t*)pStreamCutBuffer;
+ ProcessStreamIdBuffer(
+ stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
+ pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
processCutVerts = false;
}
- PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+#if USE_SIMD16_FRONTEND
+ PA_STATE_CUT gsPa(pDC,
+ (uint8_t*)pGsBuffers->pGsTransposed,
+ numEmittedVerts,
+ pState->outputVertexSize,
+ reinterpret_cast<simd16mask*>(pCutBuffer),
+ numEmittedVerts,
+ numAttribs,
+ pState->outputTopology,
+ processCutVerts,
+ pa.numVertsPerPrim);
+
+#else
+ PA_STATE_CUT gsPa(pDC,
+ (uint8_t*)pGsBuffers->pGsTransposed,
+ numEmittedVerts,
+ pState->outputVertexSize,
+ pCutBuffer,
+ numEmittedVerts,
+ numAttribs,
+ pState->outputTopology,
+ processCutVerts,
+ pa.numVertsPerPrim);
+#endif
while (gsPa.GetNextStreamOutput())
{
do
{
+#if USE_SIMD16_FRONTEND
+ simd16vector attrib_simd16[3];
+
+ bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
+
+#else
bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
+#endif
if (assemble)
{
totalPrimsGenerated += gsPa.NumPrims();
if (HasStreamOutT::value)
{
+#if ENABLE_AVX512_SIMD16
+ gsPa.useAlternateOffset = false;
+#endif
StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
}
if (HasRastT::value && state.soState.streamToRasterizer == stream)
{
- simdscalari vPrimId;
- // pull primitiveID from the GS output if available
- if (state.gsState.emitsPrimitiveID)
+#if USE_SIMD16_FRONTEND
+ simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
+
+                            // Gather data from the SGV if provided.
+ simd16scalari vViewportIdx = SIMD16::setzero_si();
+ simd16scalari vRtIdx = SIMD16::setzero_si();
+ SIMD16::Vec4 svgAttrib[4];
+
+ if (state.backendState.readViewportArrayIndex ||
+ state.backendState.readRenderTargetArrayIndex)
+ {
+ gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+ }
+
+ if (state.backendState.readViewportArrayIndex)
{
- simdvector primIdAttrib[3];
- gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
- vPrimId = _simd_castps_si(primIdAttrib[0].x);
+ vViewportIdx =
+ SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+ gsPa.viewportArrayActive = true;
}
- else
+ if (state.backendState.readRenderTargetArrayIndex)
{
- vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
+ vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+ gsPa.rtArrayActive = true;
}
- pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
+ {
+ // OOB VPAI indices => forced to zero.
+ vViewportIdx =
+ SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
+ simd16scalari vNumViewports =
+ SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simd16scalari vClearMask =
+ SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
+ vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
+
+ gsPa.useAlternateOffset = false;
+ pfnClipFunc(pDC,
+ gsPa,
+ workerId,
+ attrib_simd16,
+ GenMask(gsPa.NumPrims()),
+ vPrimId,
+ vViewportIdx,
+ vRtIdx);
+ }
+#else
+ simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
+
+                            // Gather data from the SGV if provided.
+ simdscalari vViewportIdx = SIMD::setzero_si();
+ simdscalari vRtIdx = SIMD::setzero_si();
+ SIMD::Vec4 svgAttrib[4];
+
+ if (state.backendState.readViewportArrayIndex ||
+ state.backendState.readRenderTargetArrayIndex)
+ {
+ gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+ }
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ vViewportIdx =
+ SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+ // OOB VPAI indices => forced to zero.
+ vViewportIdx =
+ SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
+ simdscalari vNumViewports =
+ SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simdscalari vClearMask =
+ SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+ vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
+ gsPa.viewportArrayActive = true;
+ }
+ if (state.backendState.readRenderTargetArrayIndex)
+ {
+ vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+ gsPa.rtArrayActive = true;
+ }
+
+ pfnClipFunc(pDC,
+ gsPa,
+ workerId,
+ attrib,
+ GenMask(gsPa.NumPrims()),
+ vPrimId,
+ vViewportIdx,
+ vRtIdx);
+#endif
}
}
} while (gsPa.NextPrim());
}
// update GS pipeline stats
- UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
- UPDATE_STAT(GsPrimitives, totalPrimsGenerated);
-
- RDTSC_STOP(FEGeometryShader, 1, 0);
+ UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
+ UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
+ AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
+ RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
-static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
- void **ppStreamCutBuffer)
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC,
+ const API_STATE& state,
+ uint32_t vertsPerPrim,
+ GsBuffers* pGsBuffers)
{
auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
SWR_ASSERT(state.gsState.gsEnable);
- // allocate arena space to hold GS output verts
- // @todo pack attribs
- // @todo support multiple streams
- const uint32_t vertexStride = sizeof(simdvertex);
- const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
- uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
- *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
-
- const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
- const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
- const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
- const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
-
- // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
- // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
-
- // allocate space for temporary per-stream cut buffer if multi-stream is enabled
+
+ const SWR_GS_STATE& gsState = state.gsState;
+
+ // Allocate storage for vertex inputs
+ uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
+ pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
+
+ // Allocate arena space to hold GS output verts
+ const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
+
+ for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
+ {
+ pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
+ }
+
+ // Allocate storage for transposed GS output
+ uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
+ uint32_t transposedBufferSize =
+ numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
+ pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
+
+ // Allocate storage to hold temporary stream->cut buffer, if necessary
if (state.gsState.isSingleStream)
{
- *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
- *ppStreamCutBuffer = nullptr;
+ pGsBuffers->pStreamCutBuffer = nullptr;
}
else
{
- *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
- *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
+ pGsBuffers->pStreamCutBuffer =
+ (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
}
-
}
//////////////////////////////////////////////////////////////////////////
struct TessellationThreadLocalData
{
SWR_HS_CONTEXT hsContext;
- ScalarPatch patchData[KNOB_SIMD_WIDTH];
- void* pTxCtx;
- size_t tsCtxSize;
+ void* pTxCtx;
+ size_t tsCtxSize;
+
+ uint8_t* pHSOutput;
+ size_t hsOutputAllocSize;
simdscalar* pDSOutput;
- size_t numDSOutputVectors;
+ size_t dsOutputAllocSize;
};
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
/// @TODO - Don't use thread local storage. Use Worker local storage instead.
if (gt_pTessellationThreadData == nullptr)
{
- gt_pTessellationThreadData = (TessellationThreadLocalData*)
- AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
+ gt_pTessellationThreadData =
+ (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
}
}
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
-template <
- typename HasGeometryShaderT,
- typename HasStreamOutT,
- typename HasRastT>
-static void TessellationStages(
- DRAW_CONTEXT *pDC,
- uint32_t workerId,
- PA_STATE& pa,
- void* pGsOut,
- void* pCutBuffer,
- void* pCutStreamBuffer,
- uint32_t* pSoPrimData,
- simdscalari primID)
+template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
+static void TessellationStages(DRAW_CONTEXT* pDC,
+ uint32_t workerId,
+ PA_STATE& pa,
+ GsBuffers* pGsBuffers,
+ uint32_t* pSoPrimData,
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims_simd8,
+#endif
+ simdscalari const& primID)
{
- const API_STATE& state = GetApiState(pDC);
+ const API_STATE& state = GetApiState(pDC);
const SWR_TS_STATE& tsState = state.tsState;
- SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro
+ void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
SWR_ASSERT(gt_pTessellationThreadData);
- HANDLE tsCtx = TSInitCtx(
- tsState.domain,
- tsState.partitioning,
- tsState.tsOutputTopology,
- gt_pTessellationThreadData->pTxCtx,
- gt_pTessellationThreadData->tsCtxSize);
+ HANDLE tsCtx = TSInitCtx(tsState.domain,
+ tsState.partitioning,
+ tsState.tsOutputTopology,
+ gt_pTessellationThreadData->pTxCtx,
+ gt_pTessellationThreadData->tsCtxSize);
if (tsCtx == nullptr)
{
- gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
- tsCtx = TSInitCtx(
- tsState.domain,
- tsState.partitioning,
- tsState.tsOutputTopology,
- gt_pTessellationThreadData->pTxCtx,
- gt_pTessellationThreadData->tsCtxSize);
+ gt_pTessellationThreadData->pTxCtx =
+ AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
+ tsCtx = TSInitCtx(tsState.domain,
+ tsState.partitioning,
+ tsState.tsOutputTopology,
+ gt_pTessellationThreadData->pTxCtx,
+ gt_pTessellationThreadData->tsCtxSize);
}
SWR_ASSERT(tsCtx);
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+ if (HasRastT::value)
+ {
+ switch (tsState.postDSTopology)
+ {
+ case TOP_TRIANGLE_LIST:
+ pfnClipFunc = ClipTriangles_simd16;
+ break;
+ case TOP_LINE_LIST:
+ pfnClipFunc = ClipLines_simd16;
+ break;
+ case TOP_POINT_LIST:
+ pfnClipFunc = ClipPoints_simd16;
+ break;
+ default:
+ SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+ }
+ }
+
+#else
PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
if (HasRastT::value)
{
switch (tsState.postDSTopology)
{
- case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
- case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
- case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
- default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
+ case TOP_TRIANGLE_LIST:
+ pfnClipFunc = ClipTriangles;
+ break;
+ case TOP_LINE_LIST:
+ pfnClipFunc = ClipLines;
+ break;
+ case TOP_POINT_LIST:
+ pfnClipFunc = ClipPoints;
+ break;
+ default:
+ SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
}
}
- SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
- hsContext.pCPout = gt_pTessellationThreadData->patchData;
- hsContext.PrimitiveID = primID;
+#endif
+ SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
+ hsContext.PrimitiveID = primID;
+ hsContext.outputSize = tsState.hsAllocationSize;
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
// Max storage for one attribute for an entire simdprimitive
// assemble all attributes for the input primitives
for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+ uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
pa.Assemble(attribSlot, simdattrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
+ hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
}
}
+ // Allocate HS output storage
+ uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;
+
+ if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
+ {
+ AlignedFree(gt_pTessellationThreadData->pHSOutput);
+ gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
+ gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
+ }
+
+ hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;
+
#if defined(_DEBUG)
- memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
+ //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims = numPrims_simd8;
+#else
uint32_t numPrims = pa.NumPrims();
+#endif
hsContext.mask = GenerateMask(numPrims);
// Run the HS
- RDTSC_START(FEHullShader);
- state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
- RDTSC_STOP(FEHullShader, 0, 0);
+ RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
+ state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
+ RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);
- UPDATE_STAT(HsInvocations, numPrims);
+ UPDATE_STAT_FE(HsInvocations, numPrims);
+ AR_EVENT(HSStats((HANDLE)&hsContext.stats));
const uint32_t* pPrimId = (const uint32_t*)&primID;
for (uint32_t p = 0; p < numPrims; ++p)
{
+ ScalarPatch* pCPout = (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);
+
+ SWR_TESSELLATION_FACTORS tessFactors;
+ tessFactors = hsContext.pCPout[p].tessFactors;
+
// Run Tessellator
- SWR_TS_TESSELLATED_DATA tsData = { 0 };
- RDTSC_START(FETessellation);
- TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
- RDTSC_STOP(FETessellation, 0, 0);
+ SWR_TS_TESSELLATED_DATA tsData = {0};
+ RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
+ TSTessellate(tsCtx, tessFactors, tsData);
+ AR_EVENT(TessPrimCount(1));
+ RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);
if (tsData.NumPrimitives == 0)
{
SWR_ASSERT(tsData.NumDomainPoints);
// Allocate DS Output memory
- uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
- size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
- size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
- if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
+ uint32_t requiredDSVectorInvocations =
+ AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
+#if USE_SIMD16_FRONTEND
+ size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
+ tsState.dsAllocationSize; // simd8 -> simd16, padding
+#else
+ size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
+ size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
+#endif
+ if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
{
AlignedFree(gt_pTessellationThreadData->pDSOutput);
- gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
- gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
+ gt_pTessellationThreadData->pDSOutput =
+ (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
+ gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
}
SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
- SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
+ SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);
#if defined(_DEBUG)
memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
// Run Domain Shader
SWR_DS_CONTEXT dsContext;
- dsContext.PrimitiveID = pPrimId[p];
- dsContext.pCpIn = &hsContext.pCPout[p];
- dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
- dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
- dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
- dsContext.vectorStride = requiredDSVectorInvocations;
+ dsContext.PrimitiveID = pPrimId[p];
+ dsContext.pCpIn = pCPout;
+ dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
+ dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
+ dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+ dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
+#if USE_SIMD16_FRONTEND
+ dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
+#else
+ dsContext.vectorStride = requiredDSVectorInvocations;
+#endif
uint32_t dsInvocations = 0;
- for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
+ for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
+ ++dsContext.vectorOffset)
{
dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
- RDTSC_START(FEDomainShader);
- state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
- RDTSC_STOP(FEDomainShader, 0, 0);
+ RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
+ state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
+ RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);
+
+ AR_EVENT(DSStats((HANDLE)&dsContext.stats));
dsInvocations += KNOB_SIMD_WIDTH;
}
- UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
+ UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
+#if USE_SIMD16_FRONTEND
+ SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
+
+#endif
PA_TESS tessPa(
pDC,
+#if USE_SIMD16_FRONTEND
+ reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
+ dsContext.vectorStride / 2, // simd8 -> simd16
+#else
dsContext.pOutputData,
dsContext.vectorStride,
- tsState.numDsOutputAttribs,
+#endif
+ SWR_VTX_NUM_SLOTS,
+ tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
tsData.ppIndices,
tsData.NumPrimitives,
- tsState.postDSTopology);
+ tsState.postDSTopology,
+ NumVertsPerPrim(tsState.postDSTopology, false));
while (tessPa.HasWork())
{
+#if USE_SIMD16_FRONTEND
+ const uint32_t numPrims = tessPa.NumPrims();
+ const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+ const uint32_t numPrims_hi =
+ std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+ const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
+ const simdscalari primID_lo = _simd16_extract_si(primID, 0);
+ const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+
+#endif
if (HasGeometryShaderT::value)
{
+#if USE_SIMD16_FRONTEND
+ tessPa.useAlternateOffset = false;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(
+ pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
+
+ if (numPrims_hi)
+ {
+ tessPa.useAlternateOffset = true;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(
+ pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
+ }
+#else
GeometryShaderStage<HasStreamOutT, HasRastT>(
- pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
+ pDC,
+ workerId,
+ tessPa,
+ pGsBuffers,
+ pSoPrimData,
_simd_set1_epi32(dsContext.PrimitiveID));
+#endif
}
else
{
if (HasStreamOutT::value)
{
+#if ENABLE_AVX512_SIMD16
+ tessPa.useAlternateOffset = false;
+#endif
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
}
if (HasRastT::value)
{
+#if USE_SIMD16_FRONTEND
+ simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
+#else
simdvector prim[3]; // Only deal with triangles, lines, or points
- RDTSC_START(FEPAAssemble);
-#if SWR_ENABLE_ASSERTS
- bool assemble =
#endif
+ RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
+ bool assemble =
+#if USE_SIMD16_FRONTEND
+ tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
+#else
tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
- RDTSC_STOP(FEPAAssemble, 1, 0);
+#endif
+ RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
SWR_ASSERT(assemble);
SWR_ASSERT(pfnClipFunc);
- pfnClipFunc(pDC, tessPa, workerId, prim,
- GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
- }
- }
+#if USE_SIMD16_FRONTEND
+                        // Gather data from the SGV if provided.
+ simd16scalari vViewportIdx = SIMD16::setzero_si();
+ simd16scalari vRtIdx = SIMD16::setzero_si();
+ SIMD16::Vec4 svgAttrib[4];
+
+ if (state.backendState.readViewportArrayIndex ||
+ state.backendState.readRenderTargetArrayIndex)
+ {
+ tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+ }
- tessPa.NextPrim();
+ if (state.backendState.readViewportArrayIndex)
+ {
+ vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+ tessPa.viewportArrayActive = true;
+ }
+ if (state.backendState.readRenderTargetArrayIndex)
+ {
+ vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+ tessPa.rtArrayActive = true;
+ }
- } // while (tessPa.HasWork())
- } // for (uint32_t p = 0; p < numPrims; ++p)
- TSDestroyCtx(tsCtx);
+ {
+ // OOB VPAI indices => forced to zero.
+ vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
+ simd16scalari vNumViewports =
+ SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
+ vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
+
+ tessPa.useAlternateOffset = false;
+ pfnClipFunc(pDC,
+ tessPa,
+ workerId,
+ prim_simd16,
+ GenMask(numPrims),
+ primID,
+ vViewportIdx,
+ vRtIdx);
+ }
+#else
+ // Gather data from the SGV if provided.
+ simdscalari vViewportIdx = SIMD::setzero_si();
+ simdscalari vRtIdx = SIMD::setzero_si();
+ SIMD::Vec4 svgAttrib[4];
+
+ if (state.backendState.readViewportArrayIndex ||
+ state.backendState.readRenderTargetArrayIndex)
+ {
+ tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+ }
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+ // OOB VPAI indices => forced to zero.
+ vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
+ simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+ vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
+ tessPa.viewportArrayActive = true;
+ }
+ if (state.backendState.readRenderTargetArrayIndex)
+ {
+ vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+ tessPa.rtArrayActive = true;
+ }
+ pfnClipFunc(pDC,
+ tessPa,
+ workerId,
+ prim,
+ GenMask(tessPa.NumPrims()),
+ _simd_set1_epi32(dsContext.PrimitiveID),
+ vViewportIdx,
+ vRtIdx);
+#endif
+ }
+ }
+
+ tessPa.NextPrim();
+
+ } // while (tessPa.HasWork())
+ } // for (uint32_t p = 0; p < numPrims; ++p)
+
+#if USE_SIMD16_FRONTEND
+ if (gt_pTessellationThreadData->pDSOutput != nullptr)
+ {
+ AlignedFree(gt_pTessellationThreadData->pDSOutput);
+ gt_pTessellationThreadData->pDSOutput = nullptr;
+ }
+ gt_pTessellationThreadData->dsOutputAllocSize = 0;
+
+#endif
+ TSDestroyCtx(tsCtx);
}
+THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
+THREAD uint32_t gVertexStoreSize = 0;
+
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
-template <
- typename IsIndexedT,
- typename IsCutIndexEnabledT,
- typename HasTessellationT,
- typename HasGeometryShaderT,
- typename HasStreamOutT,
- typename HasRastT>
-void ProcessDraw(
- SWR_CONTEXT *pContext,
- DRAW_CONTEXT *pDC,
- uint32_t workerId,
- void *pUserData)
+template <typename IsIndexedT,
+ typename IsCutIndexEnabledT,
+ typename HasTessellationT,
+ typename HasGeometryShaderT,
+ typename HasStreamOutT,
+ typename HasRastT>
+void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
-
#if KNOB_ENABLE_TOSS_POINTS
if (KNOB_TOSS_QUEUE_FE)
{
}
#endif
- RDTSC_START(FEProcessDraw);
+ RDTSC_BEGIN(pContext->pBucketMgr, FEProcessDraw, pDC->drawId);
- DRAW_WORK& work = *(DRAW_WORK*)pUserData;
- const API_STATE& state = GetApiState(pDC);
- __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- SWR_VS_CONTEXT vsContext;
- simdvertex vin;
+ void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
- int indexSize = 0;
- uint32_t endVertex = work.numVerts;
+ DRAW_WORK& work = *(DRAW_WORK*)pUserData;
+ const API_STATE& state = GetApiState(pDC);
+
+ uint32_t indexSize = 0;
+ uint32_t endVertex = work.numVerts;
- const int32_t* pLastRequestedIndex = nullptr;
+ gfxptr_t xpLastRequestedIndex = 0;
if (IsIndexedT::value)
{
switch (work.type)
{
case R32_UINT:
indexSize = sizeof(uint32_t);
- pLastRequestedIndex = &(work.pIB[endVertex]);
break;
case R16_UINT:
indexSize = sizeof(uint16_t);
- // nasty address offset to last index
- pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
break;
case R8_UINT:
indexSize = sizeof(uint8_t);
- // nasty address offset to last index
- pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
break;
default:
- SWR_ASSERT(0);
+ SWR_INVALID("Invalid work.type: %d", work.type);
}
+ xpLastRequestedIndex = work.xpIB + endVertex * indexSize;
}
else
{
endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
}
- SWR_FETCH_CONTEXT fetchInfo = { 0 };
- fetchInfo.pStreams = &state.vertexBuffers[0];
- fetchInfo.StartInstance = work.startInstance;
- fetchInfo.StartVertex = 0;
-
- vsContext.pVin = &vin;
-
- if (IsIndexedT::value)
- {
- fetchInfo.BaseVertex = work.baseVertex;
-
- // if the entire index buffer isn't being consumed, set the last index
- // so that fetches < a SIMD wide will be masked off
- fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
- if (pLastRequestedIndex < fetchInfo.pLastIndex)
- {
- fetchInfo.pLastIndex = pLastRequestedIndex;
- }
- }
- else
- {
- fetchInfo.StartVertex = work.startVertex;
- }
-
-#ifdef KNOB_ENABLE_RDTSC
+#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif
- void* pGsOut = nullptr;
- void* pCutBuffer = nullptr;
- void* pStreamCutBuffer = nullptr;
+ GsBuffers gsBuffers;
if (HasGeometryShaderT::value)
{
- AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#if USE_SIMD16_FRONTEND
+ AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
+ pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
+#else
+ AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
+ pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
+#endif
}
 if (HasTessellationT::value)
 {
 SWR_ASSERT(state.tsState.tsEnable == true);
 }
 // NOTE(review): context lines were elided here in this chunk; pSoPrimData
 // declaration restored so its later uses (StreamOut/GS/tess stages) resolve.
 uint32_t* pSoPrimData = nullptr;
 if (HasStreamOutT::value)
 {
 pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
 }
+ const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
+#if USE_SIMD16_FRONTEND
+ uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
+#else
+ uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
+#endif
+
+ SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
+
+ // Compute storage requirements for vertex store
+ // TODO: allocation needs to be rethought for better cut support
+ uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
+ uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
+
+ // grow the vertex store for the PA as necessary
+ if (gVertexStoreSize < vertexStoreSize)
+ {
+ if (gpVertexStore != nullptr)
+ {
+ AlignedFree(gpVertexStore);
+ gpVertexStore = nullptr;
+ }
+
+ SWR_ASSERT(gpVertexStore == nullptr);
+
+ gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
+ gVertexStoreSize = vertexStoreSize;
+
+ SWR_ASSERT(gpVertexStore != nullptr);
+ }
+
// choose primitive assembler
- PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
- PA_STATE& pa = paFactory.GetPA();
- /// @todo: temporarily move instance loop in the FE to ensure SO ordering
+ PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
+ state.topology,
+ work.numVerts,
+ gpVertexStore,
+ numVerts,
+ state.frontendState.vsVertexSize,
+ GetNumVerts(state.topology, 1));
+ PA_STATE& pa = paFactory.GetPA();
+
+#if USE_SIMD16_FRONTEND
+#if USE_SIMD16_SHADERS
+ simd16vertex vin;
+#else
+ simdvertex vin_lo;
+ simdvertex vin_hi;
+#endif
+ SWR_VS_CONTEXT vsContext_lo;
+ SWR_VS_CONTEXT vsContext_hi;
+
+#if USE_SIMD16_SHADERS
+ vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
+ vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
+#else
+ vsContext_lo.pVin = &vin_lo;
+ vsContext_hi.pVin = &vin_hi;
+#endif
+ vsContext_lo.AlternateOffset = 0;
+ vsContext_hi.AlternateOffset = 1;
+
+ SWR_FETCH_CONTEXT fetchInfo_lo = {0};
+
+ fetchInfo_lo.pStreams = &state.vertexBuffers[0];
+ fetchInfo_lo.StartInstance = work.startInstance;
+ fetchInfo_lo.StartVertex = 0;
+
+ if (IsIndexedT::value)
+ {
+ fetchInfo_lo.BaseVertex = work.baseVertex;
+
+ // if the entire index buffer isn't being consumed, set the last index
+ // so that fetches < a SIMD wide will be masked off
+ fetchInfo_lo.xpLastIndex = state.indexBuffer.xpIndices + state.indexBuffer.size;
+ if (xpLastRequestedIndex < fetchInfo_lo.xpLastIndex)
+ {
+ fetchInfo_lo.xpLastIndex = xpLastRequestedIndex;
+ }
+ }
+ else
+ {
+ fetchInfo_lo.StartVertex = work.startVertex;
+ }
+
+ SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
+
+ const simd16scalari vScale =
+ _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+
for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
{
- simdscalari vIndex;
- uint32_t i = 0;
+ uint32_t i = 0;
+
+ simd16scalari vIndex;
if (IsIndexedT::value)
{
- fetchInfo.pIndices = work.pIB;
+ fetchInfo_lo.xpIndices = work.xpIB;
+ fetchInfo_hi.xpIndices =
+ fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
}
else
{
- vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
- fetchInfo.pIndices = (const int32_t*)&vIndex;
+ vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
+
+ fetchInfo_lo.xpIndices = pDC->pContext->pfnMakeGfxPtr(GetPrivateState(pDC), &vIndex);
+ fetchInfo_hi.xpIndices = pDC->pContext->pfnMakeGfxPtr(
+ GetPrivateState(pDC),
+ reinterpret_cast<uint8_t*>(&vIndex) +
+ KNOB_SIMD_WIDTH * sizeof(int32_t)); // byte offset to upper half (1/2 of KNOB_SIMD16_WIDTH);
+ // note: without the byte cast, +N scales by sizeof(simd16scalari) and lands far out of bounds
}
- fetchInfo.CurInstance = instanceNum;
- vsContext.InstanceID = instanceNum;
+ fetchInfo_lo.CurInstance = instanceNum;
+ fetchInfo_hi.CurInstance = instanceNum;
+
+ vsContext_lo.InstanceID = instanceNum;
+ vsContext_hi.InstanceID = instanceNum;
while (pa.HasWork())
{
- // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
- // So we need to keep this outside of (i < endVertex) check.
- simdmask* pvCutIndices = nullptr;
+ // GetNextVsOutput currently has the side effect of updating some PA state machine
+ // state. So we need to keep this outside of (i < endVertex) check.
+
+ simdmask* pvCutIndices_lo = nullptr;
+ simdmask* pvCutIndices_hi = nullptr;
+
if (IsIndexedT::value)
{
- pvCutIndices = &pa.GetNextVsIndices();
+ // simd16mask <=> simdmask[2]
+
+ pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
+ pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
}
- simdvertex& vout = pa.GetNextVsOutput();
- vsContext.pVout = &vout;
+ simd16vertex& vout = pa.GetNextVsOutput();
+
+ vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
+ vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
if (i < endVertex)
{
-
+ if (!IsIndexedT::value)
+ {
+ fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
+ uint32_t offset;
+ offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
+ offset *= 4; // convert from index to address
+#if USE_SIMD16_SHADERS
+ fetchInfo_lo.xpLastIndex += offset;
+#else
+ fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)KNOB_SIMD_WIDTH);
+ uint32_t offset2 =
+ std::min(offset, (uint32_t)KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH;
+ assert(offset >= 0);
+ fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
+ fetchInfo_hi.xpLastIndex += offset2;
+#endif
+ }
// 1. Execute FS/VS for a single SIMD.
- RDTSC_START(FEFetchShader);
- state.pfnFetchFunc(fetchInfo, vin);
- RDTSC_STOP(FEFetchShader, 0, 0);
+ RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
+#if USE_SIMD16_SHADERS
+ state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
+#else
+ state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
+
+ if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
+ {
+ state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
+ }
+#endif
+ RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
// forward fetch generated vertex IDs to the vertex shader
- vsContext.VertexID = fetchInfo.VertexID;
+#if USE_SIMD16_SHADERS
+#if USE_SIMD16_VS
+ vsContext_lo.VertexID16 =
+ _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
+ vsContext_lo.VertexID16 =
+ _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
+#else
+ vsContext_lo.VertexID = fetchInfo_lo.VertexID;
+ vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
+#endif
+#else
+ vsContext_lo.VertexID = fetchInfo_lo.VertexID;
+ vsContext_hi.VertexID = fetchInfo_hi.VertexID;
+#endif
// Setup active mask for vertex shader.
- vsContext.mask = GenerateMask(endVertex - i);
+#if USE_SIMD16_VS
+ vsContext_lo.mask16 = GenerateMask16(endVertex - i);
+#else
+ vsContext_lo.mask = GenerateMask(endVertex - i);
+ vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
+#endif
// forward cut mask to the PA
if (IsIndexedT::value)
{
- *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
+#if USE_SIMD16_SHADERS
+ *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
+ *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
+#else
+ *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
+ *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
+#endif
}
- UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
+ UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
#if KNOB_ENABLE_TOSS_POINTS
if (!KNOB_TOSS_FETCH)
#endif
{
- RDTSC_START(FEVertexShader);
- state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
- RDTSC_STOP(FEVertexShader, 0, 0);
+ RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
+#if USE_SIMD16_VS
+ state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
+ AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
+#else
+ state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
+ AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
+
+ if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
+ {
+ state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
+ AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
+ }
+#endif
+ RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
- UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
+ UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
}
}
// 2. Assemble primitives given the last two SIMD.
do
{
- simdvector prim[MAX_NUM_VERTS_PER_PRIM];
- // PaAssemble returns false if there is not enough verts to assemble.
- RDTSC_START(FEPAAssemble);
- bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
- RDTSC_STOP(FEPAAssemble, 1, 0);
+ simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
+
+ RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
+ bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
+ RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);
#if KNOB_ENABLE_TOSS_POINTS
if (!KNOB_TOSS_FETCH)
{
if (assemble)
{
- UPDATE_STAT(IaPrimitives, pa.NumPrims());
+ UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
+
+ const uint32_t numPrims = pa.NumPrims();
+ const uint32_t numPrims_lo =
+ std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+ const uint32_t numPrims_hi =
+ std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+ const simd16scalari primID = pa.GetPrimID(work.startPrimID);
+ const simdscalari primID_lo = _simd16_extract_si(primID, 0);
+ const simdscalari primID_hi = _simd16_extract_si(primID, 1);
if (HasTessellationT::value)
{
+ pa.useAlternateOffset = false;
TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
- pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ pDC,
+ workerId,
+ pa,
+ &gsBuffers,
+ pSoPrimData,
+ numPrims_lo,
+ primID_lo);
+
+ if (numPrims_hi)
+ {
+ pa.useAlternateOffset = true;
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+ pDC,
+ workerId,
+ pa,
+ &gsBuffers,
+ pSoPrimData,
+ numPrims_hi,
+ primID_hi);
+ }
}
else if (HasGeometryShaderT::value)
{
- GeometryShaderStage<HasStreamOutT, HasRastT>(
- pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
+ pa.useAlternateOffset = false;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
+ workerId,
+ pa,
+ &gsBuffers,
+ pSoPrimData,
+ numPrims_lo,
+ primID_lo);
+
+ if (numPrims_hi)
+ {
+ pa.useAlternateOffset = true;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
+ workerId,
+ pa,
+ &gsBuffers,
+ pSoPrimData,
+ numPrims_hi,
+ primID_hi);
+ }
}
else
{
// If streamout is enabled then stream vertices out to memory.
if (HasStreamOutT::value)
{
+ pa.useAlternateOffset = false;
StreamOut(pDC, pa, workerId, pSoPrimData, 0);
}
if (HasRastT::value)
{
- SWR_ASSERT(pDC->pState->pfnProcessPrims);
- pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
- GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
+ SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
+ // Gather data from the SVG if provided.
+ simd16scalari vpai = SIMD16::setzero_si();
+ simd16scalari rtai = SIMD16::setzero_si();
+ SIMD16::Vec4 svgAttrib[4];
+
+ if (state.backendState.readViewportArrayIndex ||
+ state.backendState.readRenderTargetArrayIndex)
+ {
+ pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+ }
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+ pa.viewportArrayActive = true;
+ }
+ if (state.backendState.readRenderTargetArrayIndex)
+ {
+ rtai =
+ SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+ pa.rtArrayActive = true;
+ }
+
+ {
+ // OOB VPAI indices => forced to zero.
+ vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
+ simd16scalari vNumViewports =
+ SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simd16scalari vClearMask =
+ SIMD16::cmplt_epi32(vpai, vNumViewports);
+ vpai = SIMD16::and_si(vClearMask, vpai);
+
+ pa.useAlternateOffset = false;
+ pDC->pState->pfnProcessPrims_simd16(pDC,
+ pa,
+ workerId,
+ prim_simd16,
+ GenMask(numPrims),
+ primID,
+ vpai,
+ rtai);
+ }
}
}
}
}
} while (pa.NextPrim());
- i += KNOB_SIMD_WIDTH;
if (IsIndexedT::value)
{
- fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+ fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
+ fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
}
else
{
- vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
+ vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
}
+
+ i += KNOB_SIMD16_WIDTH;
}
+
pa.Reset();
}
- RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
-}
+#else
+ SWR_VS_CONTEXT vsContext;
+ SWR_FETCH_CONTEXT fetchInfo = {0};
-struct FEDrawChooser
-{
- typedef PFN_FE_WORK_FUNC FuncType;
+ fetchInfo.pStreams = &state.vertexBuffers[0];
+ fetchInfo.StartInstance = work.startInstance;
+ fetchInfo.StartVertex = 0;
- template <typename... ArgsB>
- static FuncType GetFunc()
+ if (IsIndexedT::value)
{
- return ProcessDraw<ArgsB...>;
- }
-};
+ fetchInfo.BaseVertex = work.baseVertex;
+ // if the entire index buffer isn't being consumed, set the last index
+ // so that fetches < a SIMD wide will be masked off
+ fetchInfo.pLastIndex =
+ (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+ // NOTE(review): xpLastRequestedIndex is a gfxptr_t (integer address) while
+ // pLastIndex is a host pointer — casts added so this compiles; confirm the
+ // two refer to the same address space in the non-SIMD16 front end.
+ if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
+ {
+ fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
+ }
+ }
+ else
+ {
+ fetchInfo.StartVertex = work.startVertex;
+ }
-// Selector for correct templated Draw front-end function
-PFN_FE_WORK_FUNC GetProcessDrawFunc(
- bool IsIndexed,
- bool IsCutIndexEnabled,
- bool HasTessellation,
- bool HasGeometryShader,
- bool HasStreamOut,
- bool HasRasterization)
-{
- return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
-}
+ const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
-//////////////////////////////////////////////////////////////////////////
-/// @brief Processes attributes for the backend based on linkage mask and
-/// linkage map. Essentially just doing an SOA->AOS conversion and pack.
-/// @param pDC - Draw context
-/// @param pa - Primitive Assembly state
-/// @param linkageMask - Specifies which VS outputs are routed to PS.
-/// @param pLinkageMap - maps VS attribute slot to PS slot
-/// @param triIndex - Triangle to process attributes for
-/// @param pBuffer - Output result
-template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
-INLINE void ProcessAttributes(
- DRAW_CONTEXT *pDC,
- PA_STATE&pa,
- uint32_t triIndex,
- uint32_t primId,
- float *pBuffer)
-{
- static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
- const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
- // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
- LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
- const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
- const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
-
- static const float constTable[3][4] = {
- {0.0f, 0.0f, 0.0f, 0.0f},
- {0.0f, 0.0f, 0.0f, 1.0f},
- {1.0f, 1.0f, 1.0f, 1.0f}
- };
-
- for (uint32_t i = 0; i < backendState.numAttributes; ++i)
+ /// @todo: temporarily move instance loop in the FE to ensure SO ordering
+ for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
{
- uint32_t inputSlot;
- if (IsSwizzledT::value)
- {
- SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
- inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
+ simdscalari vIndex;
+ uint32_t i = 0;
+ if (IsIndexedT::value)
+ {
+ fetchInfo.pIndices = work.pIB;
}
else
{
- inputSlot = VERTEX_ATTRIB_START_SLOT + i;
+ vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
+ fetchInfo.pIndices = (const int32_t*)&vIndex;
}
- __m128 attrib[3]; // triangle attribs (always 4 wide)
- float* pAttribStart = pBuffer;
+ fetchInfo.CurInstance = instanceNum;
+ vsContext.InstanceID = instanceNum;
- if (HasConstantInterpT::value || IsDegenerate::value)
+ while (pa.HasWork())
{
- if (_bittest(&constantInterpMask, i))
+ // GetNextVsOutput currently has the side effect of updating some PA state machine
+ // state. So we need to keep this outside of (i < endVertex) check.
+ simdmask* pvCutIndices = nullptr;
+ if (IsIndexedT::value)
{
- uint32_t vid;
- uint32_t adjustedTriIndex;
- static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
- static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
- static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
- static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
- static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
-
- switch (topo) {
- case TOP_QUAD_LIST:
- adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
- vid = quadProvokingVertex[triIndex & 1][provokingVertex];
- break;
- case TOP_QUAD_STRIP:
- adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
- vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
- break;
- case TOP_TRIANGLE_STRIP:
- adjustedTriIndex = triIndex;
- vid = (triIndex & 1)
- ? tristripProvokingVertex[provokingVertex]
- : provokingVertex;
- break;
- default:
- adjustedTriIndex = triIndex;
- vid = provokingVertex;
- break;
- }
+ pvCutIndices = &pa.GetNextVsIndices();
+ }
+
+ simdvertex& vout = pa.GetNextVsOutput();
+ vsContext.pVin = &vout;
+ vsContext.pVout = &vout;
+
+ if (i < endVertex)
+ {
+ // 1. Execute FS/VS for a single SIMD.
+ RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
+ state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
+ RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);
+
+ // forward fetch generated vertex IDs to the vertex shader
+ vsContext.VertexID = fetchInfo.VertexID;
- pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
+ // Setup active mask for vertex shader.
+ vsContext.mask = GenerateMask(endVertex - i);
- for (uint32_t i = 0; i < NumVertsT::value; ++i)
+ // forward cut mask to the PA
+ if (IsIndexedT::value)
{
- _mm_store_ps(pBuffer, attrib[vid]);
- pBuffer += 4;
+ *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
}
- }
- else
- {
- pa.AssembleSingle(inputSlot, triIndex, attrib);
- for (uint32_t i = 0; i < NumVertsT::value; ++i)
+ UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_FETCH)
+#endif
{
- _mm_store_ps(pBuffer, attrib[i]);
- pBuffer += 4;
+ RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
+ state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
+ RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);
+
+ UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
+ AR_EVENT(VSStats((HANDLE)&vsContext.stats));
}
}
- }
- else
- {
- pa.AssembleSingle(inputSlot, triIndex, attrib);
- for (uint32_t i = 0; i < NumVertsT::value; ++i)
+ // 2. Assemble primitives given the last two SIMD.
+ do
{
- _mm_store_ps(pBuffer, attrib[i]);
- pBuffer += 4;
- }
- }
-
- // pad out the attrib buffer to 3 verts to ensure the triangle
- // interpolation code in the pixel shader works correctly for the
- // 3 topologies - point, line, tri. This effectively zeros out the
- // effect of the missing vertices in the triangle interpolation.
- for (uint32_t v = NumVertsT::value; v < 3; ++v)
- {
- _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
- pBuffer += 4;
- }
+ simdvector prim[MAX_NUM_VERTS_PER_PRIM];
+ // PaAssemble returns false if there is not enough verts to assemble.
+ RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
+ bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
+ RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);
- // check for constant source overrides
- if (IsSwizzledT::value)
- {
- uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
- if (mask)
- {
- DWORD comp;
- while (_BitScanForward(&comp, mask))
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_FETCH)
+#endif
{
- mask &= ~(1 << comp);
-
- float constantValue = 0.0f;
- switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_VS)
+#endif
{
- case SWR_CONSTANT_SOURCE_CONST_0000:
- case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
- case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
- constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
- break;
- case SWR_CONSTANT_SOURCE_PRIM_ID:
- constantValue = *(float*)&primId;
- break;
- }
+ if (assemble)
+ {
+ UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
- // apply constant value to all 3 vertices
- for (uint32_t v = 0; v < 3; ++v)
- {
- pAttribStart[comp + v * 4] = constantValue;
+ if (HasTessellationT::value)
+ {
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+ pDC,
+ workerId,
+ pa,
+ &gsBuffers,
+ pSoPrimData,
+ pa.GetPrimID(work.startPrimID));
+ }
+ else if (HasGeometryShaderT::value)
+ {
+ GeometryShaderStage<HasStreamOutT, HasRastT>(
+ pDC,
+ workerId,
+ pa,
+ &gsBuffers,
+ pSoPrimData,
+ pa.GetPrimID(work.startPrimID));
+ }
+ else
+ {
+ // If streamout is enabled then stream vertices out to memory.
+ if (HasStreamOutT::value)
+ {
+ StreamOut(pDC, pa, workerId, pSoPrimData, 0);
+ }
+
+ if (HasRastT::value)
+ {
+ SWR_ASSERT(pDC->pState->pfnProcessPrims);
+
+ // Gather data from the SVG if provided.
+ simdscalari vViewportIdx = SIMD::setzero_si();
+ simdscalari vRtIdx = SIMD::setzero_si();
+ SIMD::Vec4 svgAttrib[4];
+
+ if (state.backendState.readViewportArrayIndex ||
+ state.backendState.readRenderTargetArrayIndex)
+ {
+ pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
+ }
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ vViewportIdx =
+ SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+
+ // OOB VPAI indices => forced to zero.
+ vViewportIdx =
+ SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
+ simdscalari vNumViewports =
+ SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simdscalari vClearMask =
+ SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+ vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
+ pa.viewportArrayActive = true;
+ }
+ if (state.backendState.readRenderTargetArrayIndex)
+ {
+ vRtIdx =
+ SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+ pa.rtArrayActive = true;
+ }
+
+ pDC->pState->pfnProcessPrims(pDC,
+ pa,
+ workerId,
+ prim,
+ GenMask(pa.NumPrims()),
+ pa.GetPrimID(work.startPrimID),
+ vViewportIdx,
+ vRtIdx);
+ }
+ }
+ }
}
}
+ } while (pa.NextPrim());
+
+ if (IsIndexedT::value)
+ {
+ fetchInfo.pIndices =
+ (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
}
+ else
+ {
+ vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
+ }
+
+ i += KNOB_SIMD_WIDTH;
}
+ pa.Reset();
}
-}
+#endif
-typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
+ RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
+}
-struct ProcessAttributesChooser
+struct FEDrawChooser
{
- typedef PFN_PROCESS_ATTRIBUTES FuncType;
+ typedef PFN_FE_WORK_FUNC FuncType;
template <typename... ArgsB>
static FuncType GetFunc()
{
- return ProcessAttributes<ArgsB...>;
+ return ProcessDraw<ArgsB...>;
}
};
-PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
+// Selector for correct templated Draw front-end function
+PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
+ bool IsCutIndexEnabled,
+ bool HasTessellation,
+ bool HasGeometryShader,
+ bool HasStreamOut,
+ bool HasRasterization)
{
- return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Processes enabled user clip distances. Loads the active clip
-/// distances from the PA, sets up barycentric equations, and
-/// stores the results to the output buffer
-/// @param pa - Primitive Assembly state
-/// @param primIndex - primitive index to process
-/// @param clipDistMask - mask of enabled clip distances
-/// @param pUserClipBuffer - buffer to store results
-template<uint32_t NumVerts>
-void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
-{
- DWORD clipDist;
- while (_BitScanForward(&clipDist, clipDistMask))
- {
- clipDistMask &= ~(1 << clipDist);
- uint32_t clipSlot = clipDist >> 2;
- uint32_t clipComp = clipDist & 0x3;
- uint32_t clipAttribSlot = clipSlot == 0 ?
- VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
-
- __m128 primClipDist[3];
- pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
-
- float vertClipDist[NumVerts];
- for (uint32_t e = 0; e < NumVerts; ++e)
- {
- OSALIGNSIMD(float) aVertClipDist[4];
- _mm_store_ps(aVertClipDist, primClipDist[e]);
- vertClipDist[e] = aVertClipDist[clipComp];
- };
-
- // setup plane equations for barycentric interpolation in the backend
- float baryCoeff[NumVerts];
- for (uint32_t e = 0; e < NumVerts - 1; ++e)
- {
- baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
- }
- baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
-
- for (uint32_t e = 0; e < NumVerts; ++e)
- {
- *(pUserClipBuffer++) = baryCoeff[e];
- }
- }
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert the X,Y coords of a triangle to the requested Fixed
-/// Point precision from FP32.
-template <typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
-{
- simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
- return _simd_cvtps_epi32(vFixed);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Helper function to set the X,Y coords of a triangle to the
-/// requested Fixed Point precision from FP32.
-/// @param tri: simdvector[3] of FP triangle verts
-/// @param vXi: fixed point X coords of tri verts
-/// @param vYi: fixed point Y coords of tri verts
-INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
-{
- vXi[0] = fpToFixedPointVertical(tri[0].x);
- vYi[0] = fpToFixedPointVertical(tri[0].y);
- vXi[1] = fpToFixedPointVertical(tri[1].x);
- vYi[1] = fpToFixedPointVertical(tri[1].y);
- vXi[2] = fpToFixedPointVertical(tri[2].x);
- vYi[2] = fpToFixedPointVertical(tri[2].y);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Calculate bounding box for current triangle
-/// @tparam CT: ConservativeRastFETraits type
-/// @param vX: fixed point X position for triangle verts
-/// @param vY: fixed point Y position for triangle verts
-/// @param bbox: fixed point bbox
-/// *Note*: expects vX, vY to be in the correct precision for the type
-/// of rasterization. This avoids unnecessary FP->fixed conversions.
-template <typename CT>
-INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
-{
- simdscalari vMinX = vX[0];
- vMinX = _simd_min_epi32(vMinX, vX[1]);
- vMinX = _simd_min_epi32(vMinX, vX[2]);
-
- simdscalari vMaxX = vX[0];
- vMaxX = _simd_max_epi32(vMaxX, vX[1]);
- vMaxX = _simd_max_epi32(vMaxX, vX[2]);
-
- simdscalari vMinY = vY[0];
- vMinY = _simd_min_epi32(vMinY, vY[1]);
- vMinY = _simd_min_epi32(vMinY, vY[2]);
-
- simdscalari vMaxY = vY[0];
- vMaxY = _simd_max_epi32(vMaxY, vY[1]);
- vMaxY = _simd_max_epi32(vMaxY, vY[2]);
-
- bbox.left = vMinX;
- bbox.right = vMaxX;
- bbox.top = vMinY;
- bbox.bottom = vMaxY;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
-/// Offsets BBox for conservative rast
-template <>
-INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
-{
- // FE conservative rast traits
- typedef FEConservativeRastT CT;
-
- simdscalari vMinX = vX[0];
- vMinX = _simd_min_epi32(vMinX, vX[1]);
- vMinX = _simd_min_epi32(vMinX, vX[2]);
-
- simdscalari vMaxX = vX[0];
- vMaxX = _simd_max_epi32(vMaxX, vX[1]);
- vMaxX = _simd_max_epi32(vMaxX, vX[2]);
-
- simdscalari vMinY = vY[0];
- vMinY = _simd_min_epi32(vMinY, vY[1]);
- vMinY = _simd_min_epi32(vMinY, vY[2]);
-
- simdscalari vMaxY = vY[0];
- vMaxY = _simd_max_epi32(vMaxY, vY[1]);
- vMaxY = _simd_max_epi32(vMaxY, vY[2]);
-
- /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
- /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
- bbox.left = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.right = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.top = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
- bbox.bottom = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
-/// culling, viewport transform, etc.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains triangle position data for SIMDs worth of triangles.
-/// @param primID - Primitive ID for each triangle.
-/// @tparam CT - ConservativeRastFETraits
-template <typename CT>
-void BinTriangles(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector tri[3],
- uint32_t triMask,
- simdscalari primID)
-{
- RDTSC_START(FEBinTriangles);
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
- const SWR_FRONTEND_STATE& feState = state.frontendState;
- const SWR_GS_STATE& gsState = state.gsState;
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
-
- simdscalar vRecipW0 = _simd_set1_ps(1.0f);
- simdscalar vRecipW1 = _simd_set1_ps(1.0f);
- simdscalar vRecipW2 = _simd_set1_ps(1.0f);
-
- if (!feState.vpTransformDisable)
- {
- // perspective divide
- vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
- vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
- vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
-
- tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
- tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
- tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
-
- tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
- tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
- tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
-
- tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
- tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
- tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
-
- // viewport transform to screen coords
- viewportTransform<3>(tri, state.vpMatrices);
- }
-
- // adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- tri[0].x = _simd_add_ps(tri[0].x, offset);
- tri[0].y = _simd_add_ps(tri[0].y, offset);
-
- tri[1].x = _simd_add_ps(tri[1].x, offset);
- tri[1].y = _simd_add_ps(tri[1].y, offset);
-
- tri[2].x = _simd_add_ps(tri[2].x, offset);
- tri[2].y = _simd_add_ps(tri[2].y, offset);
-
- simdscalari vXi[3], vYi[3];
- // Set vXi, vYi to required fixed point precision
- FPToFixedPoint(tri, vXi, vYi);
-
- // triangle setup
- simdscalari vAi[3], vBi[3];
- triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
-
- // determinant
- simdscalari vDet[2];
- calcDeterminantIntVertical(vAi, vBi, vDet);
-
- // cull zero area
- int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
- int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
-
- int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
-
- uint32_t origTriMask = triMask;
- // don't cull degenerate triangles if we're conservatively rasterizing
- if(!CT::IsConservativeT::value)
- {
- triMask &= ~cullZeroAreaMask;
- }
-
- // determine front winding tris
- // CW +det
- // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
- maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
- maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
- int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
-
- uint32_t frontWindingTris;
- if (rastState.frontWinding == SWR_FRONTWINDING_CW)
- {
- frontWindingTris = cwTriMask;
- }
- else
- {
- frontWindingTris = ~cwTriMask;
- }
-
- // cull
- uint32_t cullTris;
- switch ((SWR_CULLMODE)rastState.cullMode)
- {
- case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
- case SWR_CULLMODE_NONE: cullTris = 0x0; break;
- case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
- // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
- case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
- default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
- }
-
- triMask &= ~cullTris;
-
- if (origTriMask ^ triMask)
- {
- RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
- }
-
- /// Note: these variable initializations must stay above any 'goto endBenTriangles'
- // compute per tri backface
- uint32_t frontFaceMask = frontWindingTris;
- uint32_t *pPrimID = (uint32_t *)&primID;
- DWORD triIndex = 0;
- // for center sample pattern, all samples are at pixel center; calculate coverage
- // once at center and broadcast the results in the backend
- const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
- uint32_t edgeEnable;
- PFN_WORK_FUNC pfnWork;
- if(CT::IsConservativeT::value)
- {
- // determine which edges of the degenerate tri, if any, are valid to rasterize.
- // used to call the appropriate templated rasterizer function
- if(cullZeroAreaMask > 0)
- {
- // e0 = v1-v0
- simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
- simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
- uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
-
- // e1 = v2-v1
- simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
- simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
- uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
-
- // e2 = v0-v2
- // if v0 == v1 & v1 == v2, v0 == v2
- uint32_t e2Mask = e0Mask & e1Mask;
- SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
-
- // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
- // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
- e0Mask = pdep_u32(e0Mask, 0x00249249);
- // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
- e1Mask = pdep_u32(e1Mask, 0x00492492);
- // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
- e2Mask = pdep_u32(e2Mask, 0x00924924);
-
- edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
- }
- else
- {
- edgeEnable = 0x00FFFFFF;
- }
- }
- else
- {
- // degenerate triangles won't be sent to rasterizer; just enable all edges
- pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
- (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
- (rastState.scissorEnable > 0));
- }
-
- if (!triMask)
- {
- goto endBinTriangles;
- }
-
- // Calc bounding box of triangles
- simdBBox bbox;
- calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
-
- // determine if triangle falls between pixel centers and discard
- // only discard for non-MSAA case and when conservative rast is disabled
- // (left + 127) & ~255
- // (right + 128) & ~255
- if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
- {
- origTriMask = triMask;
-
- int cullCenterMask;
- {
- simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
- left = _simd_and_si(left, _simd_set1_epi32(~255));
- simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
- right = _simd_and_si(right, _simd_set1_epi32(~255));
-
- simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
-
- simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
- top = _simd_and_si(top, _simd_set1_epi32(~255));
- simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
- bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
-
- simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
- vMaskV = _simd_or_si(vMaskH, vMaskV);
- cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
- }
-
- triMask &= ~cullCenterMask;
-
- if(origTriMask ^ triMask)
- {
- RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
- }
- }
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
- bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
- bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
- bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
- bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
-
- if(CT::IsConservativeT::value)
- {
- // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
- // some area. Bump the right/bottom edges out
- simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.top, bbox.bottom);
- bbox.bottom = _simd_blendv_epi32(bbox.bottom, _simd_add_epi32(bbox.bottom, _simd_set1_epi32(1)), topEqualsBottom);
- simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.left, bbox.right);
- bbox.right = _simd_blendv_epi32(bbox.right, _simd_add_epi32(bbox.right, _simd_set1_epi32(1)), leftEqualsRight);
- }
-
- // Cull tris completely outside scissor
- {
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
- triMask = triMask & ~maskOutsideScissor;
- }
-
- if (!triMask)
- {
- goto endBinTriangles;
- }
-
- // Convert triangle bbox to macrotile units.
- bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.left);
- _simd_store_si((simdscalari*)aMTRight, bbox.right);
- _simd_store_si((simdscalari*)aMTTop, bbox.top);
- _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
-
- // transpose verts needed for backend
- /// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
- vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
- vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
- vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
- vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
- {
- simdvector vRtai[3];
- pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
- simdscalari vRtaii;
- vRtaii = _simd_castps_si(vRtai[0].x);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- // scan remaining valid triangles and bin each separately
- while (_BitScanForward(&triIndex, triMask))
- {
- uint32_t linkageCount = state.backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- bool isDegenerate;
- if(CT::IsConservativeT::value)
- {
- // only rasterize valid edges if we have a degenerate primitive
- int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
- work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
- (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
- (rastState.scissorEnable > 0));
-
- // Degenerate triangles are required to be constant interpolated
- isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
- }
- else
- {
- isDegenerate = false;
- work.pfnWork = pfnWork;
- }
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
- desc.triFlags.primID = pPrimID[triIndex];
- desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.pAttribs = pAttribs;
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
-
- // store triangle vertex data
- desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
- ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
- }
-
- for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
- triMask &= ~(1 << triIndex);
- }
-
-endBinTriangles:
- RDTSC_STOP(FEBinTriangles, 1, 0);
-}
-
-struct FEBinTrianglesChooser
-{
- typedef PFN_PROCESS_PRIMS FuncType;
-
- template <typename... ArgsB>
- static FuncType GetFunc()
- {
- return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
- }
-};
-
-// Selector for correct templated BinTrinagles function
-PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
-{
- return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD points to the backend. Only supports point size of 1
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains point position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each point.
-void BinPoints(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector prim[3],
- uint32_t primMask,
- simdscalari primID)
-{
- RDTSC_START(FEBinPoints);
-
- simdvector& primVerts = prim[0];
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_FRONTEND_STATE& feState = state.frontendState;
- const SWR_GS_STATE& gsState = state.gsState;
- const SWR_RASTSTATE& rastState = state.rastState;
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
- if (!feState.vpTransformDisable)
- {
- // perspective divide
- simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
- primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
- primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
- primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
-
- // viewport transform to screen coords
- viewportTransform<1>(&primVerts, state.vpMatrices);
- }
-
- // adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- primVerts.x = _simd_add_ps(primVerts.x, offset);
- primVerts.y = _simd_add_ps(primVerts.y, offset);
-
- // convert to fixed point
- simdscalari vXi, vYi;
- vXi = fpToFixedPointVertical(primVerts.x);
- vYi = fpToFixedPointVertical(primVerts.y);
-
- if (CanUseSimplePoints(pDC))
- {
- // adjust for top-left rule
- vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
- vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
-
- // cull points off the top-left edge of the viewport
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
-
- // compute macro tile coordinates
- simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMacroX, macroX);
- _simd_store_si((simdscalari*)aMacroY, macroY);
-
- // compute raster tile coordinates
- simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
- simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-
- // compute raster tile relative x,y for coverage mask
- simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
- simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
-
- simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
- simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
-
- OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
- _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
-
- OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
- _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
-
- OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
- _simd_store_ps((float*)aZ, primVerts.z);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
- {
- simdvector vRtai;
- pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai.x);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- uint32_t *pPrimID = (uint32_t *)&primID;
- DWORD primIndex = 0;
-
- const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
-
- // scan remaining valid triangles and bin each separately
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- // points are always front facing
- desc.triFlags.frontFacing = 1;
- desc.triFlags.primID = pPrimID[primIndex];
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-
- work.pfnWork = RasterizeSimplePoint;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store attributes
- float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
- desc.pAttribs = pAttribs;
- desc.numAttribs = linkageCount;
-
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
-
- // store raster tile aligned x, y, perspective correct z
- float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
- desc.pTriBuffer = pTriBuffer;
- *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
- *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
- *pTriBuffer = aZ[primIndex];
-
- uint32_t tX = aTileRelativeX[primIndex];
- uint32_t tY = aTileRelativeY[primIndex];
-
- // pack the relative x,y into the coverageMask, the rasterizer will
- // generate the true coverage mask from it
- work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
-
- // bin it
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
- }
- primMask &= ~(1 << primIndex);
- }
- }
- else
- {
- // non simple points need to be potentially binned to multiple macro tiles
- simdscalar vPointSize;
- if (rastState.pointParam)
- {
- simdvector size[3];
- pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
- vPointSize = size[0].x;
- }
- else
- {
- vPointSize = _simd_set1_ps(rastState.pointSize);
- }
-
- // bloat point to bbox
- simdBBox bbox;
- bbox.left = bbox.right = vXi;
- bbox.top = bbox.bottom = vYi;
-
- simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
- simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
- bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
- bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
- bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
- bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
- bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
- bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
- bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
- bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
-
- // Cull bloated points completely outside scissor
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
- primMask = primMask & ~maskOutsideScissor;
-
- // Convert bbox to macrotile units.
- bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.left);
- _simd_store_si((simdscalari*)aMTRight, bbox.right);
- _simd_store_si((simdscalari*)aMTTop, bbox.top);
- _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
- {
- simdvector vRtai[2];
- pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
- _simd_store_ps((float*)aPointSize, vPointSize);
-
- uint32_t *pPrimID = (uint32_t *)&primID;
-
- OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
-
- _simd_store_ps((float*)aPrimVertsX, primVerts.x);
- _simd_store_ps((float*)aPrimVertsY, primVerts.y);
- _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
-
- // scan remaining valid prims and bin each separately
- const SWR_BACKEND_STATE& backendState = state.backendState;
- DWORD primIndex;
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = 1;
- desc.triFlags.primID = pPrimID[primIndex];
- desc.triFlags.pointSize = aPointSize[primIndex];
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-
- work.pfnWork = RasterizeTriPoint;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
-
- // store point vertex data
- float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
- desc.pTriBuffer = pTriBuffer;
- *pTriBuffer++ = aPrimVertsX[primIndex];
- *pTriBuffer++ = aPrimVertsY[primIndex];
- *pTriBuffer = aPrimVertsZ[primIndex];
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
- ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
- }
-
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
- for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
-
- primMask &= ~(1 << primIndex);
- }
- }
-
-
-
-
- RDTSC_STOP(FEBinPoints, 1, 0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD lines to the backend.
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains line position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each line.
-void BinLines(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector prim[],
- uint32_t primMask,
- simdscalari primID)
-{
- RDTSC_START(FEBinLines);
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
- const SWR_FRONTEND_STATE& feState = state.frontendState;
- const SWR_GS_STATE& gsState = state.gsState;
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
- simdscalar vRecipW0 = _simd_set1_ps(1.0f);
- simdscalar vRecipW1 = _simd_set1_ps(1.0f);
-
- if (!feState.vpTransformDisable)
- {
- // perspective divide
- vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
- vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
-
- prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
- prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
-
- prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
- prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
-
- prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
- prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
-
- // viewport transform to screen coords
- viewportTransform<2>(prim, state.vpMatrices);
- }
-
- // adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- prim[0].x = _simd_add_ps(prim[0].x, offset);
- prim[0].y = _simd_add_ps(prim[0].y, offset);
-
- prim[1].x = _simd_add_ps(prim[1].x, offset);
- prim[1].y = _simd_add_ps(prim[1].y, offset);
-
- // convert to fixed point
- simdscalari vXi[2], vYi[2];
- vXi[0] = fpToFixedPointVertical(prim[0].x);
- vYi[0] = fpToFixedPointVertical(prim[0].y);
- vXi[1] = fpToFixedPointVertical(prim[1].x);
- vYi[1] = fpToFixedPointVertical(prim[1].y);
-
- // compute x-major vs y-major mask
- simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
- simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
- simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
- uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
-
- // cull zero-length lines
- simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
- vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
-
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
-
- uint32_t *pPrimID = (uint32_t *)&primID;
-
- simdscalar vUnused = _simd_setzero_ps();
-
- // Calc bounding box of lines
- simdBBox bbox;
- bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
- bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
- bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
- bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);
-
- // bloat bbox by line width along minor axis
- simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
- simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
- simdBBox bloatBox;
- bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
- bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
- bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
- bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
-
- bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
- bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
- bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
- bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
- bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
- bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
- bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
- bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
-
- // Cull prims completely outside scissor
- {
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
- primMask = primMask & ~maskOutsideScissor;
- }
-
- if (!primMask)
- {
- goto endBinLines;
- }
-
- // Convert triangle bbox to macrotile units.
- bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.left);
- _simd_store_si((simdscalari*)aMTRight, bbox.right);
- _simd_store_si((simdscalari*)aMTTop, bbox.top);
- _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
-
- // transpose verts needed for backend
- /// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
- vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
- vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
- vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
- vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
- {
- simdvector vRtai[2];
- pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- // scan remaining valid prims and bin each separately
- DWORD primIndex;
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = state.backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = 1;
- desc.triFlags.primID = pPrimID[primIndex];
- desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-
- work.pfnWork = RasterizeLine;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
-
- // store line vertex data
- desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
- ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
- }
-
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
- for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
-
- primMask &= ~(1 << primIndex);
- }
-
-endBinLines:
-
- RDTSC_STOP(FEBinLines, 1, 0);
+ return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
+ IsCutIndexEnabled,
+ HasTessellation,
+ HasGeometryShader,
+ HasStreamOut,
+ HasRasterization);
}