#define _simd_add_epi8 _simdemu_add_epi8
#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
+#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
+#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
#define _simd_movemask_epi8 _simdemu_movemask_epi8
SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
+#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
+#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
#define _simd_movemask_epi8 _mm256_movemask_epi8
#endif
return vplaneps(vA, vB, vC, vI, vJ);
}
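+//////////////////////////////////////////////////////////////////////////
+/// @brief Software emulation of the BMI2 PDEP instruction: deposits the
+///        low-order bits of a into the set-bit positions of mask.
+///        Illustrative example: pdep_u32(0b101, 0b11010) == 0b10010.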
+INLINE
+UINT pdep_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+ return _pdep_u32(a, mask);
+#else
+ UINT result = 0;
+
+ // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
+ // using bsf instead of funky loop
+ DWORD maskIndex;
+ while (_BitScanForward(&maskIndex, mask))
+ {
+ // 1. isolate lowest set bit of mask
+ const UINT lowest = 1u << maskIndex;
+
+ // 2. populate LSB from src
+ const UINT LSB = (UINT)((int)(a << 31) >> 31);
+
+ // 3. copy bit from mask
+ result |= LSB & lowest;
+
+ // 4. clear lowest bit
+ mask &= ~lowest;
+
+ // 5. prepare for next iteration
+ a >>= 1;
+ }
+
+ return result;
+#endif
+}
+
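+//////////////////////////////////////////////////////////////////////////
+/// @brief Software emulation of the BMI2 PEXT instruction: extracts the
+///        bits of a at the set-bit positions of mask into the low-order
+///        bits of the result (the inverse of pdep_u32).
+///        Illustrative example: pext_u32(0b10010, 0b11010) == 0b101.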
+INLINE
+UINT pext_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+ return _pext_u32(a, mask);
+#else
+ UINT result = 0;
+ DWORD maskIndex;
+ uint32_t currentBit = 0;
+ while (_BitScanForward(&maskIndex, mask))
+ {
+ // 1. isolate lowest set bit of mask
+ const UINT lowest = 1u << maskIndex;
+
+ // 2. copy bit from mask
+ result |= (UINT)((a & lowest) > 0) << currentBit++;
+
+ // 3. clear lowest bit
+ mask &= ~lowest;
+ }
+ return result;
+#endif
+}
#endif//__SWR_SIMDINTRIN_H__
pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
}
}
-
+// templated backend function tables
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
+extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
+extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
+extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
void SetupPipeline(DRAW_CONTEXT *pDC)
{
DRAW_STATE* pState = pDC->pState;
const SWR_RASTSTATE &rastState = pState->state.rastState;
+ const SWR_PS_STATE &psState = pState->state.psState;
BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
// setup backend
- if (pState->state.psState.pfnPixelShader == nullptr)
+ if (psState.pfnPixelShader == nullptr)
{
backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
// always need to generate I & J per sample for Z interpolation
else
{
const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
- const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+ const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
// currently only support 'normal' input coverage
- SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
- pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
+ SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
+ psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
- SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask;
+ SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
// select backend function
- switch(pState->state.psState.shadingRate)
+ switch(psState.shadingRate)
{
case SWR_SHADING_RATE_PIXEL:
if(bMultisampleEnable)
{
// always need to generate I & J per sample for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
- backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+ backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
}
else
{
// always need to generate I & J per pixel for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
- backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X];
+ backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
}
break;
case SWR_SHADING_RATE_SAMPLE:
SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
// always need to generate I & J per sample for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
- backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+ backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
break;
- case SWR_SHADING_RATE_COARSE:
default:
SWR_ASSERT(0 && "Invalid shading rate");
break;
uint32_t numRTs = pState->state.psState.numRenderTargets;
pState->state.colorHottileEnable = 0;
- if(pState->state.psState.pfnPixelShader != nullptr)
+ if (psState.pfnPixelShader != nullptr)
{
for (uint32_t rt = 0; rt < numRTs; ++rt)
{
}
#if KNOB_SIMD_WIDTH == 8
-const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
-const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
-const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#define MASK 0xff
+const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#else
#error Unsupported vector width
#endif
return _simd_movemask_ps(vClipMask);
}
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-{
-
- // will need to update for avx512
- assert(KNOB_SIMD_WIDTH == 8);
-
- __m256i mask[2];
- __m256i sampleCoverage[2];
- if(bIsStandardPattern)
- {
- __m256i src = _mm256_set1_epi32(0);
- __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
- if(MultisampleTraits<sampleCountT>::numSamples == 1)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 2)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 4)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 8)
- {
- mask[0] = _mm256_set1_epi32(-1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 16)
- {
- mask[0] = _mm256_set1_epi32(-1);
- mask[1] = _mm256_set1_epi32(-1);
- index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
- }
-
- // gather coverage for samples 0-7
- sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // gather coverage for samples 8-15
- sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
- }
- }
- else
- {
- // center coverage is the same for all samples; just broadcast to the sample slots
- uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
- if(MultisampleTraits<sampleCountT>::numSamples == 1)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 2)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 4)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 8)
- {
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 16)
- {
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
- }
- }
-
- mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
- // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
- __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
- __m256i packedCoverage1;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
- packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
- }
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
- // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
- __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
- packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
- __m256i packedSampleCoverage;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
- hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
- shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
- shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
- packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
- packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
- }
- else
- {
- packedSampleCoverage = packedCoverage0;
- }
-#else
- __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
- // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
- __m256i packedSampleCoverage;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
- // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
- packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
- // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
- packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
- }
- else
- {
- packedSampleCoverage = packedCoverage0;
- }
-#endif
-
- for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
- {
- // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
- inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
- if(!bForcedSampleCount)
- {
- // input coverage has to be anded with sample mask if MSAA isn't forced on
- inputMask[i] &= sampleMask;
- }
-
- // shift to the next pixel in the 4x2
- packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
- }
-}
-
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
-{
- uint32_t inputMask[KNOB_SIMD_WIDTH];
- generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
- inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-}
-
template<bool perspMask>
INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
{
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
// pixel center
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
if(coverageMask & MASK)
{
RDTSC_START(BEBarycentric);
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// pixel center
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
// pixel center
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// pixel center
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
RDTSC_START(BEBarycentric);
backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// set pixel center positions
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
if (bInputCoverage)
{
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
// UL pixel corners
- simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// iterate over active samples
unsigned long sample = 0;
#pragma once
#include "common/os.h"
-#include "core/context.h"
+#include "core/context.h"
+#include "core/multisample.h"
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
void InitClearTilesTable();
+simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
+void InitBackendFuncTables();
+void InitCPSFuncTables();
enum SWR_BACKEND_FUNCS
{
SWR_BACKEND_MSAA_SAMPLE_RATE,
SWR_BACKEND_FUNCS_MAX,
};
-void InitBackendFuncTables();
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
-extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
-extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
-extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
+#if KNOB_SIMD_WIDTH == 8
+extern const __m256 vCenterOffsetsX;
+extern const __m256 vCenterOffsetsY;
+extern const __m256 vULOffsetsX;
+extern const __m256 vULOffsetsY;
+#define MASK 0xff // full coverage for the 8 pixels of a SIMD 4x2 tile
+#endif
+
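+//////////////////////////////////////////////////////////////////////////
+/// @brief Generates per-pixel input coverage for a SIMD 4x2 pixel region:
+///        gathers (standard pattern) or broadcasts (center pattern) the
+///        8-pixel coverage byte of each sample, packs one byte per sample
+///        into a single register, then transposes it into one mask of
+///        covered samples per pixel.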
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+{
+
+ // will need to update for avx512
+ assert(KNOB_SIMD_WIDTH == 8);
+
+ __m256i mask[2];
+ __m256i sampleCoverage[2];
+ if(bIsStandardPattern)
+ {
+ __m256i src = _mm256_set1_epi32(0);
+ __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+ if(MultisampleTraits<sampleCountT>::numSamples == 1)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ mask[1] = _mm256_set1_epi32(-1);
+ index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+ }
+
+ // gather coverage for samples 0-7
+ sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // gather coverage for samples 8-15
+ sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+ }
+ }
+ else
+ {
+ // center coverage is the same for all samples; just broadcast to the sample slots
+ uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+ if(MultisampleTraits<sampleCountT>::numSamples == 1)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+ }
+ }
+
+ mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+ // pull out the 8-bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128-bit lane
+ __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+ __m256i packedCoverage1;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // pull out the 8-bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128-bit lane
+ packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+ }
+
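+ // AVX lacks cross-lane integer permutes, so the pack below stays in the
+ // float domain (permute2f128/shuffle_ps/blend_ps); the AVX2 path can use
+ // _mm256_permutevar8x32_epi32 and _mm256_blend_epi32 directly.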
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+ __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+ __m256i packedSampleCoverage;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+ shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+ packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+ packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+#else
+ __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+ __m256i packedSampleCoverage;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+ // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+ packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+#endif
+
+ for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+ {
+ // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+ inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+ if(!bForcedSampleCount)
+ {
+ // input coverage has to be anded with sample mask if MSAA isn't forced on
+ inputMask[i] &= sampleMask;
+ }
+
+ // shift to the next pixel in the 4x2
+ packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+ }
+}
+
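+//////////////////////////////////////////////////////////////////////////
+/// @brief Overload returning the per-pixel coverage masks packed into a
+///        single SIMD register, one 32-bit mask per lane.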
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+{
+ uint32_t inputMask[KNOB_SIMD_WIDTH];
+ generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+ inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+}
float *pUserClipBuffer;
uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
+ uint64_t anyCoveredSamples; // union of coverageMask across all samples for the current raster tile
TRI_FLAGS triFlags;
};
for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
{
- uint64_t anyCoveredSamples = 0;
+ triDesc.anyCoveredSamples = 0;
// is the corner of the edge outside of the raster tile? (vEdge < 0)
int mask0, mask1, mask2;
triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
if ((mask0 & mask1 & mask2) == 0xf)
{
- anyCoveredSamples = triDesc.coverageMask[sampleNum];
+ triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
// trivial accept, all 4 corners of all 3 edges are negative
// i.e. raster tile completely inside triangle
RDTSC_EVENT(BETrivialAccept, 1, 0);
}
RDTSC_STOP(BERasterizePartial, 0, 0);
- anyCoveredSamples |= triDesc.coverageMask[sampleNum];
+ triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
}
}
else
}
else
#endif
- if(anyCoveredSamples)
+ if(triDesc.anyCoveredSamples)
{
RDTSC_START(BEPixelBackend);
backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
simdscalar centroid;
};
+#define SWR_MAX_NUM_MULTISAMPLES 16
+
//////////////////////////////////////////////////////////////////////////
/// SWR_PS_CONTEXT
/// @brief Input to pixel shader.
uint32_t frontFace; // IN: front- 1, back- 0
uint32_t primID; // IN: primitive ID
uint32_t sampleIndex; // IN: sampleIndex
};
//////////////////////////////////////////////////////////////////////////
};
static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
-#define SWR_MAX_NUM_MULTISAMPLES 16
enum SWR_MULTISAMPLE_COUNT
{
SWR_MULTISAMPLE_1X = 0,
typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext);
typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
+typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
//////////////////////////////////////////////////////////////////////////
uint8_t numComponents[KNOB_NUM_ATTRIBUTES];
};
+
union SWR_DEPTH_STENCIL_STATE
{
struct
{
SWR_SHADING_RATE_PIXEL,
SWR_SHADING_RATE_SAMPLE,
- SWR_SHADING_RATE_COARSE,
SWR_SHADING_RATE_MAX,
};
uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with
uint32_t usesUAV : 1; // pixel shader accesses UAV
uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test
};
#pragma once
#include "core/state.h"
+#include "common/simdintrin.h"
template<SWR_TILE_MODE mode, int>
struct TilingTraits
static UINT GetPdepY() { return 0x1ea; }
};
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
- return _pdep_u32(a, mask);
-#else
- UINT result = 0;
-
- // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
- // using bsf instead of funky loop
- DWORD maskIndex;
- while (_BitScanForward(&maskIndex, mask))
- {
- // 1. isolate lowest set bit of mask
- const UINT lowest = 1 << maskIndex;
-
- // 2. populate LSB from src
- const UINT LSB = (UINT)((int)(a << 31) >> 31);
-
- // 3. copy bit from mask
- result |= LSB & lowest;
-
- // 4. clear lowest bit
- mask &= ~lowest;
-
- // 5. prepare for next iteration
- a >>= 1;
- }
-
- return result;
-#endif
-}
-
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
- return _pext_u32(a, mask);
-#else
- UINT result = 0;
- DWORD maskIndex;
- uint32_t currentBit = 0;
- while (_BitScanForward(&maskIndex, mask))
- {
- // 1. isolate lowest set bit of mask
- const UINT lowest = 1 << maskIndex;
-
- // 2. copy bit from mask
- result |= ((a & lowest) > 0) << currentBit++;
-
- // 3. clear lowest bit
- mask &= ~lowest;
- }
- return result;
-#endif
-}
-
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the tileID for 2D tiled surfaces
/// @param pitch - surface pitch in bytes