From 117fc582f86564b4c37db248b3231b8d86da0039 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Tue, 28 Mar 2017 15:32:04 -0500 Subject: [PATCH] swr: [rasterizer core] Programmable sample position support Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/codegen/gen_llvm_types.py | 22 + .../swr/rasterizer/common/simdintrin.h | 7 + .../drivers/swr/rasterizer/core/api.cpp | 8 +- .../drivers/swr/rasterizer/core/backend.cpp | 43 +- .../drivers/swr/rasterizer/core/backend.h | 141 ++--- .../drivers/swr/rasterizer/core/binner.cpp | 12 +- .../swr/rasterizer/core/multisample.cpp | 13 - .../drivers/swr/rasterizer/core/multisample.h | 500 +++--------------- .../swr/rasterizer/core/rasterizer.cpp | 14 +- .../drivers/swr/rasterizer/core/rasterizer.h | 3 +- .../drivers/swr/rasterizer/core/state.h | 98 +++- src/gallium/drivers/swr/swr_state.cpp | 2 - 12 files changed, 267 insertions(+), 596 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py index 1e9593a1af1..4cabde3394f 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_types.py @@ -60,6 +60,8 @@ def gen_llvm_type(type, name, is_pointer, is_pointer_pointer, is_array, is_array llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)' elif type == 'simdscalari': llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)' + elif type == '__m128i': + llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), 4)' elif type == 'SIMD8::vector_t': llvm_type = 'VectorType::get(Type::getFloatTy(ctx), 8)' elif type == 'SIMD8::vectori_t': @@ -145,6 +147,26 @@ def gen_llvm_types(input_file, output_file): else: is_llvm_struct = False + ########################################### + # Is field the start of a function? Tells script to ignore it + is_llvm_func_start = re.search(r'@llvm_func_start', line) + + if is_llvm_func_start is not None: + while not end_of_struct and idx < len(lines)-1: + idx += 1 + line = lines[idx].rstrip() + is_llvm_func_end = re.search(r'@llvm_func_end', line) + if is_llvm_func_end is not None: + break; + continue + + ########################################### + # Is field a function? Tells script to ignore it + is_llvm_func = re.search(r'@llvm_func', line) + + if is_llvm_func is not None: + continue + ########################################### # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type. is_llvm_enum = re.search(r'@llvm_enum', line) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 3cf3b180200..1e3f14ce59a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -648,6 +648,13 @@ simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask) return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask))); } +template +INLINE +__m128i _simd_blend4_epi32(__m128i a, __m128i b) +{ + return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), mask)); +} + // convert bitmask to vector mask INLINE simdscalar vMask(int32_t mask) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index dabd0616d3b..1710cc66793 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -793,7 +793,6 @@ void SetupPipeline(DRAW_CONTEXT *pDC) const SWR_RASTSTATE &rastState = pState->state.rastState; const SWR_PS_STATE &psState = pState->state.psState; BACKEND_FUNCS& backendFuncs = pState->backendFuncs; - const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0; // setup backend if (psState.pfnPixelShader == nullptr) @@ -802,7 +801,8 @@ void SetupPipeline(DRAW_CONTEXT *pDC) } else { - const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0; + const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0; + const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0; const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0; SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; @@ -815,7 +815,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage] + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern][psState.inputCoverage] [centroid][forcedSampleCount][canEarlyZ] ; } @@ -827,7 +827,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC) } break; case SWR_SHADING_RATE_SAMPLE: - SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); + SWR_ASSERT(rastState.bIsCenterPattern != true); // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ]; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 84414d8e721..b76b36fcbcb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -468,7 +468,8 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); SWR_PS_CONTEXT psContext; - SetupPixelShaderContext(&psContext, work); + const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; + SetupPixelShaderContext(&psContext, samplePos, work); AR_END(BESetup, 1); @@ -517,7 +518,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 CalcPixelBarycentrics(coeffs, psContext); - CalcCentroid(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask); + CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); @@ -663,7 +664,8 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); SWR_PS_CONTEXT psContext; - SetupPixelShaderContext(&psContext, work); + const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; + SetupPixelShaderContext(&psContext, samplePos, work); AR_END(BESetup, 0); @@ -696,7 +698,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ CalcPixelBarycentrics(coeffs, psContext); - CalcCentroid(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask); + CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); AR_END(BEBarycentric, 0); @@ -725,8 +727,8 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ AR_BEGIN(BEBarycentric, pDC->drawId); // calculate per sample positions - psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample)); - psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample)); + psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); + psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); CalcSampleBarycentrics(coeffs, psContext); @@ -870,7 +872,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, AR_BEGIN(BENullBackend, pDC->drawId); ///@todo: handle center multisample pattern - typedef SwrBackendTraits T; + typedef SwrBackendTraits T; AR_BEGIN(BESetup, pDC->drawId); const API_STATE &state = GetApiState(pDC); @@ -889,7 +891,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast(y))); const simdscalar dy = _simd_set1_ps(static_cast(SIMD_TILE_Y_DIM)); - + const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast(x))); @@ -928,8 +930,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, AR_BEGIN(BEBarycentric, pDC->drawId); // calculate per sample positions - psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample)); - psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample)); + psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample)); + psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample)); CalcSampleBarycentrics(coeffs, psContext); @@ -995,7 +997,7 @@ PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT] [2] // canEarlyZ = {}; PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] - [SWR_MSAA_SAMPLE_PATTERN_COUNT] + [2] // isCenterPattern [SWR_INPUT_COVERAGE_COUNT] [2] // centroid [2] // forcedSampleCount @@ -1027,21 +1029,6 @@ struct BEChooser } } - // Recursively parse args - template - static PFN_BACKEND_FUNC GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg, TArgsT... remainingArgs) - { - switch(tArg) - { - case SWR_MSAA_CENTER_PATTERN: return BEChooser::GetFunc(remainingArgs...); break; - case SWR_MSAA_STANDARD_PATTERN: return BEChooser::GetFunc(remainingArgs...); break; - default: - SWR_ASSERT(0 && "Invalid sample pattern\n"); - return BEChooser::GetFunc(remainingArgs...); - break; - } - } - // Recursively parse args template static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) @@ -1098,7 +1085,7 @@ void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COU for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[inputCoverage][isCentroid][canEarlyZ] = - BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage, + BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage, (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE); } } @@ -1116,7 +1103,7 @@ void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_C for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[sampleCount][inputCoverage][centroid][canEarlyZ] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage, + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage, (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); } } diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index f022990bf26..82765c2e877 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -48,7 +48,7 @@ void InitCPSFuncTables(); void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext); extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] - [SWR_MSAA_SAMPLE_PATTERN_COUNT] + [2] // isCenterPattern [SWR_INPUT_COVERAGE_COUNT] [2] // centroid [2] // forcedSampleCount @@ -153,66 +153,67 @@ struct generateInputCoverage __m256i mask[2]; __m256i sampleCoverage[2]; - if(T::bIsStandardPattern) + + if(T::bIsCenterPattern) { - __m256i src = _mm256_set1_epi32(0); - __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; - + // center coverage is the same for all samples; just broadcast to the sample slots + uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); if(T::MultisampleT::numSamples == 1) { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); } else if(T::MultisampleT::numSamples == 2) { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); } else if(T::MultisampleT::numSamples == 4) { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); } else if(T::MultisampleT::numSamples == 8) { - mask[0] = _mm256_set1_epi32(-1); + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); } else if(T::MultisampleT::numSamples == 16) { - mask[0] = _mm256_set1_epi32(-1); - mask[1] = _mm256_set1_epi32(-1); - index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); - } - - // gather coverage for samples 0-7 - sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); - if(T::MultisampleT::numSamples > 8) - { - // gather coverage for samples 8-15 - sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); } } else { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + if(T::MultisampleT::numSamples == 1) { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); } else if(T::MultisampleT::numSamples == 2) { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); } else if(T::MultisampleT::numSamples == 4) { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); } else if(T::MultisampleT::numSamples == 8) { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + mask[0] = _mm256_set1_epi32(-1); } else if(T::MultisampleT::numSamples == 16) { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); + mask[0] = _mm256_set1_epi32(-1); + mask[1] = _mm256_set1_epi32(-1); + index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + } + + // gather coverage for samples 0-7 + sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); + if(T::MultisampleT::numSamples > 8) + { + // gather coverage for samples 8-15 + sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); } } @@ -332,7 +333,8 @@ struct generateInputCoverage // SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point. //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template -INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask, +INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos, + const uint64_t *const coverageMask, const uint32_t sampleMask, const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) { uint32_t inputMask[KNOB_SIMD_WIDTH]; @@ -352,23 +354,23 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const cov (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0); // look up and set the sample offsets from UL pixel corner for first covered sample - __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]), - T::MultisampleT::X(sampleNum[6]), - T::MultisampleT::X(sampleNum[5]), - T::MultisampleT::X(sampleNum[4]), - T::MultisampleT::X(sampleNum[3]), - T::MultisampleT::X(sampleNum[2]), - T::MultisampleT::X(sampleNum[1]), - T::MultisampleT::X(sampleNum[0])); - - __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]), - T::MultisampleT::Y(sampleNum[6]), - T::MultisampleT::Y(sampleNum[5]), - T::MultisampleT::Y(sampleNum[4]), - T::MultisampleT::Y(sampleNum[3]), - T::MultisampleT::Y(sampleNum[2]), - T::MultisampleT::Y(sampleNum[1]), - T::MultisampleT::Y(sampleNum[0])); + __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]), + samplePos.X(sampleNum[6]), + samplePos.X(sampleNum[5]), + samplePos.X(sampleNum[4]), + samplePos.X(sampleNum[3]), + samplePos.X(sampleNum[2]), + samplePos.X(sampleNum[1]), + samplePos.X(sampleNum[0])); + + __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]), + samplePos.Y(sampleNum[6]), + samplePos.Y(sampleNum[5]), + samplePos.Y(sampleNum[4]), + samplePos.Y(sampleNum[3]), + samplePos.Y(sampleNum[2]), + samplePos.Y(sampleNum[1]), + samplePos.Y(sampleNum[0])); // add sample offset to UL pixel corner vXSample = _simd_add_ps(vXSamplePosUL, vXSample); vYSample = _simd_add_ps(vYSamplePosUL, vYSample); @@ -398,8 +400,8 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const cov __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); - vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample)); - vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample)); + vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample)); + vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample)); // blend in case 3a pixel locations psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); @@ -494,7 +496,7 @@ inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uin } template -void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_TRIANGLE_DESC &work) +void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work) { psContext->pAttribs = work.pAttribs; psContext->pPerspAttribs = work.pPerspAttribs; @@ -507,14 +509,15 @@ void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_TRIANGLE_DESC psContext->recipDet = work.recipDet; psContext->pRecipW = work.pRecipW; - psContext->pSamplePosX = reinterpret_cast(&T::MultisampleT::samplePosX); - psContext->pSamplePosY = reinterpret_cast(&T::MultisampleT::samplePosY); + psContext->pSamplePosX = samplePos.X();//reinterpret_cast(&T::MultisampleT::samplePosX); + psContext->pSamplePosY = samplePos.Y();//reinterpret_cast(&T::MultisampleT::samplePosY); psContext->rasterizerSampleCount = T::MultisampleT::numSamples; psContext->sampleIndex = 0; } template -void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask) +void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, + const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask) { if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different { @@ -530,15 +533,15 @@ void CalcCentroid(SWR_PS_CONTEXT *psContext, const BarycentricCoeffs &coeffs, co if (T::bCentroidPos) { ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid - if (T::bIsStandardPattern) + if (T::bIsCenterPattern) { - // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'.. - CalcCentroidPos(*psContext, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL); + psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f)); + psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f)); } else { - psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f)); - psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f)); + // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'.. + CalcCentroidPos(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL); } CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL); @@ -557,8 +560,9 @@ struct PixelRateZTestLoop PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState, uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) : pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState), - clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer) {}; - + samplePos(state.rastState.samplePositions), + clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){}; + INLINE uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0) @@ -597,8 +601,8 @@ struct PixelRateZTestLoop AR_BEGIN(BEBarycentric, pDC->drawId); // calculate per sample positions - psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample)); - psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample)); + psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); + psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); // calc I & J per sample CalcSampleBarycentrics(coeffs, psContext); @@ -673,6 +677,7 @@ private: const BarycentricCoeffs& coeffs; const API_STATE& state; const SWR_PS_STATE& psState; + const SWR_MULTISAMPLE_POS& samplePos; const uint8_t clipDistanceMask; uint8_t*& pDepthBuffer; uint8_t*& pStencilBuffer; @@ -862,7 +867,8 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t SetupBarycentricCoeffs(&coeffs, work); SWR_PS_CONTEXT psContext; - SetupPixelShaderContext(&psContext, work); + const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; + SetupPixelShaderContext(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); @@ -887,7 +893,6 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t { #if USE_8x2_TILE_BACKEND const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); - #endif simdscalar activeLanes; if(!(work.anyCoveredSamples & MASK)) {goto Endtile;}; @@ -904,7 +909,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t CalcPixelBarycentrics(coeffs, psContext); - CalcCentroid(&psContext, coeffs, work.coverageMask, state.blendState.sampleMask); + CalcCentroid(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); AR_END(BEBarycentric, 0); @@ -966,7 +971,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t { AR_BEGIN(BEOutputMerger, pDC->drawId); // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples - uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0; + uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample; simdscalar coverageMask, depthMask; if(T::bForcedSampleCount) { @@ -1045,15 +1050,15 @@ Endtile: AR_END(BEPixelRateBackend, 0); } -template struct SwrBackendTraits { - static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN); + static const bool bIsCenterPattern = (isCenter == 1); static const uint32_t InputCoverage = coverage; static const bool bCentroidPos = (centroid == 1); static const bool bForcedSampleCount = (forced == 1); static const bool bCanEarlyZ = (canEarlyZ == 1); - typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT; + typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 3d42718a374..f00701f8192 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -640,9 +640,8 @@ void BinTriangles( else { // degenerate triangles won't be sent to rasterizer; just enable all edges - pfnWork = GetRasterizerFunc(rastState.sampleCount, (rastState.samplePattern == SWR_MSAA_CENTER_PATTERN), - (rastState.conservativeRast > 0), (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID, - (state.scissorsTileAligned == false)); + pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), + (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID, (state.scissorsTileAligned == false)); } if (!triMask) @@ -658,7 +657,7 @@ void BinTriangles( // only discard for non-MSAA case and when conservative rast is disabled // (xmin + 127) & ~255 // (xmax + 128) & ~255 - if((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.samplePattern == SWR_MSAA_CENTER_PATTERN) && + if((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) && (!CT::IsConservativeT::value)) { origTriMask = triMask; @@ -787,9 +786,8 @@ endBinTriangles: { // only rasterize valid edges if we have a degenerate primitive int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; - work.pfnWork = GetRasterizerFunc(rastState.sampleCount, (rastState.samplePattern == SWR_MSAA_CENTER_PATTERN), - (rastState.conservativeRast > 0), (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable, - (state.scissorsTileAligned == false)); + work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), + (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable, (state.scissorsTileAligned == false)); // Degenerate triangles are required to be constant interpolated isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false; diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp index 94992e30765..88a0ef76144 100644 --- a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp @@ -50,16 +50,3 @@ const float MultisampleTraits::samplePosX[16] {0.5625, 0.4375, 0.3125, 0.7500, 0.1875, 0.6250, 0.8125, 0.6875, 0.3750, 0.5000, 0.2500, 0.1250, 0.0000, 0.9375, 0.8750, 0.0625}; const float MultisampleTraits::samplePosY[16] {0.5625, 0.3125, 0.6250, 0.4375, 0.3750, 0.8125, 0.6875, 0.1875, 0.8750, 0.0625, 0.1250, 0.7500, 0.5000, 0.2500, 0.9375, 0.0000}; - -const float MultisampleTraits::samplePosX{ 0.5f }; -const float MultisampleTraits::samplePosY{ 0.5f }; -const float MultisampleTraits::samplePosX[2]{ 0.5f, 0.5f}; -const float MultisampleTraits::samplePosY[2]{ 0.5f, 0.5f}; -const float MultisampleTraits::samplePosX[4]{ 0.5f, 0.5f, 0.5f, 0.5f}; -const float MultisampleTraits::samplePosY[4]{ 0.5f, 0.5f, 0.5f, 0.5f }; -const float MultisampleTraits::samplePosX[8]{ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; -const float MultisampleTraits::samplePosY[8]{ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; -const float MultisampleTraits::samplePosX[16] -{ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; -const float MultisampleTraits::samplePosY[16] -{ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h index 55387a2ec6e..dc2dde9e2b0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/multisample.h +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h @@ -58,70 +58,21 @@ SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) // hardcoded offsets based on Direct3d standard multisample positions // 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner // coords are 0.8 fixed point offsets from (0, 0) -template +template struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) = delete; - INLINE static __m128i vYi(uint32_t sampleNum) = delete; - INLINE static simdscalar vX(uint32_t sampleNum) = delete; - INLINE static simdscalar vY(uint32_t sampleNum) = delete; INLINE static float X(uint32_t sampleNum) = delete; INLINE static float Y(uint32_t sampleNum) = delete; - INLINE static __m128i TileSampleOffsetsX() = delete; - INLINE static __m128i TileSampleOffsetsY() = delete; INLINE static simdscalari FullSampleMask() = delete; static const uint32_t numSamples = 0; }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - static const __m128i X = _mm_set1_epi32(samplePosXi); - return X; - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - static const __m128i Y = _mm_set1_epi32(samplePosYi); - return Y; - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - static const simdscalar X = _simd_set1_ps(0.5f); - return X; - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - static const simdscalar Y = _simd_set1_ps(0.5f); - return Y; - } - INLINE static float X(uint32_t sampleNum) {return samplePosX;}; INLINE static float Y(uint32_t sampleNum) {return samplePosY;}; - - INLINE static __m128i TileSampleOffsetsX() - { - static const uint32_t bboxLeftEdge = 0x80; - static const uint32_t bboxRightEdge = 0x80; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); - return tileSampleOffsetX; - } - - INLINE static __m128i TileSampleOffsetsY() - { - static const uint32_t bboxTopEdge = 0x80; - static const uint32_t bboxBottomEdge = 0x80; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); - return tileSampleOffsetY; - } - INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);}; static const uint32_t samplePosXi; @@ -134,43 +85,10 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; - - INLINE static __m128i TileSampleOffsetsX() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i TileSampleOffsetsY() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);}; static const uint32_t numSamples = 1; @@ -181,57 +99,10 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - static const __m128i X[numSamples] {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1])}; - return X[sampleNum]; - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - SWR_ASSERT(sampleNum < numSamples); - static const __m128i Y[numSamples] {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1])}; - return Y[sampleNum]; - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - static const simdscalar X[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; - assert(sampleNum < numSamples); - return X[sampleNum]; - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - static const simdscalar Y[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; - assert(sampleNum < numSamples); - return Y[sampleNum]; - } - INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; - - INLINE static __m128i TileSampleOffsetsX() - { - static const uint32_t bboxLeftEdge = 0x40; - static const uint32_t bboxRightEdge = 0xC0; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); - return tileSampleOffsetX; - } - - INLINE static __m128i TileSampleOffsetsY() - { - static const uint32_t bboxTopEdge = 0x40; - static const uint32_t bboxBottomEdge = 0xC0; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); - return tileSampleOffsetY; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask =_simd_set1_epi32(0x3); @@ -248,43 +119,10 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; - - INLINE static __m128i TileSampleOffsetsX() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i TileSampleOffsetsY() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask =_simd_set1_epi32(0x3); @@ -298,61 +136,10 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - static const __m128i X[numSamples] - {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3])}; - SWR_ASSERT(sampleNum < numSamples); - return X[sampleNum]; - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - static const __m128i Y[numSamples] - {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3])}; - SWR_ASSERT(sampleNum < numSamples); - return Y[sampleNum]; - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - static const simdscalar X[numSamples] - {_simd_set1_ps(0.375f), _simd_set1_ps(0.875), _simd_set1_ps(0.125), _simd_set1_ps(0.625)}; - assert(sampleNum < numSamples); - return X[sampleNum]; - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - static const simdscalar Y[numSamples] - {_simd_set1_ps(0.125), _simd_set1_ps(0.375f), _simd_set1_ps(0.625), _simd_set1_ps(0.875)}; - assert(sampleNum < numSamples); - return Y[sampleNum]; - } - INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; - - INLINE static __m128i TileSampleOffsetsX() - { - static const uint32_t bboxLeftEdge = 0x20; - static const uint32_t bboxRightEdge = 0xE0; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); - return tileSampleOffsetX; - } - - INLINE static __m128i TileSampleOffsetsY() - { - static const uint32_t bboxTopEdge = 0x20; - static const uint32_t bboxBottomEdge = 0xE0; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); - return tileSampleOffsetY; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xF); @@ -369,48 +156,16 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; - - INLINE static __m128i TileSampleOffsetsX() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i TileSampleOffsetsY() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xF); return mask; } + static const uint32_t numSamples = 4; static const float samplePosX[4]; static const float samplePosY[4]; @@ -419,65 +174,10 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - static const __m128i X[numSamples] - {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3]), - _mm_set1_epi32(samplePosXi[4]), _mm_set1_epi32(samplePosXi[5]), _mm_set1_epi32(samplePosXi[6]), _mm_set1_epi32(samplePosXi[7])}; - SWR_ASSERT(sampleNum < numSamples); - return X[sampleNum]; - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - static const __m128i Y[numSamples] - {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3]), - _mm_set1_epi32(samplePosYi[4]), _mm_set1_epi32(samplePosYi[5]), _mm_set1_epi32(samplePosYi[6]), _mm_set1_epi32(samplePosYi[7])}; - SWR_ASSERT(sampleNum < numSamples); - return Y[sampleNum]; - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - static const simdscalar X[numSamples] - {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.8125), _simd_set1_ps(0.3125), - _simd_set1_ps(0.1875), _simd_set1_ps(0.0625), _simd_set1_ps(0.6875), _simd_set1_ps(0.9375)}; - assert(sampleNum < numSamples); - return X[sampleNum]; - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - static const simdscalar Y[numSamples] - {_simd_set1_ps(0.3125), _simd_set1_ps(0.6875), _simd_set1_ps(0.5625), _simd_set1_ps(0.1875), - _simd_set1_ps(0.8125), _simd_set1_ps(0.4375), _simd_set1_ps(0.9375), _simd_set1_ps(0.0625)}; - assert(sampleNum < numSamples); - return Y[sampleNum]; - } - INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; - - INLINE static __m128i TileSampleOffsetsX() - { - static const uint32_t bboxLeftEdge = 0x10; - static const uint32_t bboxRightEdge = 0xF0; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); - return tileSampleOffsetX; - } - - INLINE static __m128i TileSampleOffsetsY() - { - static const uint32_t bboxTopEdge = 0x10; - static const uint32_t bboxBottomEdge = 0xF0; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); - return tileSampleOffsetY; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFF); @@ -494,43 +194,10 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; - - INLINE static __m128i TileSampleOffsetsX() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i TileSampleOffsetsY() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFF); @@ -544,73 +211,10 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - static const __m128i X[numSamples] - {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3]), - _mm_set1_epi32(samplePosXi[4]), _mm_set1_epi32(samplePosXi[5]), _mm_set1_epi32(samplePosXi[6]), _mm_set1_epi32(samplePosXi[7]), - _mm_set1_epi32(samplePosXi[8]), _mm_set1_epi32(samplePosXi[9]), _mm_set1_epi32(samplePosXi[10]), _mm_set1_epi32(samplePosXi[11]), - _mm_set1_epi32(samplePosXi[12]), _mm_set1_epi32(samplePosXi[13]), _mm_set1_epi32(samplePosXi[14]), _mm_set1_epi32(samplePosXi[15])}; - SWR_ASSERT(sampleNum < numSamples); - return X[sampleNum]; - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - static const __m128i Y[numSamples] - {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3]), - _mm_set1_epi32(samplePosYi[4]), _mm_set1_epi32(samplePosYi[5]), _mm_set1_epi32(samplePosYi[6]), _mm_set1_epi32(samplePosYi[7]), - _mm_set1_epi32(samplePosYi[8]), _mm_set1_epi32(samplePosYi[9]), _mm_set1_epi32(samplePosYi[10]), _mm_set1_epi32(samplePosYi[11]), - _mm_set1_epi32(samplePosYi[12]), _mm_set1_epi32(samplePosYi[13]), _mm_set1_epi32(samplePosYi[14]), _mm_set1_epi32(samplePosYi[15])}; - SWR_ASSERT(sampleNum < numSamples); - return Y[sampleNum]; - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - static const simdscalar X[numSamples] - {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.3125), _simd_set1_ps(0.7500), - _simd_set1_ps(0.1875), _simd_set1_ps(0.6250), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), - _simd_set1_ps(0.3750), _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.1250), - _simd_set1_ps(0.0000), _simd_set1_ps(0.9375), _simd_set1_ps(0.8750), _simd_set1_ps(0.0625)}; - assert(sampleNum < numSamples); - return X[sampleNum]; - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - static const simdscalar Y[numSamples] - {_simd_set1_ps(0.5625), _simd_set1_ps(0.3125), _simd_set1_ps(0.6250), _simd_set1_ps(0.4375), - _simd_set1_ps(0.3750), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), _simd_set1_ps(0.1875), - _simd_set1_ps(0.8750), _simd_set1_ps(0.0625), _simd_set1_ps(0.1250), _simd_set1_ps(0.7500), - _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.9375), _simd_set1_ps(0.0000)}; - assert(sampleNum < numSamples); - return Y[sampleNum]; - } - INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; - - INLINE static __m128i TileSampleOffsetsX() - { - static const uint32_t bboxLeftEdge = 0x00; - static const uint32_t bboxRightEdge = 0xF0; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); - return tileSampleOffsetX; - } - - INLINE static __m128i TileSampleOffsetsY() - { - static const uint32_t bboxTopEdge = 0x00; - static const uint32_t bboxBottomEdge = 0xF0; - // BR, BL, UR, UL - static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); - return tileSampleOffsetY; - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFFFF); @@ -627,43 +231,10 @@ struct MultisampleTraits }; template<> -struct MultisampleTraits +struct MultisampleTraits { - INLINE static __m128i vXi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i vYi(uint32_t sampleNum) - { - return _mm_set1_epi32(0x80); - } - - INLINE static simdscalar vX(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - - INLINE static simdscalar vY(uint32_t sampleNum) - { - return _simd_set1_ps(0.5f); - } - INLINE static float X(uint32_t sampleNum) {return 0.5f;}; INLINE static float Y(uint32_t sampleNum) {return 0.5f;}; - - INLINE static __m128i TileSampleOffsetsX() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - - INLINE static __m128i TileSampleOffsetsY() - { - // BR, BL, UR, UL - return _mm_set1_epi32(0x80); - } - INLINE static simdscalari FullSampleMask() { static const simdscalari mask = _simd_set1_epi32(0xFFFF); @@ -675,3 +246,50 @@ struct MultisampleTraits static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X; static const uint32_t numCoverageSamples = 1; }; + +INLINE +bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount, const SWR_MULTISAMPLE_POS& samplePos) +{ + // detect if we're using standard or center sample patterns + const uint32_t *standardPosX, *standardPosY; + switch(sampleCount) + { + case SWR_MULTISAMPLE_1X: + standardPosX = &MultisampleTraits::samplePosXi; + standardPosY = &MultisampleTraits::samplePosYi; + break; + case SWR_MULTISAMPLE_2X: + standardPosX = MultisampleTraits::samplePosXi; + standardPosY = MultisampleTraits::samplePosYi; + break; + case SWR_MULTISAMPLE_4X: + standardPosX = MultisampleTraits::samplePosXi; + standardPosY = MultisampleTraits::samplePosYi; + break; + case SWR_MULTISAMPLE_8X: + standardPosX = MultisampleTraits::samplePosXi; + standardPosY = MultisampleTraits::samplePosYi; + break; + case SWR_MULTISAMPLE_16X: + standardPosX = MultisampleTraits::samplePosXi; + standardPosY = MultisampleTraits::samplePosYi; + break; + default: + break; + } + + // scan sample pattern for standard or center + uint32_t numSamples = GetNumSamples(sampleCount); + bool bIsStandard = true; + if(numSamples > 1) + { + for(uint32_t i = 0; i < numSamples; i++) + { + bIsStandard = (standardPosX[i] == samplePos.Xi(i)) || + (standardPosY[i] == samplePos.Yi(i)); + if(!bIsStandard) + break; + } + } + return !bIsStandard; +} \ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index d0fdf4882ff..0837841746e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -1118,8 +1118,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, __m256d vEdgeTileBbox[3]; if (NumCoverageSamplesT::value > 1) { - __m128i vTileSampleBBoxXh = RT::MT::TileSampleOffsetsX(); - __m128i vTileSampleBBoxYh = RT::MT::TileSampleOffsetsY(); + const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions; + const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX(); + const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY(); __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); @@ -1206,8 +1207,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } else { - __m128i vSampleOffsetXh = RT::MT::vXi(sampleNum); - __m128i vSampleOffsetYh = RT::MT::vYi(sampleNum); + const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions; + __m128i vSampleOffsetXh = samplePos.vXi(sampleNum); + __m128i vSampleOffsetYh = samplePos.vYi(sampleNum); __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); @@ -1340,7 +1342,7 @@ void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, // setup triangle rasterizer function PFN_WORK_FUNC pfnTriRast; // conservative rast not supported for points/lines - pfnTriRast = GetRasterizerFunc(rastState.sampleCount, (rastState.samplePattern == SWR_MSAA_CENTER_PATTERN), false, + pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (pDC->pState->state.scissorsTileAligned == false)); // overwrite texcoords for point sprites @@ -1673,7 +1675,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi // setup triangle rasterizer function PFN_WORK_FUNC pfnTriRast; // conservative rast not supported for points/lines - pfnTriRast = GetRasterizerFunc(rastState.sampleCount, (rastState.samplePattern == SWR_MSAA_CENTER_PATTERN), false, + pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, SWR_INPUT_COVERAGE_NONE, ALL_EDGES_VALID, (pDC->pState->state.scissorsTileAligned == false)); // make sure this macrotile intersects the triangle diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h index 96b12ae4196..f4aa6eb9f90 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h @@ -115,8 +115,7 @@ template , public RasterEdgeTraits> { - typedef MultisampleTraits(NumSamplesT::value), - (CenterPatternT::value ? SWR_MSAA_CENTER_PATTERN : SWR_MSAA_STANDARD_PATTERN)> MT; + typedef MultisampleTraits(NumSamplesT::value), CenterPatternT::value> MT; /// Fixed point precision the rasterizer is using typedef FixedPointTraits PrecisionT; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 623e70a1519..eec68cd468b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -29,6 +29,8 @@ #include "common/formats.h" #include "common/simdintrin.h" +#include +#include ////////////////////////////////////////////////////////////////////////// /// PRIMITIVE_TOPOLOGY. @@ -333,8 +335,7 @@ struct SWR_PS_CONTEXT uint32_t rasterizerSampleCount; // IN: sample count used by the rasterizer - uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; - // IN: Pointers to render target hottiles + uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render target hottiles }; ////////////////////////////////////////////////////////////////////////// @@ -909,13 +910,6 @@ enum SWR_FRONTWINDING }; -enum SWR_MSAA_SAMPLE_PATTERN -{ - SWR_MSAA_CENTER_PATTERN, - SWR_MSAA_STANDARD_PATTERN, - SWR_MSAA_SAMPLE_PATTERN_COUNT -}; - enum SWR_PIXEL_LOCATION { SWR_PIXEL_LOCATION_CENTER, @@ -925,16 +919,75 @@ enum SWR_PIXEL_LOCATION // fixed point screen space sample locations within a pixel struct SWR_MULTISAMPLE_POS { - uint32_t x; - uint32_t y; -}; +public: + INLINE void SetXi(uint32_t sampleNum, uint32_t val) { _xi[sampleNum] = val; }; // @llvm_func + INLINE void SetYi(uint32_t sampleNum, uint32_t val) { _yi[sampleNum] = val; }; // @llvm_func + INLINE uint32_t Xi(uint32_t sampleNum) const { return _xi[sampleNum]; }; // @llvm_func + INLINE uint32_t Yi(uint32_t sampleNum) const { return _yi[sampleNum]; }; // @llvm_func + INLINE void SetX(uint32_t sampleNum, float val) { _x[sampleNum] = val; }; // @llvm_func + INLINE void SetY(uint32_t sampleNum, float val) { _y[sampleNum] = val; }; // @llvm_func + INLINE float X(uint32_t sampleNum) const { return _x[sampleNum]; }; // @llvm_func + INLINE float Y(uint32_t sampleNum) const { return _y[sampleNum]; }; // @llvm_func + typedef const float(&sampleArrayT)[SWR_MAX_NUM_MULTISAMPLES]; //@llvm_typedef + INLINE sampleArrayT X() const { return _x; }; // @llvm_func + INLINE sampleArrayT Y() const { return _y; }; // @llvm_func + INLINE const __m128i& vXi(uint32_t sampleNum) const { return _vXi[sampleNum]; }; // @llvm_func + INLINE const __m128i& vYi(uint32_t sampleNum) const { return _vYi[sampleNum]; }; // @llvm_func + INLINE const simdscalar& vX(uint32_t sampleNum) const { return _vX[sampleNum]; }; // @llvm_func + INLINE const simdscalar& vY(uint32_t sampleNum) const { return _vY[sampleNum]; }; // @llvm_func + INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; }; // @llvm_func + INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; }; // @llvm_func + + INLINE void PrecalcSampleData(int numSamples) // @llvm_func_start + { + for(int i = 0; i < numSamples; i++) + { + _vXi[i] = _mm_set1_epi32(_xi[i]); + _vYi[i] = _mm_set1_epi32(_yi[i]); + _vX[i] = _simd_set1_ps(_x[i]); + _vY[i] = _simd_set1_ps(_y[i]); + } + // precalculate the raster tile BB for the rasterizer. + CalcTileSampleOffsets(numSamples); + } // @llvm_func_end + + +private: + INLINE void CalcTileSampleOffsets(int numSamples) // @llvm_func_start + { + auto expandThenBlend4 = [](uint32_t* min, uint32_t* max, auto mask) + { + __m128i vMin = _mm_set1_epi32(*min); + __m128i vMax = _mm_set1_epi32(*max); + return _simd_blend4_epi32(vMin, vMax); + }; + + auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]); + auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]); + std::integral_constant xMask; + // BR(max), BL(min), UR(max), UL(min) + tileSampleOffsetsX = expandThenBlend4(minXi, maxXi, xMask); + + auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]); + auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]); + std::integral_constant yMask; + // BR(max), BL(min), UR(max), UL(min) + tileSampleOffsetsY = expandThenBlend4(minYi, maxYi, yMask); + }; // @llvm_func_end + // scalar sample values + uint32_t _xi[SWR_MAX_NUM_MULTISAMPLES]; + uint32_t _yi[SWR_MAX_NUM_MULTISAMPLES]; + float _x[SWR_MAX_NUM_MULTISAMPLES]; + float _y[SWR_MAX_NUM_MULTISAMPLES]; + + // precalc'd / vectorized samples + __m128i _vXi[SWR_MAX_NUM_MULTISAMPLES]; + __m128i _vYi[SWR_MAX_NUM_MULTISAMPLES]; + simdscalar _vX[SWR_MAX_NUM_MULTISAMPLES]; + simdscalar _vY[SWR_MAX_NUM_MULTISAMPLES]; + __m128i tileSampleOffsetsX; + __m128i tileSampleOffsetsY; -enum SWR_MSAA_RASTMODE -{ - SWR_MSAA_RASTMODE_OFF_PIXEL, - SWR_MSAA_RASTMODE_OFF_PATTERN, - SWR_MSAA_RASTMODE_ON_PIXEL, - SWR_MSAA_RASTMODE_ON_PATTERN }; ////////////////////////////////////////////////////////////////////////// @@ -951,7 +1004,6 @@ struct SWR_RASTSTATE uint32_t pointParam : 1; uint32_t pointSpriteEnable : 1; uint32_t pointSpriteTopOrigin : 1; - uint32_t msaaRastEnable : 1; uint32_t forcedSampleCount : 1; uint32_t pixelOffset : 1; uint32_t depthBiasPreAdjusted : 1; ///< depth bias constant is in float units, not per-format Z units @@ -965,15 +1017,11 @@ struct SWR_RASTSTATE float depthBiasClamp; SWR_FORMAT depthFormat; // @llvm_enum - ///@todo: MSAA lines - // multisample state for MSAA lines - SWR_MSAA_RASTMODE rastMode; // @llvm_enum - // sample count the rasterizer is running at SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum uint32_t pixelLocation; // UL or Center - SWR_MULTISAMPLE_POS iSamplePos[SWR_MAX_NUM_MULTISAMPLES]; - SWR_MSAA_SAMPLE_PATTERN samplePattern; // @llvm_enum + SWR_MULTISAMPLE_POS samplePositions; // @llvm_struct + bool bIsCenterPattern; // @llvm_enum // user clip/cull distance enables uint8_t cullDistanceMask; diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp index efd2b4ae9a5..5cc01ddcab0 100644 --- a/src/gallium/drivers/swr/swr_state.cpp +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -1061,8 +1061,6 @@ swr_update_derived(struct pipe_context *pipe, rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT; /* XXX TODO: Add multisample */ - rastState->msaaRastEnable = false; - rastState->rastMode = SWR_MSAA_RASTMODE_OFF_PIXEL; rastState->sampleCount = SWR_MULTISAMPLE_1X; rastState->forcedSampleCount = false; -- 2.30.2