swr: [rasterizer core] implement InnerConservative input coverage
author Tim Rowley <timothy.o.rowley@intel.com>
Thu, 28 Jul 2016 22:25:09 +0000 (16:25 -0600)
committer Tim Rowley <timothy.o.rowley@intel.com>
Thu, 4 Aug 2016 19:38:35 +0000 (14:38 -0500)
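
Switch input coverage selection from an on/off flag to the full SWR_INPUT_COVERAGE
enum so the backends can distinguish normal per-sample coverage from the new
InnerConservative coverage (a pixel reports coverage only when it is entirely
inside the triangle). The backend function tables gain a SWR_INPUT_COVERAGE_COUNT
dimension, generateInputCoverage becomes a <T, InputCoverage> templated functor
with an INNER_CONSERVATIVE specialization, and the rasterizer fills a per-pixel
innerCoverageMask by re-testing the edge equations at the inner-conservative
offset.

The enum itself is defined outside this patch; only the enumerants referenced in
the diff are known, so the following is a sketch with assumed ordering:

    enum SWR_INPUT_COVERAGE
    {
        SWR_INPUT_COVERAGE_NONE,                // PS requests no input coverage
        SWR_INPUT_COVERAGE_NORMAL,              // per-sample coverage mask
        SWR_INPUT_COVERAGE_INNER_CONSERVATIVE,  // set only for fully covered pixels
        SWR_INPUT_COVERAGE_COUNT
    };
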
Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/core/conservativeRast.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 21b9e3f8c7d98747db5c5dee07698f87258d35de..00352580ab298f629973e180b5b910174dcf003d 100644
@@ -736,9 +736,9 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
 
 // templated backend function tables
 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][2][2][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2][2][2];
+extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2];
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
     DRAW_STATE* pState = pDC->pState;
@@ -757,7 +757,6 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
         const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0;
         const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
         const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
-        const uint32_t inputCoverage = (psState.inputCoverage != SWR_INPUT_COVERAGE_NONE) ? 1 : 0;
      
         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
         
@@ -769,20 +768,20 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             {
                 // always need to generate I & J per sample for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][inputCoverage][centroid][forcedSampleCount][canEarlyZ];
+                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
             }
             else
             {
                 // always need to generate I & J per pixel for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
-                backendFuncs.pfnBackend = gBackendSingleSample[inputCoverage][centroid][canEarlyZ];
+                backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
             }
             break;
         case SWR_SHADING_RATE_SAMPLE:
             SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
             // always need to generate I & J per sample for Z interpolation
             barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][inputCoverage][centroid][canEarlyZ];
+            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
             break;
         default:
             SWR_ASSERT(0 && "Invalid shading rate");
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index b1e6c91871500d76775b16fae5e4afb9708c3947..92634b12f63d65c71ea0a8ff100715f0f51c4e78 100644
@@ -492,9 +492,11 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 // pixel center
                 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
-                if(T::bInputCoverage)
+                if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
                 {
-                    generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : 
+                                                    &work.coverageMask[0];
+                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
                 }
 
                 RDTSC_START(BEBarycentric);
@@ -593,6 +595,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
 Endtile:
             RDTSC_START(BEEndTile);
             coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
             pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
@@ -678,9 +684,11 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             CalcPixelBarycentrics(coeffs, psContext);
             RDTSC_STOP(BEBarycentric, 0, 0);
 
-            if(T::bInputCoverage)
+            if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
             {
-                generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
+                                                &work.coverageMask[0];
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
             }
 
             if(T::bCentroidPos)
@@ -808,6 +816,10 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
             RDTSC_START(BEEndTile);
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
             pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
@@ -896,9 +908,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             CalcPixelBarycentrics(coeffs, psContext);
             RDTSC_STOP(BEBarycentric, 0, 0);
 
-            if (T::bInputCoverage)
+            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
             {
-                generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask :
+                                                &work.coverageMask[0];
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask);
             }
 
             if(T::bCentroidPos)
@@ -1018,6 +1032,10 @@ Endtile:
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
 
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
             work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
@@ -1143,19 +1161,19 @@ void InitClearTilesTable()
 }
 
 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage
+PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
                                      [2] // centroid
                                      [2] // canEarlyZ
                                      = {};
 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
                                        [SWR_MSAA_SAMPLE_PATTERN_COUNT]
-                                       [2] // input coverage
+                                       [SWR_INPUT_COVERAGE_COUNT]
                                        [2] // centroid
                                        [2] // forcedSampleCount
                                        [2] // canEarlyZ
                                        = {};
 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
-                                        [2] // input coverage
+                                        [SWR_INPUT_COVERAGE_COUNT]
                                         [2] // centroid
                                         [2] // canEarlyZ
                                         = {};
@@ -1195,6 +1213,22 @@ struct BEChooser
         }
     }
 
+    // Recursively parse args
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
+    {
+        switch(tArg)
+        {
+        case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
+        default:
+            SWR_ASSERT(0 && "Invalid input coverage\n");
+            return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+            break;
+        }
+    }
+
     // Recursively parse args
     template <typename... TArgsT>
     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
@@ -1226,29 +1260,29 @@ struct BEChooser
     }
 };
 
-void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2])
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
 {
-    for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
+    for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
     {
         for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
         {
             for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
             {
                 table[inputCoverage][isCentroid][canEarlyZ] =
-                    BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0),
+                    BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
                                          (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
             }
         }
     }
 }
 
-void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][2][2][2][2])
+void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2])
 {
     for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
     {
         for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++)
         {
-            for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
+            for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
             {
                 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
                 {
@@ -1257,7 +1291,7 @@ void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_CO
                         for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
                         {
                             table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
-                                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage > 0),
+                                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage,
                                                         (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
                         }
                     }
@@ -1267,18 +1301,18 @@ void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_CO
     }
 }
 
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][2][2][2])
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
 {
     for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
     {
-        for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++)
+        for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
         {
             for(uint32_t centroid = 0; centroid < 2; centroid++)
             {
                 for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
                 {
                     table[sampleCount][inputCoverage][centroid][canEarlyZ] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0),
+                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage,
                                              (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
                 }
             }
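
The new SWR_INPUT_COVERAGE overload of BEChooser::GetFunc above follows the file's
existing pattern: each overload peels one runtime value off the argument list and
recurses with that value appended as a compile-time template argument, until the
terminating overload names a fully specialized backend. A stripped-down,
self-contained illustration of the same idea (hypothetical names, not the actual
swr types):

    #include <cstdint>
    #include <cstdio>

    enum COVERAGE_SKETCH : uint32_t { COV_NONE, COV_NORMAL, COV_INNER };

    // Stand-in for the real templated backends (BackendSingleSample and friends).
    template <uint32_t Coverage, uint32_t CanEarlyZ>
    void Backend() { std::printf("backend<coverage=%u, earlyZ=%u>\n", Coverage, CanEarlyZ); }

    using PFN_BACKEND = void (*)();

    template <uint32_t... Args>
    struct Chooser
    {
        // Terminator: every remaining choice is now a template parameter.
        static PFN_BACKEND GetFunc(bool canEarlyZ)
        {
            return canEarlyZ ? &Backend<Args..., 1> : &Backend<Args..., 0>;
        }

        // Recursive step: bake one runtime enum value into the template argument list.
        template <typename... Rest>
        static PFN_BACKEND GetFunc(COVERAGE_SKETCH cov, Rest... rest)
        {
            switch (cov)
            {
            case COV_NORMAL: return Chooser<Args..., COV_NORMAL>::GetFunc(rest...);
            case COV_INNER:  return Chooser<Args..., COV_INNER>::GetFunc(rest...);
            default:         return Chooser<Args..., COV_NONE>::GetFunc(rest...);
            }
        }
    };

    int main()
    {
        // Mirrors how the InitBackend*FuncTable loops fill their tables from runtime indices.
        PFN_BACKEND table[3][2];
        for (uint32_t cov = 0; cov < 3; ++cov)
            for (uint32_t earlyZ = 0; earlyZ < 2; ++earlyZ)
                table[cov][earlyZ] = Chooser<>::GetFunc((COVERAGE_SKETCH)cov, earlyZ > 0);

        table[COV_INNER][1]();  // dispatches to Backend<COV_INNER, 1>
        return 0;
    }
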
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 8a289c702653f2338e728b2587d45f1aeb39a949..c88247928911746aa770b9ad7627589be74d8768 100644
@@ -134,154 +134,184 @@ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
     return RasterTileStencilOffsets[sampleNum];
 }
 
-template<typename T>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+template<typename T, uint32_t InputCoverage>
+struct generateInputCoverage
 {
-
-    // will need to update for avx512
-    assert(KNOB_SIMD_WIDTH == 8);
-
-    __m256i mask[2];
-    __m256i sampleCoverage[2];
-    if(T::bIsStandardPattern)
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
     {
-        __m256i src = _mm256_set1_epi32(0);
-        __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+        // will need to update for avx512
+        assert(KNOB_SIMD_WIDTH == 8);
 
-        if(T::MultisampleT::numSamples == 1)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
-        }
-        else if(T::MultisampleT::numSamples == 2)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-        }
-        else if(T::MultisampleT::numSamples == 4)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
-        }
-        else if(T::MultisampleT::numSamples == 8)
+        __m256i mask[2];
+        __m256i sampleCoverage[2];
+        if(T::bIsStandardPattern)
         {
-            mask[0] = _mm256_set1_epi32(-1);
+            __m256i src = _mm256_set1_epi32(0);
+            __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+            if(T::MultisampleT::numSamples == 1)
+            {
+                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+            }
+            else if(T::MultisampleT::numSamples == 2)
+            {
+                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+            }
+            else if(T::MultisampleT::numSamples == 4)
+            {
+                mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+            }
+            else if(T::MultisampleT::numSamples == 8)
+            {
+                mask[0] = _mm256_set1_epi32(-1);
+            }
+            else if(T::MultisampleT::numSamples == 16)
+            {
+                mask[0] = _mm256_set1_epi32(-1);
+                mask[1] = _mm256_set1_epi32(-1);
+                index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+            }
+
+            // gather coverage for samples 0-7
+            sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+            if(T::MultisampleT::numSamples > 8)
+            {
+                // gather coverage for samples 8-15
+                sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+            }
         }
-        else if(T::MultisampleT::numSamples == 16)
+        else
         {
-            mask[0] = _mm256_set1_epi32(-1);
-            mask[1] = _mm256_set1_epi32(-1);
-            index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+            // center coverage is the same for all samples; just broadcast to the sample slots
+            uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+            if(T::MultisampleT::numSamples == 1)
+            {
+                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 2)
+            {
+                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 4)
+            {
+                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 8)
+            {
+                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 16)
+            {
+                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+                sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+            }
         }
 
-        // gather coverage for samples 0-7
-        sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+        mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+        // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+        __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+        __m256i packedCoverage1;
         if(T::MultisampleT::numSamples > 8)
         {
-            // gather coverage for samples 8-15
-            sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+            packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
         }
-    }
-    else
-    {
-        // center coverage is the same for all samples; just broadcast to the sample slots
-        uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-        if(T::MultisampleT::numSamples == 1)
+
+    #if (KNOB_ARCH == KNOB_ARCH_AVX)
+        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+        __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+        __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+        __m256i packedSampleCoverage;
+        if(T::MultisampleT::numSamples > 8)
         {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+            hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+            shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+            shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
         }
-        else if(T::MultisampleT::numSamples == 2)
+        else
         {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+            packedSampleCoverage = packedCoverage0;
         }
-        else if(T::MultisampleT::numSamples == 4)
+    #else
+        __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+        packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+        __m256i packedSampleCoverage;
+        if(T::MultisampleT::numSamples > 8)
         {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+            permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+            packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+            // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+            packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
         }
-        else if(T::MultisampleT::numSamples == 8)
+        else
         {
-            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+            packedSampleCoverage = packedCoverage0;
         }
-        else if(T::MultisampleT::numSamples == 16)
+    #endif
+
+        for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
         {
-            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-            sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
-        }
-    }
+            // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+            inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
 
-    mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-                              -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
-    // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
-    __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+            if(!T::bForcedSampleCount)
+            {
+                // input coverage has to be anded with sample mask if MSAA isn't forced on
+                inputMask[i] &= sampleMask;
+            }
 
-    __m256i packedCoverage1;
-    if(T::MultisampleT::numSamples > 8)
-    {
-        // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
-        packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+            // shift to the next pixel in the 4x2
+            packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+        }
     }
 
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-    __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-    __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-    packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
-    __m256i packedSampleCoverage;
-    if(T::MultisampleT::numSamples > 8)
-    {
-        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-        hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
-        shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-        shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
-        packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
-        packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
-    }
-    else
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
     {
-        packedSampleCoverage = packedCoverage0;
+        uint32_t inputMask[KNOB_SIMD_WIDTH];
+        generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
+        inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
     }
-#else
-    __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
-    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-    packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
 
-    __m256i packedSampleCoverage;
-    if(T::MultisampleT::numSamples > 8)
-    {
-        permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
-        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-        packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+};
 
-        // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
-        packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
-    }
-    else
+template<typename T>
+struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
+{
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
     {
-        packedSampleCoverage = packedCoverage0;
+        // will need to update for avx512
+        assert(KNOB_SIMD_WIDTH == 8);
+        __m256i vec = _mm256_set1_epi32(coverageMask[0]);
+        const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+        vec = _simd_and_si(vec, bit);
+        vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+        vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
+        inputCoverage = _simd_castsi_ps(vec);
     }
-#endif
 
-    for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
     {
-        // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
-        inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
-        if(!T::bForcedSampleCount)
+        unsigned long index;
+        uint32_t simdCoverage = (coverageMask[0] & MASK);
+        static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
+        while(_BitScanForward(&index, simdCoverage))
         {
-            // input coverage has to be anded with sample mask if MSAA isn't forced on
-            inputMask[i] &= sampleMask;
+            // set all samples to covered
+            inputMask[index] = FullCoverageMask;
+            // clear the handled bit so the scan advances and the loop terminates
+            simdCoverage &= ~(1u << index);
         }
-
-        // shift to the next pixel in the 4x2
-        packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
     }
-}
-
-template<typename T>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH]; 
-    generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
-    inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-}
+};
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Centroid behaves exactly as follows :
@@ -298,7 +328,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const cov
                             const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
 {
     uint32_t inputMask[KNOB_SIMD_WIDTH];
-    generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
+    generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
 
     // Case (2) - partially covered pixel
 
@@ -592,7 +622,7 @@ template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SW
 struct SwrBackendTraits
 {
     static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
-    static const bool bInputCoverage = (coverage == 1);
+    static const uint32_t InputCoverage = coverage;
     static const bool bCentroidPos = (centroid == 1);
     static const bool bForcedSampleCount = (forced == 1);
     static const bool bCanEarlyZ = (canEarlyZ == 1);
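
The INNER_CONSERVATIVE specialization of generateInputCoverage above consumes just
one bit per pixel of the SIMD 4x2: if that pixel's bit in work.innerCoverageMask is
set, every sample of the pixel is reported as covered. A scalar restatement of that
behavior (plain C++, no SIMD; function and parameter names are hypothetical):

    #include <cstdint>

    // Scalar sketch of the inner-conservative input coverage: one bit of
    // innerCoverageMask per pixel in the SIMD tile; a set bit means the whole pixel
    // is inside the triangle, so all of its samples report as covered.
    void GenerateInnerInputCoverage(uint64_t innerCoverageMask, uint32_t numSamples,
                                    uint32_t simdWidth, uint32_t* inputMask)
    {
        const uint32_t fullCoverage = (1u << numSamples) - 1;
        for (uint32_t pixel = 0; pixel < simdWidth; ++pixel)
        {
            const bool fullyCovered = (innerCoverageMask >> pixel) & 1u;
            inputMask[pixel] = fullyCovered ? fullCoverage : 0u;
        }
    }
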
diff --git a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h
index 1bc3938595c6e3b262cff4343f7ed8448534e6a5..1d8546959f587b5b8ea0acd8282f6c975f1af5c1 100644
@@ -131,9 +131,10 @@ typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT;
 /// default to standard rasterization behavior
 /// @tparam ConservativeT: type of conservative rasterization
 /// @tparam InputCoverageT: type of input coverage requested, if any
-template <typename ConservativeT, typename InputCoverageT>
+template <typename ConservativeT, typename _InputCoverageT>
 struct ConservativeRastBETraits {
     typedef std::false_type IsConservativeT;
+    typedef _InputCoverageT InputCoverageT;
     typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
     typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
     typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
@@ -141,10 +142,11 @@ struct ConservativeRastBETraits {
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief StandardRastT specialization of ConservativeRastBETraits
-template <typename InputCoverageT>
-struct ConservativeRastBETraits<StandardRastT, InputCoverageT>
+template <typename _InputCoverageT>
+struct ConservativeRastBETraits<StandardRastT, _InputCoverageT>
 {
     typedef std::false_type IsConservativeT;
+    typedef _InputCoverageT InputCoverageT;
     typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
     typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
     typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
@@ -206,8 +208,8 @@ struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT>
     /// intersects a pixel
     typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value/2) + 1> ConservativeEdgeOffsetT;
 
-    /// offset edge towards from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
+    /// undo the outer conservative offset and pull the edge in toward the pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
     /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
     /// of having to compare individual edges to pixel corners to check if a pixel is fully covered by a triangle
-    typedef std::integral_constant<int32_t, static_cast<int32_t>(-((ConservativePrecisionT::ScaleT::value/2) + 1))> InnerConservativeEdgeOffsetT;
+    typedef std::integral_constant<int32_t, static_cast<int32_t>(-((ConservativePrecisionT::ScaleT::value/2) + 1) - ConservativeEdgeOffsetT::value)> InnerConservativeEdgeOffsetT;
 };
\ No newline at end of file
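
To make the offset arithmetic above concrete, here are the values under the Fixed
16.9 conservative precision stated in the comments (so ScaleT::value == 512); they
are derived from the typedefs, not quoted from the source:

    #include <cstdint>

    // Worked numbers for the conservative edge offsets, assuming 16.9 fixed point.
    constexpr int32_t scale           = 512;                                // 2^9 = one pixel
    constexpr int32_t outerEdgeOffset = scale / 2 + 1;                      // +257: out by 1/2 pixel + 1/512
    constexpr int32_t innerEdgeOffset = -(scale / 2 + 1) - outerEdgeOffset; // -514
    // Edges were already pushed out by +257 during setup, so applying -514 cancels that
    // outer offset and lands 257 units (1/2 pixel + 1/512) inside the pixel center,
    // which is the single inner-conservative test point.
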
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 70472b4bf989299b673ff73c041865501b86b23c..56f9797576463b9fa180311b7f9548c6e1d88db9 100644
@@ -83,8 +83,7 @@ struct SWR_TRIANGLE_DESC
     float *pUserClipBuffer;
 
     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
-    uint64_t conservativeCoverageMask;
-    uint64_t innerConservativeCoverageMask;
+    uint64_t innerCoverageMask; // Conservative rasterization inner coverage: a pixel's bit is set only when the entire pixel is covered
     uint64_t anyCoveredSamples;
 
     TRI_FLAGS triFlags;
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index c5ef072de39dca559153cdc43eb4d3363ca4566a..3c5d73466e206ea5657e8bf9ce2b544367219610 100644
@@ -291,19 +291,9 @@ constexpr int64_t ManhToEdgePrecisionAdjust()
 /// the adjustEdgeConservative function. This struct should never
 /// be instantiated.
 /// @tparam RT: rasterizer traits
-/// @tparam IsConservativeT: is conservative rast enabled?
-template <typename RT, typename IsConservativeT>
+/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
+template <typename RT, typename ConservativeEdgeOffsetT>
 struct adjustEdgeConservative
-{
-    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) = delete;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief adjustEdgeConservative<RT, std::true_type> specialization 
-/// of adjustEdgeConservative. Used for conservative rasterization specific
-/// edge adjustments
-template <typename RT>
-struct adjustEdgeConservative<RT, std::true_type>
 {
     //////////////////////////////////////////////////////////////////////////
     /// @brief Performs calculations to adjust each edge of a triangle away
@@ -327,12 +317,12 @@ struct adjustEdgeConservative<RT, std::true_type>
         // 'fixed point' multiply (in double to be avx1 friendly) 
         // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
         __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
-        __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value)), 
-                                     _mm256_mul_pd(vBai, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value)));
+        __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
+                                     _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
 
         static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
                       "Inadequate precision of result of manh calculation ");
-        
+
         // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
         // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
         manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
@@ -345,14 +335,11 @@ struct adjustEdgeConservative<RT, std::true_type>
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief adjustEdgeConservative<RT, std::false_type> specialization 
-/// of adjustEdgeConservative. Allows code to be generically called; when
-/// IsConservativeT trait is disabled this inlines an empty function, which
-/// should get optimized out. 
+/// @brief adjustEdgeConservative specialization where no edge offset is needed
 template <typename RT>
-struct adjustEdgeConservative<RT, std::false_type>
+struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
 {
-    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge){};
+    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -369,7 +356,7 @@ constexpr int64_t ConservativeScissorOffset()
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Performs calculations to adjust each a scalar edge out
+/// @brief Performs calculations to adjust a vector of evaluated edges out
 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
 /// direction. 
 template <typename RT>
@@ -380,18 +367,47 @@ INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
     vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
 };
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Performs calculations to adjust a scalar evaluated edge out
+/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+/// direction. 
+template <typename RT, typename OffsetT>
+INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
+{
+    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
+    int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
+    return (Edge - manh);
+};
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Perform any needed adjustments to evaluated triangle edges
-template <typename RT>
-INLINE void adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+template <typename RT, typename EdgeOffsetT>
+struct adjustEdgesFix16
 {
-    static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, 
-                  "Edge equation expected to be in x.16 fixed point");
-    // need to offset the edge before applying the top-left rule
-    adjustEdgeConservative<RT, typename RT::IsConservativeT>(vAi, vBi, vEdge);
+    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    {
+        static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
+                      "Edge equation expected to be in x.16 fixed point");
 
-    adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
-}
+        static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
+
+        // need to apply any edge offsets before applying the top-left rule
+        adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
+
+        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Perform top left adjustments to evaluated triangle edges
+template <typename RT>
+struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
+{
+    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    {
+        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+    }
+};
 
 // max(abs(dz/dx), abs(dz/dy))
 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
@@ -533,7 +549,7 @@ void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
 /// corner to sample position, and test for coverage
 /// @tparam sampleCount: multisample count
 template <typename NumSamplesT>
-INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d (&vEdgeFix16)[7],
+INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
                             int32_t &mask0, int32_t &mask1, int32_t &mask2)
 {
     __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
@@ -550,7 +566,7 @@ INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d (&v
 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
 /// when only rasterizing a single coverage test point
 template <>
-INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d (&vEdgeFix16)[7],
+INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
                                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
 {
     mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
@@ -722,6 +738,86 @@ INLINE bool TrivialAcceptTest<AllEdgesValidT>(const int mask0, const int mask1,
     return ((mask0 & mask1 & mask2) == 0xf);
 };
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for GenerateSVInnerCoverage. Results
+/// in an empty function call if SVInnerCoverage isn't requested
+template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
+struct GenerateSVInnerCoverage
+{
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, EDGE*, double*,  uint64_t &){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of GenerateSVInnerCoverage where all edges
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated 
+/// edge values from OuterConservative to InnerConservative and rasterizes.
+template <typename RT>
+struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
+{
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
+    {
+        double startQuadEdgesAdj[RT::NumEdgesT::value];
+        for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
+        }
+
+        // not trivial accept or reject, must rasterize full tile
+        RDTSC_START(BERasterizePartial);
+        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
+        RDTSC_STOP(BERasterizePartial, 0, 0);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
+/// in an empty function call if SVInnerCoverage isn't requested
+template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
+struct UpdateEdgeMasksInnerConservative
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
+                                           const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges 
+/// evaluated at raster tile corners to inner conservative position and 
+/// updates edge masks
+template <typename RT>
+struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
+                                           const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
+    {
+        __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
+
+        // instead of keeping 2 copies of evaluated edges around, just compensate for the outer 
+        // conservative evaluated edge when adjusting the edge in for inner conservative tests
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
+
+        UpdateEdgeMasks<typename RT::NumRasterSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage 
+/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot 
+/// cover an entire raster tile, set mask0 to 0 to force it down the
+/// rasterizePartialTile path
+template <typename RT, typename ValidEdgeMaskT>
+struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
+                                   const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
+    {
+        // set one mask to zero to force the triangle down the rasterizePartialTile path
+        mask0 = 0;
+    }
+};
+
 template <typename RT>
 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
 {
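
GenerateSVInnerCoverage and UpdateEdgeMasksInnerConservative above use the same
compile-time dispatch idiom as the rest of the file: the primary template is an
empty functor, so backends that never request inner coverage pay nothing, and only
the selected specialization emits code. A minimal, self-contained illustration of
that idiom (hypothetical names, not the swr types):

    #include <cstdint>
    #include <cstdio>

    struct NoCoverageTagT {};
    struct InnerCoverageTagT {};

    // Primary template: compiles to an empty inline constructor for the common case.
    template <typename InputCoverageT>
    struct EmitInnerCoverage
    {
        EmitInnerCoverage(uint64_t& /*innerCoverageMask*/) {}
    };

    // Specialization: only does work when inner coverage is requested.
    template <>
    struct EmitInnerCoverage<InnerCoverageTagT>
    {
        EmitInnerCoverage(uint64_t& innerCoverageMask)
        {
            innerCoverageMask = 0xffffffffffffffffULL;  // e.g. trivial accept of a raster tile
        }
    };

    int main()
    {
        uint64_t mask = 0;
        EmitInnerCoverage<NoCoverageTagT>{ mask };     // no-op, optimized away
        EmitInnerCoverage<InnerCoverageTagT>{ mask };  // fills the mask
        std::printf("innerCoverageMask = %llx\n", (unsigned long long)mask);
        return 0;
    }
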
@@ -963,8 +1059,8 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
     __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
 
-    // apply and edge adjustments(top-left, crast, etc)
-    adjustEdgesFix16<RT>(vAi, vBi, vEdge);
+    // apply any edge adjustments (top-left, conservative rast, etc)
+    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
 
     // broadcast respective edge results to all lanes
     double* pEdge = (double*)&vEdge;
@@ -1016,6 +1112,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
             __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
             __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
             vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+
+            // adjust msaa tile bbox edges outward for conservative rast, if enabled
+            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
         }
     }
 
@@ -1056,11 +1155,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                 {
                     // trivial accept mask
                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
+
+                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
+                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
+                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
+
                     if (TrivialAcceptTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2))
                     {
-                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
                         // trivial accept, all 4 corners of all 3 edges are negative 
                         // i.e. raster tile completely inside triangle
+                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
+                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
+                        {
+                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
+                        }
                         RDTSC_EVENT(BETrivialAccept, 1, 0);
                     }
                     else
@@ -1104,6 +1212,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                         RDTSC_STOP(BERasterizePartial, 0, 0);
 
                         triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
+                        
+                        // Output SV InnerCoverage, if needed
+                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
                     }
                 }
                 else