swr: [rasterizer] Miscellaneous backend changes
authorTim Rowley <timothy.o.rowley@intel.com>
Sat, 30 Apr 2016 20:07:20 +0000 (14:07 -0600)
committerTim Rowley <timothy.o.rowley@intel.com>
Thu, 5 May 2016 19:49:48 +0000 (14:49 -0500)
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/common/simdintrin.h
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h

index 72fe15a3c7a286943eba9bb20c021b21523f32c0..5ec1f7193480e4d4ec4b02772632c6463dacb7a0 100644 (file)
@@ -139,6 +139,12 @@ __m256 _simdemu_permute_ps(__m256 a, __m256i b)
     return result;
 }
 
+INLINE
+__m256i _simdemu_permute_epi32(__m256i a, __m256i b)
+{
+    return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
+}
+
 INLINE
 __m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
 {
@@ -277,6 +283,7 @@ __m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
 #define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
 #define _simd_movemask_epi8 _simdemu_movemask_epi8
 #define _simd_permute_ps _simdemu_permute_ps
+#define _simd_permute_epi32 _simdemu_permute_epi32
 #define _simd_srlv_epi32 _simdemu_srlv_epi32
 #define _simd_sllv_epi32 _simdemu_sllv_epi32
 
@@ -449,11 +456,18 @@ int _simdemu_movemask_epi8(__m256i a)
 #define _simd_permute_ps _mm256_permutevar8x32_ps
 #define _simd_srlv_epi32 _mm256_srlv_epi32
 #define _simd_sllv_epi32 _mm256_sllv_epi32
+
+INLINE
+simdscalari _simd_permute_epi32(simdscalari a, simdscalari index)
+{
+    return _simd_castps_si(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(a), index));
+}
 #endif
 
 #define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
 #define _simd_shuffle_ps _mm256_shuffle_ps
 #define _simd_set1_epi32 _mm256_set1_epi32
+#define _simd_set_epi32 _mm256_set_epi32
 #define _simd_set1_epi8 _mm256_set1_epi8
 #define _simd_setzero_si _mm256_setzero_si256
 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
@@ -473,6 +487,12 @@ simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
     return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
 }
 
+INLINE
+simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
+{
+    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
+}
+
 // convert bitmask to vector mask
 INLINE
 simdscalar vMask(int32_t mask)
index d2547f398888b15f51cf410bca41c8a570e61f3f..376fb3f68cbb18225d55d505cb59ac63f7f2ea42 100644 (file)
@@ -886,7 +886,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
         psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
+            simdscalar activeLanes;
             if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+            activeLanes = vMask(work.anyCoveredSamples & MASK);
 
             psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // set pixel center positions
@@ -909,32 +911,22 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 RDTSC_STOP(BEBarycentric, 0, 0);
             }
 
-                       simdscalar activeLanes;
             if(T::bForcedSampleCount)
             {
                 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
                 const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
-                activeLanes = _simd_and_ps(vMask(work.anyCoveredSamples & MASK), vSampleMask);
+                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
             }
 
             // Early-Z?
             if(T::bCanEarlyZ && !T::bForcedSampleCount)
             {
-                activeLanes = _simd_setzero_ps();
                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
                 UPDATE_STAT(DepthPassCount, depthPassCount);
             }
-            // if we can't do early z, set the active mask to any samples covered in the current simd
-            else if(!T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                activeLanes = vMask(work.anyCoveredSamples & MASK);
-            }
 
             // if we have no covered samples that passed depth at this point, go to next tile
-            if(!_simd_movemask_ps(activeLanes))
-            {
-                goto Endtile;
-            }
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
 
             if(pPSState->usesSourceDepth)
             {
@@ -957,10 +949,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
             // update active lanes to remove any discarded or oMask'd pixels
             activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
-            if(!_simd_movemask_ps(activeLanes))
-            {
-                goto Endtile;
-            }
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
 
             // late-Z
             if(!T::bCanEarlyZ && !T::bForcedSampleCount)
@@ -970,10 +959,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             }
 
             // if we have no covered samples that passed depth at this point, skip OM and go to next tile
-            if(!_simd_movemask_ps(activeLanes))
-            {
-                goto Endtile;
-            }
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
 
             // output merger
             // loop over all samples, broadcasting the results of the PS to all passing pixels
index 24ba69ec87aa84cd838b060fba2ab0d4683d48a4..2c110416805744a8aa9e08f4e0dda3b6dd6b1a0e 100644 (file)
@@ -423,14 +423,15 @@ struct PixelRateZTestLoop
                        clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {};
            
     INLINE
-    uint32_t operator()(simdscalar& anyDepthSamplePassed, SWR_PS_CONTEXT& psContext, 
+    uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 
                         const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
     {
         uint32_t statCount = 0;
+        simdscalar anyDepthSamplePassed = _simd_setzero_ps();
         for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
         {
             const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
-            vCoverageMask[sample] = vMask(pCoverageMask[currentSimdIn8x8] & MASK);
+            vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
 
             if(!_simd_movemask_ps(vCoverageMask[sample]))
             {
@@ -494,6 +495,8 @@ struct PixelRateZTestLoop
             uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
             statCount += _mm_popcnt_u32(statMask);
         }
+
+        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
         // return number of samples that passed depth and coverage
         return statCount;
     }