return result;
}
+INLINE
+__m256i _simdemu_permute_epi32(__m256i a, __m256i b)
+{
+ return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
+}
+
INLINE
__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
{
#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
#define _simd_movemask_epi8 _simdemu_movemask_epi8
#define _simd_permute_ps _simdemu_permute_ps
+#define _simd_permute_epi32 _simdemu_permute_epi32
#define _simd_srlv_epi32 _simdemu_srlv_epi32
#define _simd_sllv_epi32 _simdemu_sllv_epi32
#define _simd_permute_ps _mm256_permutevar8x32_ps
#define _simd_srlv_epi32 _mm256_srlv_epi32
#define _simd_sllv_epi32 _mm256_sllv_epi32
+
+INLINE
+simdscalari _simd_permute_epi32(simdscalari a, simdscalari index)
+{
+ return _simd_castps_si(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(a), index));
+}
#endif
#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
#define _simd_shuffle_ps _mm256_shuffle_ps
#define _simd_set1_epi32 _mm256_set1_epi32
+#define _simd_set_epi32 _mm256_set_epi32
#define _simd_set1_epi8 _mm256_set1_epi8
#define _simd_setzero_si _mm256_setzero_si256
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
}
+INLINE
+simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
+{
+ return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
+}
+
// convert bitmask to vector mask
INLINE
simdscalar vMask(int32_t mask)
psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
+ simdscalar activeLanes;
if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+ activeLanes = vMask(work.anyCoveredSamples & MASK);
psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// set pixel center positions
RDTSC_STOP(BEBarycentric, 0, 0);
}
- simdscalar activeLanes;
if(T::bForcedSampleCount)
{
// candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
- activeLanes = _simd_and_ps(vMask(work.anyCoveredSamples & MASK), vSampleMask);
+ activeLanes = _simd_and_ps(activeLanes, vSampleMask);
}
// Early-Z?
if(T::bCanEarlyZ && !T::bForcedSampleCount)
{
- activeLanes = _simd_setzero_ps();
uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
UPDATE_STAT(DepthPassCount, depthPassCount);
}
- // if we can't do early z, set the active mask to any samples covered in the current simd
- else if(!T::bCanEarlyZ && !T::bForcedSampleCount)
- {
- activeLanes = vMask(work.anyCoveredSamples & MASK);
- }
// if we have no covered samples that passed depth at this point, go to next tile
- if(!_simd_movemask_ps(activeLanes))
- {
- goto Endtile;
- }
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
if(pPSState->usesSourceDepth)
{
// update active lanes to remove any discarded or oMask'd pixels
activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
- if(!_simd_movemask_ps(activeLanes))
- {
- goto Endtile;
- }
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
// late-Z
if(!T::bCanEarlyZ && !T::bForcedSampleCount)
}
// if we have no covered samples that passed depth at this point, skip OM and go to next tile
- if(!_simd_movemask_ps(activeLanes))
- {
- goto Endtile;
- }
+ if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
// output merger
// loop over all samples, broadcasting the results of the PS to all passing pixels
clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {};
INLINE
- uint32_t operator()(simdscalar& anyDepthSamplePassed, SWR_PS_CONTEXT& psContext,
+ uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext,
const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
{
uint32_t statCount = 0;
+ simdscalar anyDepthSamplePassed = _simd_setzero_ps();
for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
{
const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
- vCoverageMask[sample] = vMask(pCoverageMask[currentSimdIn8x8] & MASK);
+ vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
if(!_simd_movemask_ps(vCoverageMask[sample]))
{
uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
statCount += _mm_popcnt_u32(statMask);
}
+
+ activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
// return number of samples that passed depth and coverage
return statCount;
}